diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index e7db1ededf383..03397e1e0d89e 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -16,6 +16,7 @@ #include "RISCV.h" #include "RISCVFrameLowering.h" #include "RISCVTargetMachine.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/MacroFusion.h" #include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/MC/TargetRegistry.h" @@ -199,3 +200,15 @@ unsigned RISCVSubtarget::getMinimumJumpTableEntries() const { ? RISCVMinimumJumpTableEntries : TuneInfo->MinimumJumpTableEntries; } + +void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + unsigned NumRegionInstrs) const { + // Do bidirectional scheduling since it provides a more balanced scheduling + // leading to better performance. This will increase compile time. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + + // Spilling is generally expensive on all RISC-V cores, so always enable + // register-pressure tracking. This will increase compile time. + Policy.ShouldTrackPressure = true; +} diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index f59a3737ae76f..f2c0a3d85c998 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -327,6 +327,9 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { unsigned getTailDupAggressiveThreshold() const { return TuneInfo->TailDupAggressiveThreshold; } + + void overrideSchedPolicy(MachineSchedPolicy &Policy, + unsigned NumRegionInstrs) const override; }; } // End llvm namespace diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll index 330f8b16065f1..ee414992a5245 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll @@ -25,8 +25,8 @@ define i32 @add_i8_signext_i32(i8 %a, i8 %b) { ; RV32IM-LABEL: add_i8_signext_i32: ; RV32IM: # %bb.0: # %entry ; RV32IM-NEXT: slli a0, a0, 24 -; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: slli a1, a1, 24 +; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: srai a1, a1, 24 ; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret @@ -34,8 +34,8 @@ define i32 @add_i8_signext_i32(i8 %a, i8 %b) { ; RV64IM-LABEL: add_i8_signext_i32: ; RV64IM: # %bb.0: # %entry ; RV64IM-NEXT: slli a0, a0, 56 -; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: slli a1, a1, 56 +; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: srai a1, a1, 56 ; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll index f33ba1d7a302e..bce6dfacf8e82 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll @@ -6,8 +6,8 @@ define i2 @bitreverse_i2(i2 %x) { ; RV32-LABEL: bitreverse_i2: ; RV32: # %bb.0: ; RV32-NEXT: slli a1, a0, 1 -; RV32-NEXT: andi a1, a1, 2 ; RV32-NEXT: andi a0, a0, 3 +; RV32-NEXT: andi a1, a1, 2 ; RV32-NEXT: srli a0, a0, 1 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: ret @@ -15,8 +15,8 @@ define i2 @bitreverse_i2(i2 %x) { ; RV64-LABEL: bitreverse_i2: ; RV64: # %bb.0: ; RV64-NEXT: slli a1, a0, 1 -; RV64-NEXT: andi a1, a1, 2 ; RV64-NEXT: andi a0, a0, 3 +; RV64-NEXT: andi a1, a1, 2 ; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: or a0, a1, a0 ; RV64-NEXT: ret @@ -28,8 +28,8 @@ define i3 @bitreverse_i3(i3 %x) { ; RV32-LABEL: bitreverse_i3: ; RV32: # %bb.0: ; RV32-NEXT: 
slli a1, a0, 2 -; RV32-NEXT: andi a1, a1, 4 ; RV32-NEXT: andi a0, a0, 7 +; RV32-NEXT: andi a1, a1, 4 ; RV32-NEXT: andi a2, a0, 2 ; RV32-NEXT: or a1, a1, a2 ; RV32-NEXT: srli a0, a0, 2 @@ -39,8 +39,8 @@ define i3 @bitreverse_i3(i3 %x) { ; RV64-LABEL: bitreverse_i3: ; RV64: # %bb.0: ; RV64-NEXT: slli a1, a0, 2 -; RV64-NEXT: andi a1, a1, 4 ; RV64-NEXT: andi a0, a0, 7 +; RV64-NEXT: andi a1, a1, 4 ; RV64-NEXT: andi a2, a0, 2 ; RV64-NEXT: or a1, a1, a2 ; RV64-NEXT: srli a0, a0, 2 @@ -54,11 +54,11 @@ define i4 @bitreverse_i4(i4 %x) { ; RV32-LABEL: bitreverse_i4: ; RV32: # %bb.0: ; RV32-NEXT: slli a1, a0, 3 -; RV32-NEXT: andi a1, a1, 8 ; RV32-NEXT: slli a2, a0, 1 +; RV32-NEXT: andi a0, a0, 15 +; RV32-NEXT: andi a1, a1, 8 ; RV32-NEXT: andi a2, a2, 4 ; RV32-NEXT: or a1, a1, a2 -; RV32-NEXT: andi a0, a0, 15 ; RV32-NEXT: srli a2, a0, 1 ; RV32-NEXT: andi a2, a2, 2 ; RV32-NEXT: or a1, a1, a2 @@ -69,11 +69,11 @@ define i4 @bitreverse_i4(i4 %x) { ; RV64-LABEL: bitreverse_i4: ; RV64: # %bb.0: ; RV64-NEXT: slli a1, a0, 3 -; RV64-NEXT: andi a1, a1, 8 ; RV64-NEXT: slli a2, a0, 1 +; RV64-NEXT: andi a0, a0, 15 +; RV64-NEXT: andi a1, a1, 8 ; RV64-NEXT: andi a2, a2, 4 ; RV64-NEXT: or a1, a1, a2 -; RV64-NEXT: andi a0, a0, 15 ; RV64-NEXT: srli a2, a0, 1 ; RV64-NEXT: andi a2, a2, 2 ; RV64-NEXT: or a1, a1, a2 @@ -88,21 +88,21 @@ define i7 @bitreverse_i7(i7 %x) { ; RV32-LABEL: bitreverse_i7: ; RV32: # %bb.0: ; RV32-NEXT: slli a1, a0, 6 -; RV32-NEXT: andi a1, a1, 64 ; RV32-NEXT: slli a2, a0, 4 +; RV32-NEXT: slli a3, a0, 2 +; RV32-NEXT: andi a0, a0, 127 +; RV32-NEXT: andi a1, a1, 64 ; RV32-NEXT: andi a2, a2, 32 +; RV32-NEXT: andi a3, a3, 16 ; RV32-NEXT: or a1, a1, a2 -; RV32-NEXT: slli a2, a0, 2 -; RV32-NEXT: andi a2, a2, 16 -; RV32-NEXT: andi a0, a0, 127 -; RV32-NEXT: andi a3, a0, 8 -; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: andi a2, a0, 8 +; RV32-NEXT: or a2, a3, a2 +; RV32-NEXT: srli a3, a0, 2 ; RV32-NEXT: or a1, a1, a2 -; RV32-NEXT: srli a2, a0, 2 -; RV32-NEXT: andi a2, a2, 4 -; RV32-NEXT: srli a3, a0, 4 -; RV32-NEXT: andi a3, a3, 2 -; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: srli a2, a0, 4 +; RV32-NEXT: andi a3, a3, 4 +; RV32-NEXT: andi a2, a2, 2 +; RV32-NEXT: or a2, a3, a2 ; RV32-NEXT: or a1, a1, a2 ; RV32-NEXT: srli a0, a0, 6 ; RV32-NEXT: or a0, a1, a0 @@ -111,21 +111,21 @@ define i7 @bitreverse_i7(i7 %x) { ; RV64-LABEL: bitreverse_i7: ; RV64: # %bb.0: ; RV64-NEXT: slli a1, a0, 6 -; RV64-NEXT: andi a1, a1, 64 ; RV64-NEXT: slli a2, a0, 4 +; RV64-NEXT: slli a3, a0, 2 +; RV64-NEXT: andi a0, a0, 127 +; RV64-NEXT: andi a1, a1, 64 ; RV64-NEXT: andi a2, a2, 32 +; RV64-NEXT: andi a3, a3, 16 ; RV64-NEXT: or a1, a1, a2 -; RV64-NEXT: slli a2, a0, 2 -; RV64-NEXT: andi a2, a2, 16 -; RV64-NEXT: andi a0, a0, 127 -; RV64-NEXT: andi a3, a0, 8 -; RV64-NEXT: or a2, a2, a3 +; RV64-NEXT: andi a2, a0, 8 +; RV64-NEXT: or a2, a3, a2 +; RV64-NEXT: srli a3, a0, 2 ; RV64-NEXT: or a1, a1, a2 -; RV64-NEXT: srli a2, a0, 2 -; RV64-NEXT: andi a2, a2, 4 -; RV64-NEXT: srli a3, a0, 4 -; RV64-NEXT: andi a3, a3, 2 -; RV64-NEXT: or a2, a2, a3 +; RV64-NEXT: srli a2, a0, 4 +; RV64-NEXT: andi a3, a3, 4 +; RV64-NEXT: andi a2, a2, 2 +; RV64-NEXT: or a2, a3, a2 ; RV64-NEXT: or a1, a1, a2 ; RV64-NEXT: srli a0, a0, 6 ; RV64-NEXT: or a0, a1, a0 @@ -139,33 +139,33 @@ define i24 @bitreverse_i24(i24 %x) { ; RV32: # %bb.0: ; RV32-NEXT: slli a1, a0, 16 ; RV32-NEXT: lui a2, 4096 +; RV32-NEXT: lui a3, 1048335 ; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: addi a3, a3, 240 ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: srli a0, a0, 16 ; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: lui 
a1, 1048335 -; RV32-NEXT: addi a1, a1, 240 -; RV32-NEXT: and a3, a1, a2 -; RV32-NEXT: and a3, a0, a3 -; RV32-NEXT: srli a3, a3, 4 +; RV32-NEXT: and a1, a3, a2 +; RV32-NEXT: and a1, a0, a1 ; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: or a0, a3, a0 -; RV32-NEXT: lui a1, 1047757 -; RV32-NEXT: addi a1, a1, -820 -; RV32-NEXT: and a3, a1, a2 -; RV32-NEXT: and a3, a0, a3 -; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: lui a3, 1047757 +; RV32-NEXT: addi a3, a3, -820 +; RV32-NEXT: srli a1, a1, 4 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: and a1, a3, a2 +; RV32-NEXT: and a1, a0, a1 ; RV32-NEXT: slli a0, a0, 2 -; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: or a0, a3, a0 -; RV32-NEXT: lui a1, 1047211 -; RV32-NEXT: addi a1, a1, -1366 -; RV32-NEXT: and a2, a1, a2 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: lui a3, 1047211 +; RV32-NEXT: addi a3, a3, -1366 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: and a2, a0, a2 -; RV32-NEXT: srli a2, a2, 1 ; RV32-NEXT: slli a0, a0, 1 -; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: srli a2, a2, 1 +; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: or a0, a2, a0 ; RV32-NEXT: ret ; @@ -173,33 +173,33 @@ define i24 @bitreverse_i24(i24 %x) { ; RV64: # %bb.0: ; RV64-NEXT: slli a1, a0, 16 ; RV64-NEXT: lui a2, 4096 +; RV64-NEXT: lui a3, 1048335 ; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: addiw a3, a3, 240 ; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: srli a0, a0, 16 ; RV64-NEXT: or a0, a0, a1 -; RV64-NEXT: lui a1, 1048335 -; RV64-NEXT: addiw a1, a1, 240 -; RV64-NEXT: and a3, a1, a2 -; RV64-NEXT: and a3, a0, a3 -; RV64-NEXT: srli a3, a3, 4 +; RV64-NEXT: and a1, a3, a2 +; RV64-NEXT: and a1, a0, a1 ; RV64-NEXT: slli a0, a0, 4 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: or a0, a3, a0 -; RV64-NEXT: lui a1, 1047757 -; RV64-NEXT: addiw a1, a1, -820 -; RV64-NEXT: and a3, a1, a2 -; RV64-NEXT: and a3, a0, a3 -; RV64-NEXT: srli a3, a3, 2 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: lui a3, 1047757 +; RV64-NEXT: addiw a3, a3, -820 +; RV64-NEXT: srli a1, a1, 4 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: and a1, a3, a2 +; RV64-NEXT: and a1, a0, a1 ; RV64-NEXT: slli a0, a0, 2 -; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: or a0, a3, a0 -; RV64-NEXT: lui a1, 1047211 -; RV64-NEXT: addiw a1, a1, -1366 -; RV64-NEXT: and a2, a1, a2 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: lui a3, 1047211 +; RV64-NEXT: addiw a3, a3, -1366 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: srli a1, a1, 2 +; RV64-NEXT: or a0, a1, a0 ; RV64-NEXT: and a2, a0, a2 -; RV64-NEXT: srli a2, a2, 1 ; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: srli a2, a2, 1 +; RV64-NEXT: and a0, a0, a3 ; RV64-NEXT: or a0, a2, a0 ; RV64-NEXT: ret %rev = call i24 @llvm.bitreverse.i24(i24 %x) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll index 70d1b25309c84..cf7cef83bcc13 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll @@ -21,34 +21,34 @@ define void @constant_fold_barrier_i128(ptr %p) { ; RV32-LABEL: constant_fold_barrier_i128: ; RV32: # %bb.0: # %entry ; RV32-NEXT: li a1, 1 -; RV32-NEXT: slli a1, a1, 11 ; RV32-NEXT: lw a2, 0(a0) ; RV32-NEXT: lw a3, 4(a0) ; RV32-NEXT: lw a4, 8(a0) ; RV32-NEXT: lw a5, 12(a0) +; RV32-NEXT: slli a1, a1, 11 ; RV32-NEXT: and a2, a2, a1 ; RV32-NEXT: and a3, a3, zero ; RV32-NEXT: and a4, a4, zero ; RV32-NEXT: and a5, a5, zero ; RV32-NEXT: add a2, a2, a1 -; RV32-NEXT: 
sltu a1, a2, a1 ; RV32-NEXT: add a6, a3, zero +; RV32-NEXT: sltu a1, a2, a1 ; RV32-NEXT: sltu a3, a6, a3 ; RV32-NEXT: add a6, a6, a1 ; RV32-NEXT: seqz a7, a6 ; RV32-NEXT: and a1, a7, a1 +; RV32-NEXT: add a7, a4, zero +; RV32-NEXT: add a5, a5, zero +; RV32-NEXT: sltu a4, a7, a4 ; RV32-NEXT: or a1, a3, a1 -; RV32-NEXT: add a3, a4, zero -; RV32-NEXT: sltu a4, a3, a4 -; RV32-NEXT: add a3, a3, a1 -; RV32-NEXT: seqz a7, a3 -; RV32-NEXT: and a1, a7, a1 +; RV32-NEXT: add a7, a7, a1 +; RV32-NEXT: seqz a3, a7 +; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: or a1, a4, a1 -; RV32-NEXT: add a5, a5, zero ; RV32-NEXT: add a1, a5, a1 ; RV32-NEXT: sw a2, 0(a0) ; RV32-NEXT: sw a6, 4(a0) -; RV32-NEXT: sw a3, 8(a0) +; RV32-NEXT: sw a7, 8(a0) ; RV32-NEXT: sw a1, 12(a0) ; RV32-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll index 51e8b6da39d09..2c3e3faddc391 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll @@ -21,9 +21,9 @@ define i128 @constant_fold_barrier_i128(i128 %x) { ; RV64-LABEL: constant_fold_barrier_i128: ; RV64: # %bb.0: # %entry ; RV64-NEXT: li a2, 1 +; RV64-NEXT: and a1, a1, zero ; RV64-NEXT: slli a2, a2, 11 ; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: and a1, a1, zero ; RV64-NEXT: add a0, a0, a2 ; RV64-NEXT: sltu a2, a0, a2 ; RV64-NEXT: add a1, a1, zero diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll index 05989c310541b..1156edffe9194 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll @@ -117,8 +117,8 @@ define i64 @abs64(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: srai a2, a1, 31 ; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: sltu a3, a0, a2 ; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: sltu a3, a0, a2 ; RV32I-NEXT: add a1, a1, a3 ; RV32I-NEXT: xor a0, a0, a2 ; RV32I-NEXT: xor a1, a1, a2 @@ -128,8 +128,8 @@ define i64 @abs64(i64 %x) { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: srai a2, a1, 31 ; RV32ZBB-NEXT: add a0, a0, a2 -; RV32ZBB-NEXT: sltu a3, a0, a2 ; RV32ZBB-NEXT: add a1, a1, a2 +; RV32ZBB-NEXT: sltu a3, a0, a2 ; RV32ZBB-NEXT: add a1, a1, a3 ; RV32ZBB-NEXT: xor a0, a0, a2 ; RV32ZBB-NEXT: xor a1, a1, a2 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll index c558639fda424..68bf9240ccd1d 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll @@ -302,8 +302,8 @@ define i64 @rori_i64(i64 %a) nounwind { ; CHECK-NEXT: slli a2, a0, 31 ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: slli a3, a1, 31 -; CHECK-NEXT: or a0, a0, a3 ; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: or a0, a0, a3 ; CHECK-NEXT: or a1, a2, a1 ; CHECK-NEXT: ret %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll index 1184905c17ede..7f22127ad3536 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll @@ -12,31 +12,31 @@ define i32 @ctlz_i32(i32 %a) nounwind { ; RV32I-NEXT: beqz a0, .LBB0_2 ; RV32I-NEXT: # %bb.1: # %cond.false ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, 
a0, 16 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: addi a1, a2, 1365 +; RV32I-NEXT: srli a2, a0, 2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 16 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 ; RV32I-NEXT: lui a2, 209715 ; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: addi a2, a2, -241 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: slli a1, a0, 8 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: slli a1, a0, 16 @@ -63,11 +63,11 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-LABEL: ctlz_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: lui a6, 61681 ; RV32I-NEXT: addi a5, a2, 1365 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a4, a2, 819 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a3, a2, -241 +; RV32I-NEXT: addi a4, a3, 819 +; RV32I-NEXT: addi a3, a6, -241 ; RV32I-NEXT: li a2, 32 ; RV32I-NEXT: beqz a1, .LBB1_2 ; RV32I-NEXT: # %bb.1: @@ -155,22 +155,22 @@ define i32 @cttz_i32(i32 %a) nounwind { ; RV32I-NEXT: # %bb.1: # %cond.false ; RV32I-NEXT: not a1, a0 ; RV32I-NEXT: addi a0, a0, -1 -; RV32I-NEXT: and a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 2 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: addi a1, a2, 1365 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 ; RV32I-NEXT: lui a2, 209715 ; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 61681 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: addi a1, a2, -241 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: slli a1, a0, 8 ; RV32I-NEXT: add a0, a0, a1 @@ -196,11 +196,11 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV32I-LABEL: cttz_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: lui a5, 61681 ; RV32I-NEXT: addi a4, a2, 1365 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a3, a2, 819 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: addi a3, a3, 819 +; RV32I-NEXT: addi a2, a5, -241 ; RV32I-NEXT: beqz a0, .LBB3_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: not a1, a0 @@ -271,17 +271,17 @@ define i32 @ctpop_i32(i32 %a) nounwind { ; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: addi a2, a2, 1365 ; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: lui a2, 209715 ; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 +; 
RV32I-NEXT: lui a2, 61681 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: addi a1, a2, -241 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: slli a1, a0, 8 ; RV32I-NEXT: add a0, a0, a1 @@ -305,39 +305,39 @@ define i64 @ctpop_i64(i64 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 1 ; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: srli a5, a1, 1 ; RV32I-NEXT: addi a3, a3, 1365 ; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: and a3, a5, a3 +; RV32I-NEXT: lui a5, 61681 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: addi a5, a5, -241 ; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a5, a0, 8 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: slli a5, a0, 16 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a5, a1, 1 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: srli a3, a1, 2 -; RV32I-NEXT: and a3, a3, a4 ; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: srli a2, a0, 4 ; RV32I-NEXT: srli a3, a1, 4 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: slli a2, a1, 8 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: slli a2, a1, 16 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a1, 8 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: slli a3, a1, 16 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: li a1, 0 @@ -364,39 +364,39 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind { ; RV32I-NEXT: .LBB6_2: ; RV32I-NEXT: srli a2, a0, 1 ; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: srli a5, a1, 1 ; RV32I-NEXT: addi a3, a3, 1365 ; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: and a3, a5, a3 +; RV32I-NEXT: lui a5, 61681 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: addi a5, a5, -241 ; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a5, a0, 8 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: slli a5, a0, 16 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a5, a1, 1 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: srli a3, a1, 2 -; RV32I-NEXT: and a3, a3, a4 ; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: srli a2, a0, 4 ; RV32I-NEXT: srli a3, a1, 4 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: add a1, a3, 
a1 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: slli a2, a1, 8 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: slli a2, a1, 16 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a1, 8 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: slli a3, a1, 16 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: sltiu a0, a0, 2 @@ -429,39 +429,39 @@ define i1 @ctpop_i64_ugt_one(i64 %a) nounwind { ; RV32I-NEXT: .LBB7_2: ; RV32I-NEXT: srli a2, a0, 1 ; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: srli a5, a1, 1 ; RV32I-NEXT: addi a3, a3, 1365 ; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: and a3, a5, a3 +; RV32I-NEXT: lui a5, 61681 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: addi a5, a5, -241 ; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a5, a0, 8 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: slli a5, a0, 16 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a5, a1, 1 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: srli a3, a1, 2 -; RV32I-NEXT: and a3, a3, a4 ; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: srli a2, a0, 4 ; RV32I-NEXT: srli a3, a1, 4 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: slli a2, a1, 8 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: slli a2, a1, 16 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a1, 8 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: slli a3, a1, 16 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: sltiu a0, a0, 2 @@ -491,39 +491,39 @@ define i1 @ctpop_i64_eq_one(i64 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 1 ; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: srli a5, a1, 1 ; RV32I-NEXT: addi a3, a3, 1365 ; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: and a3, a5, a3 +; RV32I-NEXT: lui a5, 61681 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: addi a5, a5, -241 ; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a5, a0, 8 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: slli a5, a0, 16 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a5, a1, 1 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: srli a3, a1, 2 
-; RV32I-NEXT: and a3, a3, a4 ; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: srli a2, a0, 4 ; RV32I-NEXT: srli a3, a1, 4 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: slli a2, a1, 8 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: slli a2, a1, 16 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a1, 8 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: slli a3, a1, 16 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: xori a0, a0, 1 @@ -548,39 +548,39 @@ define i1 @ctpop_i64_ne_one(i64 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 1 ; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: srli a5, a1, 1 ; RV32I-NEXT: addi a3, a3, 1365 ; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: and a3, a5, a3 +; RV32I-NEXT: lui a5, 61681 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: addi a5, a5, -241 ; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: slli a5, a0, 8 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: slli a5, a0, 16 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a5, a1, 1 -; RV32I-NEXT: and a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: srli a3, a1, 2 -; RV32I-NEXT: and a3, a3, a4 ; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: srli a2, a0, 4 ; RV32I-NEXT: srli a3, a1, 4 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: slli a2, a1, 8 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: slli a2, a1, 16 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a1, 8 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: slli a3, a1, 16 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: xori a0, a0, 1 @@ -872,8 +872,8 @@ define i64 @abs_i64(i64 %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: srai a2, a1, 31 ; CHECK-NEXT: add a0, a0, a2 -; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: add a1, a1, a3 ; CHECK-NEXT: xor a0, a0, a2 ; CHECK-NEXT: xor a1, a1, a2 @@ -923,15 +923,15 @@ define i32 @bswap_i32(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 24 ; RV32I-NEXT: srli a2, a0, 24 +; RV32I-NEXT: lui a3, 16 ; RV32I-NEXT: or a1, a2, a1 -; RV32I-NEXT: lui a2, 16 -; RV32I-NEXT: addi a2, a2, -256 -; RV32I-NEXT: and a3, a0, a2 -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: srli a0, a0, 8 -; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: addi a3, a3, -256 +; RV32I-NEXT: and a0, 
a0, a3 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: bswap_i32: @@ -949,25 +949,24 @@ define i64 @bswap_i64(i64 %a) { ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a1, 24 ; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: lui a4, 16 +; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: slli a6, a0, 24 ; RV32I-NEXT: or a2, a3, a2 -; RV32I-NEXT: lui a3, 16 -; RV32I-NEXT: addi a3, a3, -256 -; RV32I-NEXT: and a4, a1, a3 -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: and a1, a1, a3 -; RV32I-NEXT: or a1, a2, a1 -; RV32I-NEXT: or a2, a1, a4 -; RV32I-NEXT: slli a1, a0, 24 -; RV32I-NEXT: srli a4, a0, 24 -; RV32I-NEXT: or a1, a4, a1 -; RV32I-NEXT: and a4, a0, a3 -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: srli a0, a0, 8 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: or a1, a0, a4 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: srli a3, a0, 24 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: srli a6, a0, 8 +; RV32I-NEXT: addi a4, a4, -256 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a5, a5, a4 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: and a4, a6, a4 +; RV32I-NEXT: or a2, a2, a5 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: slli a5, a0, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: or a0, a2, a1 +; RV32I-NEXT: or a1, a3, a5 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: bswap_i64: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll index 80e43c94aab0e..a647eae82dddf 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll @@ -111,8 +111,8 @@ define i32 @packh_i32(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: packh_i32: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a2, 16 -; CHECK-NEXT: addi a2, a2, -256 ; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: addi a2, a2, -256 ; CHECK-NEXT: slli a1, a1, 8 ; CHECK-NEXT: and a1, a1, a2 ; CHECK-NEXT: or a0, a1, a0 @@ -149,8 +149,8 @@ define i64 @packh_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: packh_i64: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, 16 -; CHECK-NEXT: addi a1, a1, -256 ; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: addi a1, a1, -256 ; CHECK-NEXT: slli a2, a2, 8 ; CHECK-NEXT: and a1, a2, a1 ; CHECK-NEXT: or a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll index 2bd0c78659b00..9584270d8e66f 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll @@ -1025,8 +1025,8 @@ define i64 @pack_i64_2(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: pack_i64_2: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a1, a0 @@ -1337,8 +1337,8 @@ define i64 @array_index_lshr_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) { ; RV64I-LABEL: array_index_lshr_sh3_sh3: ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a1, 58 -; RV64I-NEXT: slli a1, a1, 6 ; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: slli a1, a1, 6 ; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ld a0, 0(a0) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll index 03f8eff90c23b..961811d3b623c 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll @@ 
-17,32 +17,32 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srliw a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: addiw a1, a1, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 @@ -75,32 +75,32 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: beqz a1, .LBB1_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: addiw a1, a1, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 @@ -142,32 +142,32 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-NEXT: beqz a2, .LBB2_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw 
a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: addiw a1, a1, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 @@ -202,35 +202,35 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: li s0, -1 ; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: srliw a2, a0, 1 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: addiw a2, a3, 1365 +; RV64I-NEXT: srliw a3, a0, 2 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srliw a3, a0, 4 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srliw a3, a0, 8 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srliw a3, a0, 16 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srliw a3, a0, 1 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: addiw a3, a3, 819 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lui a2, 4112 ; RV64I-NEXT: srli s1, a1, 32 -; RV64I-NEXT: srliw a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addiw a1, a3, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: addiw a1, a2, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: beqz s1, .LBB3_2 ; RV64I-NEXT: # %bb.1: @@ -276,32 +276,32 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: .cfi_offset ra, -8 ; RV64I-NEXT: srliw a0, a0, 2 -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 
-; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srli a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: addiw a1, a1, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 @@ -334,29 +334,27 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: beqz a0, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 21845 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addi a1, a2, 1365 +; RV64I-NEXT: srli a2, a0, 2 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: addi a1, a1, 1365 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: addi a1, a1, 1365 +; RV64I-NEXT: srli a2, a0, 8 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: addi a1, a1, 1365 +; RV64I-NEXT: srli a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srli a2, a0, 32 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 13107 ; RV64I-NEXT: addi a2, a2, 819 ; RV64I-NEXT: slli a2, a2, 12 @@ -365,20 +363,22 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: addi a2, a2, 819 ; RV64I-NEXT: slli a2, a2, 12 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 3855 +; RV64I-NEXT: addi a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -241 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 3855 -; RV64I-NEXT: addi a1, a1, 241 -; RV64I-NEXT: slli a1, 
a1, 12 -; RV64I-NEXT: addi a1, a1, -241 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, 241 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 16 @@ -414,24 +414,24 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: not a1, a0 ; RV64I-NEXT: addi a0, a0, -1 -; RV64I-NEXT: and a0, a1, a0 -; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: addiw a1, a1, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 @@ -457,24 +457,24 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: not a1, a0 ; RV64I-NEXT: addi a0, a0, -1 -; RV64I-NEXT: and a0, a1, a0 -; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: addiw a1, a1, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 @@ -501,24 +501,24 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { ; RV64I-NEXT: li s0, -1 ; RV64I-NEXT: not a0, a0 ; RV64I-NEXT: addi a1, s1, -1 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a2, a2, 
-241 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: addiw a1, a1, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: slli s1, s1, 32 @@ -560,35 +560,35 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: not a0, a0 ; RV64I-NEXT: addi a1, s0, -1 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: addiw a1, a1, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: slli s0, s0, 32 ; RV64I-NEXT: srli s0, s0, 32 -; RV64I-NEXT: mv a1, a0 -; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: beqz s0, .LBB9_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: srliw a0, a1, 24 -; RV64I-NEXT: addiw a0, a0, 1 +; RV64I-NEXT: srliw a1, a0, 24 +; RV64I-NEXT: addiw a1, a1, 1 ; RV64I-NEXT: .LBB9_2: +; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 @@ -622,19 +622,17 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: not a1, a0 ; RV64I-NEXT: addi a0, a0, -1 -; RV64I-NEXT: and a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 21845 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: addi a1, a2, 1365 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 1365 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 1365 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 1365 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 13107 ; RV64I-NEXT: addi a2, a2, 819 ; RV64I-NEXT: slli a2, a2, 12 @@ -643,19 +641,21 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV64I-NEXT: addi a2, a2, 819 ; RV64I-NEXT: slli a2, a2, 12 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 3855 +; RV64I-NEXT: addi a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 3855 -; RV64I-NEXT: addi a1, a1, 241 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, -241 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, 241 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and 
a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -688,19 +688,19 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: addiw a1, a1, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 @@ -725,19 +725,19 @@ define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: addiw a1, a1, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 @@ -762,23 +762,23 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind { ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lwu a0, 0(a0) -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: addiw a1, a1, 257 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 @@ -811,8 +811,6 @@ define i64 @ctpop_i64(i64 %a) nounwind { ; RV64I-NEXT: slli a2, a2, 12 ; RV64I-NEXT: addi a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 2 ; RV64I-NEXT: lui a2, 13107 ; RV64I-NEXT: addi a2, a2, 819 ; RV64I-NEXT: slli a2, a2, 12 @@ -821,19 +819,21 @@ define i64 @ctpop_i64(i64 %a) nounwind { ; RV64I-NEXT: addi a2, a2, 819 ; RV64I-NEXT: slli a2, a2, 12 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 3855 +; RV64I-NEXT: addi a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 
-241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lui a1, 3855 -; RV64I-NEXT: addi a1, a1, 241 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, -241 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, 241 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -998,8 +998,8 @@ define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: minu_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 32 -; RV64I-NEXT: srli a2, a2, 32 ; RV64I-NEXT: slli a3, a1, 32 +; RV64I-NEXT: srli a2, a2, 32 ; RV64I-NEXT: srli a3, a3, 32 ; RV64I-NEXT: bltu a2, a3, .LBB23_2 ; RV64I-NEXT: # %bb.1: @@ -1011,8 +1011,8 @@ define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64ZBB-LABEL: minu_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a0, a0, 32 -; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: slli a1, a1, 32 +; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: minu a0, a0, a1 ; RV64ZBB-NEXT: sext.w a0, a0 @@ -1046,8 +1046,8 @@ define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: maxu_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 32 -; RV64I-NEXT: srli a2, a2, 32 ; RV64I-NEXT: slli a3, a1, 32 +; RV64I-NEXT: srli a2, a2, 32 ; RV64I-NEXT: srli a3, a3, 32 ; RV64I-NEXT: bltu a3, a2, .LBB25_2 ; RV64I-NEXT: # %bb.1: @@ -1059,8 +1059,8 @@ define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64ZBB-LABEL: maxu_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a0, a0, 32 -; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: slli a1, a1, 32 +; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: maxu a0, a0, a1 ; RV64ZBB-NEXT: sext.w a0, a0 @@ -1186,15 +1186,15 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: slliw a1, a0, 24 ; RV64I-NEXT: srliw a2, a0, 24 +; RV64I-NEXT: lui a3, 16 ; RV64I-NEXT: or a1, a2, a1 -; RV64I-NEXT: lui a2, 16 -; RV64I-NEXT: addiw a2, a2, -256 -; RV64I-NEXT: and a3, a0, a2 -; RV64I-NEXT: slliw a3, a3, 8 -; RV64I-NEXT: srliw a0, a0, 8 -; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: addiw a3, a3, -256 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: slliw a0, a0, 8 +; RV64I-NEXT: or a1, a1, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: bswap_i32: @@ -1213,15 +1213,15 @@ define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 24 ; RV64I-NEXT: srliw a3, a0, 24 +; RV64I-NEXT: lui a4, 16 ; RV64I-NEXT: or a2, a3, a2 -; RV64I-NEXT: lui a3, 16 -; RV64I-NEXT: addi a3, a3, -256 -; RV64I-NEXT: and a4, a0, a3 -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: srliw a0, a0, 8 -; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: srliw a3, a0, 8 +; RV64I-NEXT: addi a4, a4, -256 +; RV64I-NEXT: and a0, a0, a4 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: or a2, a2, a3 ; RV64I-NEXT: or a0, a2, a0 -; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: sw a0, 0(a1) ; RV64I-NEXT: ret ; @@ -1243,29 +1243,29 @@ define i64 @bswap_i64(i64 %a) { ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 56 ; RV64I-NEXT: srli a2, a0, 56 -; RV64I-NEXT: or a1, a2, a1 -; RV64I-NEXT: lui a2, 16 -; 
RV64I-NEXT: addiw a2, a2, -256 -; RV64I-NEXT: and a3, a0, a2 -; RV64I-NEXT: slli a3, a3, 40 +; RV64I-NEXT: lui a3, 16 ; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: and a2, a4, a2 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: lui a2, 4080 -; RV64I-NEXT: and a3, a0, a2 -; RV64I-NEXT: slli a3, a3, 24 +; RV64I-NEXT: addiw a3, a3, -256 +; RV64I-NEXT: and a4, a4, a3 +; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: srli a4, a0, 24 -; RV64I-NEXT: and a2, a4, a2 -; RV64I-NEXT: or a2, a3, a2 -; RV64I-NEXT: lui a3, 1044480 -; RV64I-NEXT: and a4, a0, a3 -; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: and a4, a4, a2 +; RV64I-NEXT: and a2, a0, a2 +; RV64I-NEXT: slli a2, a2, 24 ; RV64I-NEXT: or a2, a2, a4 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srli a0, a0, 8 -; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: lui a4, 1044480 +; RV64I-NEXT: and a3, a0, a3 +; RV64I-NEXT: slli a3, a3, 40 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srli a3, a0, 8 +; RV64I-NEXT: and a0, a0, a4 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: bswap_i64: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll index 5cf2619a476bc..338476a1bec83 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll @@ -98,8 +98,8 @@ define i64 @pack_i64_2(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: pack_i64_2: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a1, a0 @@ -147,8 +147,8 @@ define signext i32 @packh_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: packh_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 16 -; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: andi a0, a0, 255 +; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: slli a1, a1, 8 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -157,8 +157,8 @@ define signext i32 @packh_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64ZBKB-LABEL: packh_i32: ; RV64ZBKB: # %bb.0: ; RV64ZBKB-NEXT: lui a2, 16 -; RV64ZBKB-NEXT: addiw a2, a2, -256 ; RV64ZBKB-NEXT: andi a0, a0, 255 +; RV64ZBKB-NEXT: addiw a2, a2, -256 ; RV64ZBKB-NEXT: slli a1, a1, 8 ; RV64ZBKB-NEXT: and a1, a1, a2 ; RV64ZBKB-NEXT: or a0, a1, a0 @@ -195,8 +195,8 @@ define i64 @packh_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: packh_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 16 -; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: andi a0, a0, 255 +; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: slli a1, a1, 8 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -205,8 +205,8 @@ define i64 @packh_i64(i64 %a, i64 %b) nounwind { ; RV64ZBKB-LABEL: packh_i64: ; RV64ZBKB: # %bb.0: ; RV64ZBKB-NEXT: lui a2, 16 -; RV64ZBKB-NEXT: addiw a2, a2, -256 ; RV64ZBKB-NEXT: andi a0, a0, 255 +; RV64ZBKB-NEXT: addiw a2, a2, -256 ; RV64ZBKB-NEXT: slli a1, a1, 8 ; RV64ZBKB-NEXT: and a1, a1, a2 ; RV64ZBKB-NEXT: or a0, a1, a0 @@ -307,10 +307,10 @@ define i64 @pack_i64_allWUsers(i32 signext %0, i32 signext %1, i32 signext %2) { ; RV64I-LABEL: pack_i64_allWUsers: ; RV64I: # %bb.0: ; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: slli a2, a2, 32 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: slli a2, a2, 32 ; RV64I-NEXT: srli 
a2, a2, 32 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll index 972cc9d66bfb7..fc9be94988451 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll @@ -524,8 +524,8 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ILP32-NEXT: sw a6, 40(sp) ; ILP32-NEXT: sw a7, 44(sp) ; ILP32-NEXT: addi a1, a0, 7 -; ILP32-NEXT: andi a1, a1, -8 ; ILP32-NEXT: addi a0, a0, 15 +; ILP32-NEXT: andi a1, a1, -8 ; ILP32-NEXT: sw a0, 12(sp) ; ILP32-NEXT: lw a0, 0(a1) ; ILP32-NEXT: lw a1, 4(a1) @@ -618,8 +618,8 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; RV64-NEXT: sd a6, 64(sp) ; RV64-NEXT: sd a7, 72(sp) ; RV64-NEXT: addi a1, a0, 7 -; RV64-NEXT: andi a1, a1, -8 ; RV64-NEXT: addi a0, a0, 15 +; RV64-NEXT: andi a1, a1, -8 ; RV64-NEXT: sd a0, 8(sp) ; RV64-NEXT: ld a0, 0(a1) ; RV64-NEXT: addi sp, sp, 80 @@ -642,8 +642,8 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; RV32-WITHFP-NEXT: sw a6, 24(s0) ; RV32-WITHFP-NEXT: sw a7, 28(s0) ; RV32-WITHFP-NEXT: addi a1, a0, 7 -; RV32-WITHFP-NEXT: andi a1, a1, -8 ; RV32-WITHFP-NEXT: addi a0, a0, 15 +; RV32-WITHFP-NEXT: andi a1, a1, -8 ; RV32-WITHFP-NEXT: sw a0, -12(s0) ; RV32-WITHFP-NEXT: lw a0, 0(a1) ; RV32-WITHFP-NEXT: lw a1, 4(a1) @@ -669,8 +669,8 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; RV64-WITHFP-NEXT: sd a6, 48(s0) ; RV64-WITHFP-NEXT: sd a7, 56(s0) ; RV64-WITHFP-NEXT: addi a1, a0, 7 -; RV64-WITHFP-NEXT: andi a1, a1, -8 ; RV64-WITHFP-NEXT: addi a0, a0, 15 +; RV64-WITHFP-NEXT: andi a1, a1, -8 ; RV64-WITHFP-NEXT: sd a0, -24(s0) ; RV64-WITHFP-NEXT: ld a0, 0(a1) ; RV64-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -863,8 +863,8 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ILP32-NEXT: sw a6, 24(sp) ; ILP32-NEXT: sw a7, 28(sp) ; ILP32-NEXT: addi a3, a0, 7 -; ILP32-NEXT: andi a3, a3, -8 ; ILP32-NEXT: addi a0, a0, 15 +; ILP32-NEXT: andi a3, a3, -8 ; ILP32-NEXT: sw a0, 4(sp) ; ILP32-NEXT: lw a4, 0(a3) ; ILP32-NEXT: lw a3, 4(a3) @@ -966,8 +966,8 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; RV64-NEXT: sd a6, 48(sp) ; RV64-NEXT: sd a7, 56(sp) ; RV64-NEXT: addi a2, a0, 7 -; RV64-NEXT: andi a2, a2, -8 ; RV64-NEXT: addi a0, a0, 15 +; RV64-NEXT: andi a2, a2, -8 ; RV64-NEXT: sd a0, 8(sp) ; RV64-NEXT: ld a0, 0(a2) ; RV64-NEXT: add a0, a1, a0 @@ -989,8 +989,8 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; RV32-WITHFP-NEXT: sw a6, 16(s0) ; RV32-WITHFP-NEXT: sw a7, 20(s0) ; RV32-WITHFP-NEXT: addi a3, a0, 7 -; RV32-WITHFP-NEXT: andi a3, a3, -8 ; RV32-WITHFP-NEXT: addi a0, a0, 15 +; RV32-WITHFP-NEXT: andi a3, a3, -8 ; RV32-WITHFP-NEXT: sw a0, -12(s0) ; RV32-WITHFP-NEXT: lw a4, 0(a3) ; RV32-WITHFP-NEXT: lw a3, 4(a3) @@ -1019,8 +1019,8 @@ define i64 @va3(i32 %a, i64 %b, ...) 
nounwind { ; RV64-WITHFP-NEXT: sd a6, 32(s0) ; RV64-WITHFP-NEXT: sd a7, 40(s0) ; RV64-WITHFP-NEXT: addi a2, a0, 7 -; RV64-WITHFP-NEXT: andi a2, a2, -8 ; RV64-WITHFP-NEXT: addi a0, a0, 15 +; RV64-WITHFP-NEXT: andi a2, a2, -8 ; RV64-WITHFP-NEXT: sd a0, -24(s0) ; RV64-WITHFP-NEXT: ld a0, 0(a2) ; RV64-WITHFP-NEXT: add a0, a1, a0 @@ -1169,9 +1169,9 @@ define void @va3_caller() nounwind { ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a2, a0, -480 +; RV64-NEXT: lui a1, 5 ; RV64-NEXT: li a0, 2 +; RV64-NEXT: addiw a2, a1, -480 ; RV64-NEXT: li a1, 1111 ; RV64-NEXT: call va3 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -1201,9 +1201,9 @@ define void @va3_caller() nounwind { ; RV64-WITHFP-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-WITHFP-NEXT: sd s0, 0(sp) # 8-byte Folded Spill ; RV64-WITHFP-NEXT: addi s0, sp, 16 -; RV64-WITHFP-NEXT: lui a0, 5 -; RV64-WITHFP-NEXT: addiw a2, a0, -480 +; RV64-WITHFP-NEXT: lui a1, 5 ; RV64-WITHFP-NEXT: li a0, 2 +; RV64-WITHFP-NEXT: addiw a2, a1, -480 ; RV64-WITHFP-NEXT: li a1, 1111 ; RV64-WITHFP-NEXT: call va3 ; RV64-WITHFP-NEXT: ld ra, 8(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll index e7fd87bd78387..c9a48acb8d14a 100644 --- a/llvm/test/CodeGen/RISCV/abds-neg.ll +++ b/llvm/test/CodeGen/RISCV/abds-neg.ll @@ -11,8 +11,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: abd_ext_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -23,8 +23,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind { ; RV64I-LABEL: abd_ext_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 56 -; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -62,8 +62,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_ext_i8_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -74,8 +74,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_ext_i8_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 56 -; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -113,8 +113,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: abd_ext_i8_undef: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -125,8 +125,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind { ; RV64I-LABEL: abd_ext_i8_undef: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 56 -; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -164,8 +164,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_ext_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: 
srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -176,8 +176,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_ext_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 48 -; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -266,8 +266,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_ext_i16_undef: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -278,8 +278,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_ext_i16_undef: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 48 -; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -625,11 +625,11 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw t1, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw t0, 8(a2) ; RV32I-NEXT: lw t2, 12(a2) ; RV32I-NEXT: lw a1, 0(a2) ; RV32I-NEXT: lw a2, 4(a2) -; RV32I-NEXT: sltu t3, a7, a6 +; RV32I-NEXT: sltu t3, t0, a6 ; RV32I-NEXT: mv t4, t3 ; RV32I-NEXT: beq t1, t2, .LBB11_2 ; RV32I-NEXT: # %bb.1: @@ -637,19 +637,19 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB11_2: ; RV32I-NEXT: sltu a5, a1, a3 ; RV32I-NEXT: sltu t6, a2, a4 -; RV32I-NEXT: mv t0, a5 +; RV32I-NEXT: mv a7, a5 ; RV32I-NEXT: beq a4, a2, .LBB11_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: mv t0, t6 +; RV32I-NEXT: mv a7, t6 ; RV32I-NEXT: .LBB11_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: xor t5, t1, t2 -; RV32I-NEXT: xor s0, a6, a7 +; RV32I-NEXT: xor s0, a6, t0 ; RV32I-NEXT: or t5, s0, t5 ; RV32I-NEXT: beqz t5, .LBB11_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t0, t4 +; RV32I-NEXT: mv a7, t4 ; RV32I-NEXT: .LBB11_6: ; RV32I-NEXT: mv t5, a5 ; RV32I-NEXT: beq a2, a4, .LBB11_8 @@ -662,27 +662,27 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB11_10: -; RV32I-NEXT: bnez t0, .LBB11_12 +; RV32I-NEXT: bnez a7, .LBB11_12 ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub t1, t2, t1 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a7, t1, t3 +; RV32I-NEXT: sub a6, t0, a6 +; RV32I-NEXT: sub t0, t1, t3 ; RV32I-NEXT: sltu t1, a6, t5 -; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: sub t0, t0, t1 ; RV32I-NEXT: sub a6, a6, t5 ; RV32I-NEXT: j .LBB11_13 ; RV32I-NEXT: .LBB11_12: -; RV32I-NEXT: sltu t3, a6, a7 +; RV32I-NEXT: sltu t3, a6, t0 ; RV32I-NEXT: sub t1, t1, t2 -; RV32I-NEXT: sub t1, t1, t3 -; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t6 -; RV32I-NEXT: sub a7, t1, a7 +; RV32I-NEXT: sub a6, a6, t0 +; RV32I-NEXT: sub t0, t1, t3 +; RV32I-NEXT: sltu t1, a6, t6 +; RV32I-NEXT: sub t0, t0, t1 ; RV32I-NEXT: sub a6, a6, t6 ; RV32I-NEXT: .LBB11_13: ; RV32I-NEXT: snez t1, a6 -; RV32I-NEXT: add a7, a7, t1 -; RV32I-NEXT: bnez t0, .LBB11_15 +; RV32I-NEXT: add t0, t0, t1 +; RV32I-NEXT: bnez a7, .LBB11_15 ; RV32I-NEXT: # %bb.14: ; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: sub a2, a2, a5 @@ -694,20 +694,20 @@ 
define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: .LBB11_16: ; RV32I-NEXT: or a3, a1, a2 -; RV32I-NEXT: snez a3, a3 ; RV32I-NEXT: neg a4, a6 -; RV32I-NEXT: sltu a5, a4, a3 -; RV32I-NEXT: neg a6, a7 -; RV32I-NEXT: sub a5, a6, a5 +; RV32I-NEXT: neg a5, t0 ; RV32I-NEXT: snez a6, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: snez a3, a3 ; RV32I-NEXT: add a2, a2, a6 +; RV32I-NEXT: sltu a6, a4, a3 ; RV32I-NEXT: neg a2, a2 ; RV32I-NEXT: sub a4, a4, a3 -; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a3, a5, a6 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a2, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a5, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -744,11 +744,11 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a6, 8(a1) ; RV32ZBB-NEXT: lw t1, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw t0, 8(a2) ; RV32ZBB-NEXT: lw t2, 12(a2) ; RV32ZBB-NEXT: lw a1, 0(a2) ; RV32ZBB-NEXT: lw a2, 4(a2) -; RV32ZBB-NEXT: sltu t3, a7, a6 +; RV32ZBB-NEXT: sltu t3, t0, a6 ; RV32ZBB-NEXT: mv t4, t3 ; RV32ZBB-NEXT: beq t1, t2, .LBB11_2 ; RV32ZBB-NEXT: # %bb.1: @@ -756,19 +756,19 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB11_2: ; RV32ZBB-NEXT: sltu a5, a1, a3 ; RV32ZBB-NEXT: sltu t6, a2, a4 -; RV32ZBB-NEXT: mv t0, a5 +; RV32ZBB-NEXT: mv a7, a5 ; RV32ZBB-NEXT: beq a4, a2, .LBB11_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: mv t0, t6 +; RV32ZBB-NEXT: mv a7, t6 ; RV32ZBB-NEXT: .LBB11_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32ZBB-NEXT: xor t5, t1, t2 -; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: xor s0, a6, t0 ; RV32ZBB-NEXT: or t5, s0, t5 ; RV32ZBB-NEXT: beqz t5, .LBB11_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t0, t4 +; RV32ZBB-NEXT: mv a7, t4 ; RV32ZBB-NEXT: .LBB11_6: ; RV32ZBB-NEXT: mv t5, a5 ; RV32ZBB-NEXT: beq a2, a4, .LBB11_8 @@ -781,27 +781,27 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB11_10: -; RV32ZBB-NEXT: bnez t0, .LBB11_12 +; RV32ZBB-NEXT: bnez a7, .LBB11_12 ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub t1, t2, t1 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a7, t1, t3 +; RV32ZBB-NEXT: sub a6, t0, a6 +; RV32ZBB-NEXT: sub t0, t1, t3 ; RV32ZBB-NEXT: sltu t1, a6, t5 -; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: sub t0, t0, t1 ; RV32ZBB-NEXT: sub a6, a6, t5 ; RV32ZBB-NEXT: j .LBB11_13 ; RV32ZBB-NEXT: .LBB11_12: -; RV32ZBB-NEXT: sltu t3, a6, a7 +; RV32ZBB-NEXT: sltu t3, a6, t0 ; RV32ZBB-NEXT: sub t1, t1, t2 -; RV32ZBB-NEXT: sub t1, t1, t3 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t6 -; RV32ZBB-NEXT: sub a7, t1, a7 +; RV32ZBB-NEXT: sub a6, a6, t0 +; RV32ZBB-NEXT: sub t0, t1, t3 +; RV32ZBB-NEXT: sltu t1, a6, t6 +; RV32ZBB-NEXT: sub t0, t0, t1 ; RV32ZBB-NEXT: sub a6, a6, t6 ; RV32ZBB-NEXT: .LBB11_13: ; RV32ZBB-NEXT: snez t1, a6 -; RV32ZBB-NEXT: add a7, a7, t1 -; RV32ZBB-NEXT: bnez t0, .LBB11_15 +; RV32ZBB-NEXT: add t0, t0, t1 +; RV32ZBB-NEXT: bnez a7, .LBB11_15 ; RV32ZBB-NEXT: # %bb.14: ; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: sub a2, a2, a5 @@ -813,20 +813,20 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: .LBB11_16: ; RV32ZBB-NEXT: or a3, a1, a2 -; RV32ZBB-NEXT: snez a3, a3 ; RV32ZBB-NEXT: neg a4, a6 -; RV32ZBB-NEXT: sltu a5, a4, a3 -; 
RV32ZBB-NEXT: neg a6, a7 -; RV32ZBB-NEXT: sub a5, a6, a5 +; RV32ZBB-NEXT: neg a5, t0 ; RV32ZBB-NEXT: snez a6, a1 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: snez a3, a3 ; RV32ZBB-NEXT: add a2, a2, a6 +; RV32ZBB-NEXT: sltu a6, a4, a3 ; RV32ZBB-NEXT: neg a2, a2 ; RV32ZBB-NEXT: sub a4, a4, a3 -; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sub a3, a5, a6 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a2, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a5, 12(a0) +; RV32ZBB-NEXT: sw a3, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -872,11 +872,11 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw t1, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw t0, 8(a2) ; RV32I-NEXT: lw t2, 12(a2) ; RV32I-NEXT: lw a1, 0(a2) ; RV32I-NEXT: lw a2, 4(a2) -; RV32I-NEXT: sltu t3, a7, a6 +; RV32I-NEXT: sltu t3, t0, a6 ; RV32I-NEXT: mv t4, t3 ; RV32I-NEXT: beq t1, t2, .LBB12_2 ; RV32I-NEXT: # %bb.1: @@ -884,19 +884,19 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB12_2: ; RV32I-NEXT: sltu a5, a1, a3 ; RV32I-NEXT: sltu t6, a2, a4 -; RV32I-NEXT: mv t0, a5 +; RV32I-NEXT: mv a7, a5 ; RV32I-NEXT: beq a4, a2, .LBB12_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: mv t0, t6 +; RV32I-NEXT: mv a7, t6 ; RV32I-NEXT: .LBB12_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: xor t5, t1, t2 -; RV32I-NEXT: xor s0, a6, a7 +; RV32I-NEXT: xor s0, a6, t0 ; RV32I-NEXT: or t5, s0, t5 ; RV32I-NEXT: beqz t5, .LBB12_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t0, t4 +; RV32I-NEXT: mv a7, t4 ; RV32I-NEXT: .LBB12_6: ; RV32I-NEXT: mv t5, a5 ; RV32I-NEXT: beq a2, a4, .LBB12_8 @@ -909,27 +909,27 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB12_10: -; RV32I-NEXT: bnez t0, .LBB12_12 +; RV32I-NEXT: bnez a7, .LBB12_12 ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub t1, t2, t1 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a7, t1, t3 +; RV32I-NEXT: sub a6, t0, a6 +; RV32I-NEXT: sub t0, t1, t3 ; RV32I-NEXT: sltu t1, a6, t5 -; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: sub t0, t0, t1 ; RV32I-NEXT: sub a6, a6, t5 ; RV32I-NEXT: j .LBB12_13 ; RV32I-NEXT: .LBB12_12: -; RV32I-NEXT: sltu t3, a6, a7 +; RV32I-NEXT: sltu t3, a6, t0 ; RV32I-NEXT: sub t1, t1, t2 -; RV32I-NEXT: sub t1, t1, t3 -; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t6 -; RV32I-NEXT: sub a7, t1, a7 +; RV32I-NEXT: sub a6, a6, t0 +; RV32I-NEXT: sub t0, t1, t3 +; RV32I-NEXT: sltu t1, a6, t6 +; RV32I-NEXT: sub t0, t0, t1 ; RV32I-NEXT: sub a6, a6, t6 ; RV32I-NEXT: .LBB12_13: ; RV32I-NEXT: snez t1, a6 -; RV32I-NEXT: add a7, a7, t1 -; RV32I-NEXT: bnez t0, .LBB12_15 +; RV32I-NEXT: add t0, t0, t1 +; RV32I-NEXT: bnez a7, .LBB12_15 ; RV32I-NEXT: # %bb.14: ; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: sub a2, a2, a5 @@ -941,20 +941,20 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: .LBB12_16: ; RV32I-NEXT: or a3, a1, a2 -; RV32I-NEXT: snez a3, a3 ; RV32I-NEXT: neg a4, a6 -; RV32I-NEXT: sltu a5, a4, a3 -; RV32I-NEXT: neg a6, a7 -; RV32I-NEXT: sub a5, a6, a5 +; RV32I-NEXT: neg a5, t0 ; RV32I-NEXT: snez a6, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: snez a3, a3 ; RV32I-NEXT: add a2, a2, a6 +; RV32I-NEXT: sltu a6, a4, a3 ; RV32I-NEXT: neg a2, a2 ; RV32I-NEXT: sub a4, a4, a3 -; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a3, 
a5, a6 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a2, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a5, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -991,11 +991,11 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a6, 8(a1) ; RV32ZBB-NEXT: lw t1, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw t0, 8(a2) ; RV32ZBB-NEXT: lw t2, 12(a2) ; RV32ZBB-NEXT: lw a1, 0(a2) ; RV32ZBB-NEXT: lw a2, 4(a2) -; RV32ZBB-NEXT: sltu t3, a7, a6 +; RV32ZBB-NEXT: sltu t3, t0, a6 ; RV32ZBB-NEXT: mv t4, t3 ; RV32ZBB-NEXT: beq t1, t2, .LBB12_2 ; RV32ZBB-NEXT: # %bb.1: @@ -1003,19 +1003,19 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB12_2: ; RV32ZBB-NEXT: sltu a5, a1, a3 ; RV32ZBB-NEXT: sltu t6, a2, a4 -; RV32ZBB-NEXT: mv t0, a5 +; RV32ZBB-NEXT: mv a7, a5 ; RV32ZBB-NEXT: beq a4, a2, .LBB12_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: mv t0, t6 +; RV32ZBB-NEXT: mv a7, t6 ; RV32ZBB-NEXT: .LBB12_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32ZBB-NEXT: xor t5, t1, t2 -; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: xor s0, a6, t0 ; RV32ZBB-NEXT: or t5, s0, t5 ; RV32ZBB-NEXT: beqz t5, .LBB12_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t0, t4 +; RV32ZBB-NEXT: mv a7, t4 ; RV32ZBB-NEXT: .LBB12_6: ; RV32ZBB-NEXT: mv t5, a5 ; RV32ZBB-NEXT: beq a2, a4, .LBB12_8 @@ -1028,27 +1028,27 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB12_10: -; RV32ZBB-NEXT: bnez t0, .LBB12_12 +; RV32ZBB-NEXT: bnez a7, .LBB12_12 ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub t1, t2, t1 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a7, t1, t3 +; RV32ZBB-NEXT: sub a6, t0, a6 +; RV32ZBB-NEXT: sub t0, t1, t3 ; RV32ZBB-NEXT: sltu t1, a6, t5 -; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: sub t0, t0, t1 ; RV32ZBB-NEXT: sub a6, a6, t5 ; RV32ZBB-NEXT: j .LBB12_13 ; RV32ZBB-NEXT: .LBB12_12: -; RV32ZBB-NEXT: sltu t3, a6, a7 +; RV32ZBB-NEXT: sltu t3, a6, t0 ; RV32ZBB-NEXT: sub t1, t1, t2 -; RV32ZBB-NEXT: sub t1, t1, t3 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t6 -; RV32ZBB-NEXT: sub a7, t1, a7 +; RV32ZBB-NEXT: sub a6, a6, t0 +; RV32ZBB-NEXT: sub t0, t1, t3 +; RV32ZBB-NEXT: sltu t1, a6, t6 +; RV32ZBB-NEXT: sub t0, t0, t1 ; RV32ZBB-NEXT: sub a6, a6, t6 ; RV32ZBB-NEXT: .LBB12_13: ; RV32ZBB-NEXT: snez t1, a6 -; RV32ZBB-NEXT: add a7, a7, t1 -; RV32ZBB-NEXT: bnez t0, .LBB12_15 +; RV32ZBB-NEXT: add t0, t0, t1 +; RV32ZBB-NEXT: bnez a7, .LBB12_15 ; RV32ZBB-NEXT: # %bb.14: ; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: sub a2, a2, a5 @@ -1060,20 +1060,20 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: .LBB12_16: ; RV32ZBB-NEXT: or a3, a1, a2 -; RV32ZBB-NEXT: snez a3, a3 ; RV32ZBB-NEXT: neg a4, a6 -; RV32ZBB-NEXT: sltu a5, a4, a3 -; RV32ZBB-NEXT: neg a6, a7 -; RV32ZBB-NEXT: sub a5, a6, a5 +; RV32ZBB-NEXT: neg a5, t0 ; RV32ZBB-NEXT: snez a6, a1 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: snez a3, a3 ; RV32ZBB-NEXT: add a2, a2, a6 +; RV32ZBB-NEXT: sltu a6, a4, a3 ; RV32ZBB-NEXT: neg a2, a2 ; RV32ZBB-NEXT: sub a4, a4, a3 -; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sub a3, a5, a6 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a2, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a5, 12(a0) +; RV32ZBB-NEXT: sw a3, 12(a0) ; 
RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -1120,8 +1120,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: abd_minmax_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: mv a2, a0 ; RV32I-NEXT: bge a0, a1, .LBB13_3 @@ -1140,8 +1140,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind { ; RV64I-LABEL: abd_minmax_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 56 -; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: mv a2, a0 ; RV64I-NEXT: bge a0, a1, .LBB13_3 @@ -1175,8 +1175,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_minmax_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: mv a2, a0 ; RV32I-NEXT: bge a0, a1, .LBB14_3 @@ -1195,8 +1195,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_minmax_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: mv a2, a0 ; RV64I-NEXT: bge a0, a1, .LBB14_3 @@ -1449,26 +1449,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: mv a5, t0 ; RV32I-NEXT: mv a4, a7 ; RV32I-NEXT: .LBB17_19: -; RV32I-NEXT: sltu a6, t3, a4 -; RV32I-NEXT: sub a7, t4, a5 -; RV32I-NEXT: sltu a5, a2, a1 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: mv a7, a5 +; RV32I-NEXT: sltu a7, t3, a4 +; RV32I-NEXT: sub a5, t4, a5 +; RV32I-NEXT: sltu a6, a2, a1 +; RV32I-NEXT: sub a5, a5, a7 +; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beq t1, a3, .LBB17_21 ; RV32I-NEXT: # %bb.20: ; RV32I-NEXT: sltu a7, t1, a3 ; RV32I-NEXT: .LBB17_21: ; RV32I-NEXT: sub a4, t3, a4 -; RV32I-NEXT: sltu t0, a4, a7 -; RV32I-NEXT: sub a6, a6, t0 -; RV32I-NEXT: sub a4, a4, a7 ; RV32I-NEXT: sub a3, t1, a3 -; RV32I-NEXT: sub a3, a3, a5 ; RV32I-NEXT: sub a2, a2, a1 +; RV32I-NEXT: sltu a1, a4, a7 +; RV32I-NEXT: sub a4, a4, a7 +; RV32I-NEXT: sub a3, a3, a6 +; RV32I-NEXT: sub a5, a5, a1 ; RV32I-NEXT: sw a2, 0(a0) ; RV32I-NEXT: sw a3, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a6, 12(a0) +; RV32I-NEXT: sw a5, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -1576,26 +1576,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: mv a5, t0 ; RV32ZBB-NEXT: mv a4, a7 ; RV32ZBB-NEXT: .LBB17_19: -; RV32ZBB-NEXT: sltu a6, t3, a4 -; RV32ZBB-NEXT: sub a7, t4, a5 -; RV32ZBB-NEXT: sltu a5, a2, a1 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: mv a7, a5 +; RV32ZBB-NEXT: sltu a7, t3, a4 +; RV32ZBB-NEXT: sub a5, t4, a5 +; RV32ZBB-NEXT: sltu a6, a2, a1 +; RV32ZBB-NEXT: sub a5, a5, a7 +; RV32ZBB-NEXT: mv a7, a6 ; RV32ZBB-NEXT: beq t1, a3, .LBB17_21 ; RV32ZBB-NEXT: # %bb.20: ; RV32ZBB-NEXT: sltu a7, t1, a3 ; RV32ZBB-NEXT: .LBB17_21: ; RV32ZBB-NEXT: sub a4, t3, a4 -; RV32ZBB-NEXT: sltu t0, a4, a7 -; RV32ZBB-NEXT: sub a6, a6, t0 -; RV32ZBB-NEXT: sub a4, a4, a7 ; RV32ZBB-NEXT: sub a3, t1, a3 -; RV32ZBB-NEXT: sub a3, a3, a5 ; RV32ZBB-NEXT: sub a2, a2, a1 +; RV32ZBB-NEXT: sltu a1, a4, a7 +; RV32ZBB-NEXT: sub a4, a4, a7 +; RV32ZBB-NEXT: sub a3, a3, a6 +; RV32ZBB-NEXT: sub a5, a5, a1 ; RV32ZBB-NEXT: sw a2, 0(a0) ; 
RV32ZBB-NEXT: sw a3, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a6, 12(a0) +; RV32ZBB-NEXT: sw a5, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -1647,8 +1647,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: abd_cmp_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a0, 24 -; RV32I-NEXT: srai a2, a2, 24 ; RV32I-NEXT: slli a3, a1, 24 +; RV32I-NEXT: srai a2, a2, 24 ; RV32I-NEXT: srai a3, a3, 24 ; RV32I-NEXT: bge a3, a2, .LBB18_2 ; RV32I-NEXT: # %bb.1: @@ -1661,8 +1661,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { ; RV64I-LABEL: abd_cmp_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 56 -; RV64I-NEXT: srai a2, a2, 56 ; RV64I-NEXT: slli a3, a1, 56 +; RV64I-NEXT: srai a2, a2, 56 ; RV64I-NEXT: srai a3, a3, 56 ; RV64I-NEXT: bge a3, a2, .LBB18_2 ; RV64I-NEXT: # %bb.1: @@ -1694,8 +1694,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_cmp_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a1, 16 -; RV32I-NEXT: srai a2, a2, 16 ; RV32I-NEXT: slli a3, a0, 16 +; RV32I-NEXT: srai a2, a2, 16 ; RV32I-NEXT: srai a3, a3, 16 ; RV32I-NEXT: blt a3, a2, .LBB19_2 ; RV32I-NEXT: # %bb.1: @@ -1708,8 +1708,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_cmp_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a1, 48 -; RV64I-NEXT: srai a2, a2, 48 ; RV64I-NEXT: slli a3, a0, 48 +; RV64I-NEXT: srai a2, a2, 48 ; RV64I-NEXT: srai a3, a3, 48 ; RV64I-NEXT: blt a3, a2, .LBB19_2 ; RV64I-NEXT: # %bb.1: @@ -1898,30 +1898,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: sltu t1, a5, a6 ; RV32I-NEXT: sub a7, a7, t0 -; RV32I-NEXT: sub a7, a7, t1 -; RV32I-NEXT: sub a6, a5, a6 -; RV32I-NEXT: sltu a5, a6, t5 -; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: sub a6, a6, t5 +; RV32I-NEXT: sub a5, a5, a6 ; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, a4, t4 +; RV32I-NEXT: sub a6, a7, t1 +; RV32I-NEXT: sltu a7, a5, t5 +; RV32I-NEXT: sub a1, a5, t5 +; RV32I-NEXT: sub a5, a4, t4 +; RV32I-NEXT: sub a4, a6, a7 ; RV32I-NEXT: sub a2, a3, a2 ; RV32I-NEXT: j .LBB22_11 ; RV32I-NEXT: .LBB22_10: ; RV32I-NEXT: sub a7, t0, a7 -; RV32I-NEXT: sub a6, a6, a5 -; RV32I-NEXT: sub a5, a7, t1 -; RV32I-NEXT: sltu a7, a6, t3 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a5, a5, a7 -; RV32I-NEXT: sub a6, a6, t3 -; RV32I-NEXT: sub a1, a1, t2 +; RV32I-NEXT: sub a5, a6, a5 +; RV32I-NEXT: sub a4, a1, a4 +; RV32I-NEXT: sub a6, a7, t1 +; RV32I-NEXT: sltu a7, a5, t3 +; RV32I-NEXT: sub a1, a5, t3 +; RV32I-NEXT: sub a5, a4, t2 +; RV32I-NEXT: sub a4, a6, a7 ; RV32I-NEXT: sub a2, a2, a3 ; RV32I-NEXT: .LBB22_11: ; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a1, 4(a0) -; RV32I-NEXT: sw a6, 8(a0) -; RV32I-NEXT: sw a5, 12(a0) +; RV32I-NEXT: sw a5, 4(a0) +; RV32I-NEXT: sw a1, 8(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: @@ -1985,30 +1985,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: sltu t1, a5, a6 ; RV32ZBB-NEXT: sub a7, a7, t0 -; RV32ZBB-NEXT: sub a7, a7, t1 -; RV32ZBB-NEXT: sub a6, a5, a6 -; RV32ZBB-NEXT: sltu a5, a6, t5 -; RV32ZBB-NEXT: sub a5, a7, a5 -; RV32ZBB-NEXT: sub a6, a6, t5 +; RV32ZBB-NEXT: sub a5, a5, a6 ; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, a4, t4 +; RV32ZBB-NEXT: sub a6, a7, t1 +; RV32ZBB-NEXT: sltu a7, a5, t5 +; RV32ZBB-NEXT: sub a1, a5, t5 +; RV32ZBB-NEXT: sub a5, a4, t4 +; RV32ZBB-NEXT: sub a4, a6, a7 ; RV32ZBB-NEXT: sub a2, a3, a2 ; RV32ZBB-NEXT: j .LBB22_11 
; RV32ZBB-NEXT: .LBB22_10: ; RV32ZBB-NEXT: sub a7, t0, a7 -; RV32ZBB-NEXT: sub a6, a6, a5 -; RV32ZBB-NEXT: sub a5, a7, t1 -; RV32ZBB-NEXT: sltu a7, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a5, a5, a7 -; RV32ZBB-NEXT: sub a6, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a5, a6, a5 +; RV32ZBB-NEXT: sub a4, a1, a4 +; RV32ZBB-NEXT: sub a6, a7, t1 +; RV32ZBB-NEXT: sltu a7, a5, t3 +; RV32ZBB-NEXT: sub a1, a5, t3 +; RV32ZBB-NEXT: sub a5, a4, t2 +; RV32ZBB-NEXT: sub a4, a6, a7 ; RV32ZBB-NEXT: sub a2, a2, a3 ; RV32ZBB-NEXT: .LBB22_11: ; RV32ZBB-NEXT: sw a2, 0(a0) -; RV32ZBB-NEXT: sw a1, 4(a0) -; RV32ZBB-NEXT: sw a6, 8(a0) -; RV32ZBB-NEXT: sw a5, 12(a0) +; RV32ZBB-NEXT: sw a5, 4(a0) +; RV32ZBB-NEXT: sw a1, 8(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: @@ -2289,12 +2289,12 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: sltu a4, a0, a2 ; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: srai a2, a1, 31 ; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: sltu a3, a2, a0 ; RV32I-NEXT: xor a1, a1, a2 +; RV32I-NEXT: sltu a3, a2, a0 ; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a0, a2, a0 @@ -2312,12 +2312,12 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: sltu a4, a0, a2 ; RV32ZBB-NEXT: sub a1, a1, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 ; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: sub a1, a1, a4 ; RV32ZBB-NEXT: srai a2, a1, 31 ; RV32ZBB-NEXT: xor a0, a0, a2 -; RV32ZBB-NEXT: sltu a3, a2, a0 ; RV32ZBB-NEXT: xor a1, a1, a2 +; RV32ZBB-NEXT: sltu a3, a2, a0 ; RV32ZBB-NEXT: sub a1, a2, a1 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a0, a2, a0 @@ -2340,12 +2340,12 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: sltu a4, a0, a2 ; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: srai a2, a1, 31 ; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: sltu a3, a2, a0 ; RV32I-NEXT: xor a1, a1, a2 +; RV32I-NEXT: sltu a3, a2, a0 ; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a0, a2, a0 @@ -2363,12 +2363,12 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: sltu a4, a0, a2 ; RV32ZBB-NEXT: sub a1, a1, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 ; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: sub a1, a1, a4 ; RV32ZBB-NEXT: srai a2, a1, 31 ; RV32ZBB-NEXT: xor a0, a0, a2 -; RV32ZBB-NEXT: sltu a3, a2, a0 ; RV32ZBB-NEXT: xor a1, a1, a2 +; RV32ZBB-NEXT: sltu a3, a2, a0 ; RV32ZBB-NEXT: sub a1, a2, a1 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a0, a2, a0 @@ -2392,64 +2392,64 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a3, 0(a2) ; RV32I-NEXT: lw a4, 4(a2) ; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a7, 12(a2) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) +; RV32I-NEXT: lw a6, 12(a2) +; RV32I-NEXT: lw t0, 8(a1) +; RV32I-NEXT: lw t1, 12(a1) ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a6, a5 -; RV32I-NEXT: sub t0, t0, a7 -; RV32I-NEXT: sltu a7, a2, a3 -; RV32I-NEXT: sub t1, t0, t1 -; RV32I-NEXT: mv t0, a7 -; RV32I-NEXT: beq a1, a4, .LBB31_2 +; RV32I-NEXT: lw a7, 4(a1) +; RV32I-NEXT: sltu a1, t0, a5 +; RV32I-NEXT: sub t1, t1, a6 +; RV32I-NEXT: sltu a6, a2, a3 +; RV32I-NEXT: sub a1, t1, a1 +; 
RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: beq a7, a4, .LBB31_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t0, a1, a4 +; RV32I-NEXT: sltu t1, a7, a4 ; RV32I-NEXT: .LBB31_2: -; RV32I-NEXT: sub a5, a6, a5 -; RV32I-NEXT: sltu a6, a5, t0 -; RV32I-NEXT: sub a6, t1, a6 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub t1, a1, a7 -; RV32I-NEXT: sub a4, a5, t0 +; RV32I-NEXT: sub a5, t0, a5 +; RV32I-NEXT: sub a4, a7, a4 ; RV32I-NEXT: sub a3, a2, a3 -; RV32I-NEXT: srai a1, a6, 31 +; RV32I-NEXT: sltu a2, a5, t1 +; RV32I-NEXT: sub t0, a4, a6 +; RV32I-NEXT: sub a4, a5, t1 +; RV32I-NEXT: sub a5, a1, a2 +; RV32I-NEXT: srai a1, a5, 31 ; RV32I-NEXT: xor a2, a4, a1 -; RV32I-NEXT: sltu a4, a1, a2 -; RV32I-NEXT: xor a5, a6, a1 -; RV32I-NEXT: sub a5, a1, a5 -; RV32I-NEXT: sub a4, a5, a4 -; RV32I-NEXT: xor a3, a3, a1 -; RV32I-NEXT: sltu a5, a1, a3 -; RV32I-NEXT: xor a6, t1, a1 -; RV32I-NEXT: mv a7, a5 -; RV32I-NEXT: beqz t1, .LBB31_4 +; RV32I-NEXT: xor a5, a5, a1 +; RV32I-NEXT: xor a4, a3, a1 +; RV32I-NEXT: sltu a3, a1, a2 +; RV32I-NEXT: sub a6, a1, a5 +; RV32I-NEXT: sltu a5, a1, a4 +; RV32I-NEXT: sub a3, a6, a3 +; RV32I-NEXT: xor a7, t0, a1 +; RV32I-NEXT: mv a6, a5 +; RV32I-NEXT: beqz t0, .LBB31_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu a7, a1, a6 +; RV32I-NEXT: sltu a6, a1, a7 ; RV32I-NEXT: .LBB31_4: ; RV32I-NEXT: sub a2, a1, a2 -; RV32I-NEXT: sltu t0, a2, a7 -; RV32I-NEXT: sub a4, a4, t0 -; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a1, a6 -; RV32I-NEXT: sub a5, a6, a5 -; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a7, a1, a7 +; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: sltu a4, a2, a6 +; RV32I-NEXT: sub a2, a2, a6 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: sub a3, a3, a4 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) ; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_subnsw_i128: ; RV64I: # %bb.0: ; RV64I-NEXT: sltu a4, a0, a2 ; RV64I-NEXT: sub a1, a1, a3 -; RV64I-NEXT: sub a1, a1, a4 ; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: sub a1, a1, a4 ; RV64I-NEXT: srai a2, a1, 63 ; RV64I-NEXT: xor a0, a0, a2 -; RV64I-NEXT: sltu a3, a2, a0 ; RV64I-NEXT: xor a1, a1, a2 +; RV64I-NEXT: sltu a3, a2, a0 ; RV64I-NEXT: sub a1, a2, a1 ; RV64I-NEXT: sub a1, a1, a3 ; RV64I-NEXT: sub a0, a2, a0 @@ -2460,64 +2460,64 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a3, 0(a2) ; RV32ZBB-NEXT: lw a4, 4(a2) ; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a7, 12(a2) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw a6, 12(a2) +; RV32ZBB-NEXT: lw t0, 8(a1) +; RV32ZBB-NEXT: lw t1, 12(a1) ; RV32ZBB-NEXT: lw a2, 0(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a6, a5 -; RV32ZBB-NEXT: sub t0, t0, a7 -; RV32ZBB-NEXT: sltu a7, a2, a3 -; RV32ZBB-NEXT: sub t1, t0, t1 -; RV32ZBB-NEXT: mv t0, a7 -; RV32ZBB-NEXT: beq a1, a4, .LBB31_2 +; RV32ZBB-NEXT: lw a7, 4(a1) +; RV32ZBB-NEXT: sltu a1, t0, a5 +; RV32ZBB-NEXT: sub t1, t1, a6 +; RV32ZBB-NEXT: sltu a6, a2, a3 +; RV32ZBB-NEXT: sub a1, t1, a1 +; RV32ZBB-NEXT: mv t1, a6 +; RV32ZBB-NEXT: beq a7, a4, .LBB31_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t0, a1, a4 +; RV32ZBB-NEXT: sltu t1, a7, a4 ; RV32ZBB-NEXT: .LBB31_2: -; RV32ZBB-NEXT: sub a5, a6, a5 -; RV32ZBB-NEXT: sltu a6, a5, t0 -; RV32ZBB-NEXT: sub a6, t1, a6 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub t1, a1, a7 -; RV32ZBB-NEXT: sub a4, a5, t0 +; RV32ZBB-NEXT: sub a5, t0, a5 +; RV32ZBB-NEXT: sub a4, a7, a4 ; RV32ZBB-NEXT: sub a3, a2, a3 -; RV32ZBB-NEXT: srai a1, a6, 31 
+; RV32ZBB-NEXT: sltu a2, a5, t1 +; RV32ZBB-NEXT: sub t0, a4, a6 +; RV32ZBB-NEXT: sub a4, a5, t1 +; RV32ZBB-NEXT: sub a5, a1, a2 +; RV32ZBB-NEXT: srai a1, a5, 31 ; RV32ZBB-NEXT: xor a2, a4, a1 -; RV32ZBB-NEXT: sltu a4, a1, a2 -; RV32ZBB-NEXT: xor a5, a6, a1 -; RV32ZBB-NEXT: sub a5, a1, a5 -; RV32ZBB-NEXT: sub a4, a5, a4 -; RV32ZBB-NEXT: xor a3, a3, a1 -; RV32ZBB-NEXT: sltu a5, a1, a3 -; RV32ZBB-NEXT: xor a6, t1, a1 -; RV32ZBB-NEXT: mv a7, a5 -; RV32ZBB-NEXT: beqz t1, .LBB31_4 +; RV32ZBB-NEXT: xor a5, a5, a1 +; RV32ZBB-NEXT: xor a4, a3, a1 +; RV32ZBB-NEXT: sltu a3, a1, a2 +; RV32ZBB-NEXT: sub a6, a1, a5 +; RV32ZBB-NEXT: sltu a5, a1, a4 +; RV32ZBB-NEXT: sub a3, a6, a3 +; RV32ZBB-NEXT: xor a7, t0, a1 +; RV32ZBB-NEXT: mv a6, a5 +; RV32ZBB-NEXT: beqz t0, .LBB31_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu a7, a1, a6 +; RV32ZBB-NEXT: sltu a6, a1, a7 ; RV32ZBB-NEXT: .LBB31_4: ; RV32ZBB-NEXT: sub a2, a1, a2 -; RV32ZBB-NEXT: sltu t0, a2, a7 -; RV32ZBB-NEXT: sub a4, a4, t0 -; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a1, a6 -; RV32ZBB-NEXT: sub a5, a6, a5 -; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a7, a1, a7 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sltu a4, a2, a6 +; RV32ZBB-NEXT: sub a2, a2, a6 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: sub a3, a3, a4 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) ; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: sw a3, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_subnsw_i128: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sltu a4, a0, a2 ; RV64ZBB-NEXT: sub a1, a1, a3 -; RV64ZBB-NEXT: sub a1, a1, a4 ; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: sub a1, a1, a4 ; RV64ZBB-NEXT: srai a2, a1, 63 ; RV64ZBB-NEXT: xor a0, a0, a2 -; RV64ZBB-NEXT: sltu a3, a2, a0 ; RV64ZBB-NEXT: xor a1, a1, a2 +; RV64ZBB-NEXT: sltu a3, a2, a0 ; RV64ZBB-NEXT: sub a1, a2, a1 ; RV64ZBB-NEXT: sub a1, a1, a3 ; RV64ZBB-NEXT: sub a0, a2, a0 @@ -2534,64 +2534,64 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a3, 0(a2) ; RV32I-NEXT: lw a4, 4(a2) ; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a7, 12(a2) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) +; RV32I-NEXT: lw a6, 12(a2) +; RV32I-NEXT: lw t0, 8(a1) +; RV32I-NEXT: lw t1, 12(a1) ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a6, a5 -; RV32I-NEXT: sub t0, t0, a7 -; RV32I-NEXT: sltu a7, a2, a3 -; RV32I-NEXT: sub t1, t0, t1 -; RV32I-NEXT: mv t0, a7 -; RV32I-NEXT: beq a1, a4, .LBB32_2 +; RV32I-NEXT: lw a7, 4(a1) +; RV32I-NEXT: sltu a1, t0, a5 +; RV32I-NEXT: sub t1, t1, a6 +; RV32I-NEXT: sltu a6, a2, a3 +; RV32I-NEXT: sub a1, t1, a1 +; RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: beq a7, a4, .LBB32_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t0, a1, a4 +; RV32I-NEXT: sltu t1, a7, a4 ; RV32I-NEXT: .LBB32_2: -; RV32I-NEXT: sub a5, a6, a5 -; RV32I-NEXT: sltu a6, a5, t0 -; RV32I-NEXT: sub a6, t1, a6 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub t1, a1, a7 -; RV32I-NEXT: sub a4, a5, t0 +; RV32I-NEXT: sub a5, t0, a5 +; RV32I-NEXT: sub a4, a7, a4 ; RV32I-NEXT: sub a3, a2, a3 -; RV32I-NEXT: srai a1, a6, 31 +; RV32I-NEXT: sltu a2, a5, t1 +; RV32I-NEXT: sub t0, a4, a6 +; RV32I-NEXT: sub a4, a5, t1 +; RV32I-NEXT: sub a5, a1, a2 +; RV32I-NEXT: srai a1, a5, 31 ; RV32I-NEXT: xor a2, a4, a1 -; RV32I-NEXT: sltu a4, a1, a2 -; RV32I-NEXT: xor a5, a6, a1 -; RV32I-NEXT: sub a5, a1, a5 -; RV32I-NEXT: sub a4, a5, a4 -; RV32I-NEXT: xor a3, a3, a1 -; RV32I-NEXT: sltu a5, a1, a3 -; RV32I-NEXT: xor a6, t1, a1 -; RV32I-NEXT: mv a7, 
a5 -; RV32I-NEXT: beqz t1, .LBB32_4 +; RV32I-NEXT: xor a5, a5, a1 +; RV32I-NEXT: xor a4, a3, a1 +; RV32I-NEXT: sltu a3, a1, a2 +; RV32I-NEXT: sub a6, a1, a5 +; RV32I-NEXT: sltu a5, a1, a4 +; RV32I-NEXT: sub a3, a6, a3 +; RV32I-NEXT: xor a7, t0, a1 +; RV32I-NEXT: mv a6, a5 +; RV32I-NEXT: beqz t0, .LBB32_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu a7, a1, a6 +; RV32I-NEXT: sltu a6, a1, a7 ; RV32I-NEXT: .LBB32_4: ; RV32I-NEXT: sub a2, a1, a2 -; RV32I-NEXT: sltu t0, a2, a7 -; RV32I-NEXT: sub a4, a4, t0 -; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a1, a6 -; RV32I-NEXT: sub a5, a6, a5 -; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a7, a1, a7 +; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: sltu a4, a2, a6 +; RV32I-NEXT: sub a2, a2, a6 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: sub a3, a3, a4 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) ; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_subnsw_i128_undef: ; RV64I: # %bb.0: ; RV64I-NEXT: sltu a4, a0, a2 ; RV64I-NEXT: sub a1, a1, a3 -; RV64I-NEXT: sub a1, a1, a4 ; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: sub a1, a1, a4 ; RV64I-NEXT: srai a2, a1, 63 ; RV64I-NEXT: xor a0, a0, a2 -; RV64I-NEXT: sltu a3, a2, a0 ; RV64I-NEXT: xor a1, a1, a2 +; RV64I-NEXT: sltu a3, a2, a0 ; RV64I-NEXT: sub a1, a2, a1 ; RV64I-NEXT: sub a1, a1, a3 ; RV64I-NEXT: sub a0, a2, a0 @@ -2602,64 +2602,64 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a3, 0(a2) ; RV32ZBB-NEXT: lw a4, 4(a2) ; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a7, 12(a2) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw a6, 12(a2) +; RV32ZBB-NEXT: lw t0, 8(a1) +; RV32ZBB-NEXT: lw t1, 12(a1) ; RV32ZBB-NEXT: lw a2, 0(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a6, a5 -; RV32ZBB-NEXT: sub t0, t0, a7 -; RV32ZBB-NEXT: sltu a7, a2, a3 -; RV32ZBB-NEXT: sub t1, t0, t1 -; RV32ZBB-NEXT: mv t0, a7 -; RV32ZBB-NEXT: beq a1, a4, .LBB32_2 +; RV32ZBB-NEXT: lw a7, 4(a1) +; RV32ZBB-NEXT: sltu a1, t0, a5 +; RV32ZBB-NEXT: sub t1, t1, a6 +; RV32ZBB-NEXT: sltu a6, a2, a3 +; RV32ZBB-NEXT: sub a1, t1, a1 +; RV32ZBB-NEXT: mv t1, a6 +; RV32ZBB-NEXT: beq a7, a4, .LBB32_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t0, a1, a4 +; RV32ZBB-NEXT: sltu t1, a7, a4 ; RV32ZBB-NEXT: .LBB32_2: -; RV32ZBB-NEXT: sub a5, a6, a5 -; RV32ZBB-NEXT: sltu a6, a5, t0 -; RV32ZBB-NEXT: sub a6, t1, a6 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub t1, a1, a7 -; RV32ZBB-NEXT: sub a4, a5, t0 +; RV32ZBB-NEXT: sub a5, t0, a5 +; RV32ZBB-NEXT: sub a4, a7, a4 ; RV32ZBB-NEXT: sub a3, a2, a3 -; RV32ZBB-NEXT: srai a1, a6, 31 +; RV32ZBB-NEXT: sltu a2, a5, t1 +; RV32ZBB-NEXT: sub t0, a4, a6 +; RV32ZBB-NEXT: sub a4, a5, t1 +; RV32ZBB-NEXT: sub a5, a1, a2 +; RV32ZBB-NEXT: srai a1, a5, 31 ; RV32ZBB-NEXT: xor a2, a4, a1 -; RV32ZBB-NEXT: sltu a4, a1, a2 -; RV32ZBB-NEXT: xor a5, a6, a1 -; RV32ZBB-NEXT: sub a5, a1, a5 -; RV32ZBB-NEXT: sub a4, a5, a4 -; RV32ZBB-NEXT: xor a3, a3, a1 -; RV32ZBB-NEXT: sltu a5, a1, a3 -; RV32ZBB-NEXT: xor a6, t1, a1 -; RV32ZBB-NEXT: mv a7, a5 -; RV32ZBB-NEXT: beqz t1, .LBB32_4 +; RV32ZBB-NEXT: xor a5, a5, a1 +; RV32ZBB-NEXT: xor a4, a3, a1 +; RV32ZBB-NEXT: sltu a3, a1, a2 +; RV32ZBB-NEXT: sub a6, a1, a5 +; RV32ZBB-NEXT: sltu a5, a1, a4 +; RV32ZBB-NEXT: sub a3, a6, a3 +; RV32ZBB-NEXT: xor a7, t0, a1 +; RV32ZBB-NEXT: mv a6, a5 +; RV32ZBB-NEXT: beqz t0, .LBB32_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu a7, a1, a6 +; RV32ZBB-NEXT: sltu a6, a1, a7 ; 
RV32ZBB-NEXT: .LBB32_4: ; RV32ZBB-NEXT: sub a2, a1, a2 -; RV32ZBB-NEXT: sltu t0, a2, a7 -; RV32ZBB-NEXT: sub a4, a4, t0 -; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a1, a6 -; RV32ZBB-NEXT: sub a5, a6, a5 -; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a7, a1, a7 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sltu a4, a2, a6 +; RV32ZBB-NEXT: sub a2, a2, a6 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: sub a3, a3, a4 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) ; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: sw a3, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_subnsw_i128_undef: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sltu a4, a0, a2 ; RV64ZBB-NEXT: sub a1, a1, a3 -; RV64ZBB-NEXT: sub a1, a1, a4 ; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: sub a1, a1, a4 ; RV64ZBB-NEXT: srai a2, a1, 63 ; RV64ZBB-NEXT: xor a0, a0, a2 -; RV64ZBB-NEXT: sltu a3, a2, a0 ; RV64ZBB-NEXT: xor a1, a1, a2 +; RV64ZBB-NEXT: sltu a3, a2, a0 ; RV64ZBB-NEXT: sub a1, a2, a1 ; RV64ZBB-NEXT: sub a1, a1, a3 ; RV64ZBB-NEXT: sub a0, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll index 8208eafbc205c..56e6dacff9748 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -12,8 +12,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: abd_ext_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -24,8 +24,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind { ; RV64I-LABEL: abd_ext_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 56 -; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -53,8 +53,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_ext_i8_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -65,8 +65,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_ext_i8_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -94,8 +94,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: abd_ext_i8_undef: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -106,8 +106,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind { ; RV64I-LABEL: abd_ext_i8_undef: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 56 -; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -135,8 +135,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_ext_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: sub a0, a0, a1 ; 
RV32I-NEXT: srai a1, a0, 31 @@ -147,8 +147,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_ext_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -224,8 +224,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_ext_i16_undef: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -236,8 +236,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_ext_i16_undef: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -579,30 +579,30 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub t0, t1, t0 ; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sltu a7, a6, t4 -; RV32I-NEXT: sub a2, a2, a7 ; RV32I-NEXT: sub a3, a5, a3 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a1, a1, t2 +; RV32I-NEXT: sub a4, t0, a2 +; RV32I-NEXT: sltu a5, a6, t4 +; RV32I-NEXT: sub a2, a1, t2 +; RV32I-NEXT: sub a1, a4, a5 ; RV32I-NEXT: sub a4, a6, t4 ; RV32I-NEXT: j .LBB11_13 ; RV32I-NEXT: .LBB11_12: ; RV32I-NEXT: sltu a2, a6, a7 ; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t6 -; RV32I-NEXT: sub a2, a2, a7 ; RV32I-NEXT: sub a3, a3, a5 ; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, a4, t5 +; RV32I-NEXT: sub a1, t0, a2 +; RV32I-NEXT: sltu a5, a6, t6 +; RV32I-NEXT: sub a2, a4, t5 +; RV32I-NEXT: sub a1, a1, a5 ; RV32I-NEXT: sub a4, a6, t6 ; RV32I-NEXT: .LBB11_13: ; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a2, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -675,30 +675,30 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub t0, t1, t0 ; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sltu a7, a6, t4 -; RV32ZBB-NEXT: sub a2, a2, a7 ; RV32ZBB-NEXT: sub a3, a5, a3 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a4, t0, a2 +; RV32ZBB-NEXT: sltu a5, a6, t4 +; RV32ZBB-NEXT: sub a2, a1, t2 +; RV32ZBB-NEXT: sub a1, a4, a5 ; RV32ZBB-NEXT: sub a4, a6, t4 ; RV32ZBB-NEXT: j .LBB11_13 ; RV32ZBB-NEXT: .LBB11_12: ; RV32ZBB-NEXT: sltu a2, a6, a7 ; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t6 -; RV32ZBB-NEXT: sub a2, a2, a7 ; RV32ZBB-NEXT: sub a3, a3, a5 ; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, a4, t5 +; RV32ZBB-NEXT: sub a1, t0, a2 +; RV32ZBB-NEXT: sltu a5, a6, t6 +; RV32ZBB-NEXT: sub a2, a4, t5 +; RV32ZBB-NEXT: sub a1, a1, a5 ; RV32ZBB-NEXT: sub a4, a6, t6 ; RV32ZBB-NEXT: .LBB11_13: ; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a1, 4(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded 
Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -779,30 +779,30 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub t0, t1, t0 ; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sltu a7, a6, t4 -; RV32I-NEXT: sub a2, a2, a7 ; RV32I-NEXT: sub a3, a5, a3 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a1, a1, t2 +; RV32I-NEXT: sub a4, t0, a2 +; RV32I-NEXT: sltu a5, a6, t4 +; RV32I-NEXT: sub a2, a1, t2 +; RV32I-NEXT: sub a1, a4, a5 ; RV32I-NEXT: sub a4, a6, t4 ; RV32I-NEXT: j .LBB12_13 ; RV32I-NEXT: .LBB12_12: ; RV32I-NEXT: sltu a2, a6, a7 ; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t6 -; RV32I-NEXT: sub a2, a2, a7 ; RV32I-NEXT: sub a3, a3, a5 ; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, a4, t5 +; RV32I-NEXT: sub a1, t0, a2 +; RV32I-NEXT: sltu a5, a6, t6 +; RV32I-NEXT: sub a2, a4, t5 +; RV32I-NEXT: sub a1, a1, a5 ; RV32I-NEXT: sub a4, a6, t6 ; RV32I-NEXT: .LBB12_13: ; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a2, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -875,30 +875,30 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub t0, t1, t0 ; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sltu a7, a6, t4 -; RV32ZBB-NEXT: sub a2, a2, a7 ; RV32ZBB-NEXT: sub a3, a5, a3 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a4, t0, a2 +; RV32ZBB-NEXT: sltu a5, a6, t4 +; RV32ZBB-NEXT: sub a2, a1, t2 +; RV32ZBB-NEXT: sub a1, a4, a5 ; RV32ZBB-NEXT: sub a4, a6, t4 ; RV32ZBB-NEXT: j .LBB12_13 ; RV32ZBB-NEXT: .LBB12_12: ; RV32ZBB-NEXT: sltu a2, a6, a7 ; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t6 -; RV32ZBB-NEXT: sub a2, a2, a7 ; RV32ZBB-NEXT: sub a3, a3, a5 ; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, a4, t5 +; RV32ZBB-NEXT: sub a1, t0, a2 +; RV32ZBB-NEXT: sltu a5, a6, t6 +; RV32ZBB-NEXT: sub a2, a4, t5 +; RV32ZBB-NEXT: sub a1, a1, a5 ; RV32ZBB-NEXT: sub a4, a6, t6 ; RV32ZBB-NEXT: .LBB12_13: ; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a1, 4(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -939,8 +939,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: abd_minmax_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -951,8 +951,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind { ; RV64I-LABEL: abd_minmax_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 56 -; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -978,8 +978,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_minmax_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; 
RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -990,8 +990,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_minmax_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -1168,30 +1168,30 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub t0, t1, t0 ; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sltu a7, a6, t4 -; RV32I-NEXT: sub a2, a2, a7 ; RV32I-NEXT: sub a3, a5, a3 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a1, a1, t2 +; RV32I-NEXT: sub a4, t0, a2 +; RV32I-NEXT: sltu a5, a6, t4 +; RV32I-NEXT: sub a2, a1, t2 +; RV32I-NEXT: sub a1, a4, a5 ; RV32I-NEXT: sub a4, a6, t4 ; RV32I-NEXT: j .LBB17_13 ; RV32I-NEXT: .LBB17_12: ; RV32I-NEXT: sltu a2, a6, a7 ; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t6 -; RV32I-NEXT: sub a2, a2, a7 ; RV32I-NEXT: sub a3, a3, a5 ; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, a4, t5 +; RV32I-NEXT: sub a1, t0, a2 +; RV32I-NEXT: sltu a5, a6, t6 +; RV32I-NEXT: sub a2, a4, t5 +; RV32I-NEXT: sub a1, a1, a5 ; RV32I-NEXT: sub a4, a6, t6 ; RV32I-NEXT: .LBB17_13: ; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a2, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -1264,30 +1264,30 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub t0, t1, t0 ; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sltu a7, a6, t4 -; RV32ZBB-NEXT: sub a2, a2, a7 ; RV32ZBB-NEXT: sub a3, a5, a3 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a4, t0, a2 +; RV32ZBB-NEXT: sltu a5, a6, t4 +; RV32ZBB-NEXT: sub a2, a1, t2 +; RV32ZBB-NEXT: sub a1, a4, a5 ; RV32ZBB-NEXT: sub a4, a6, t4 ; RV32ZBB-NEXT: j .LBB17_13 ; RV32ZBB-NEXT: .LBB17_12: ; RV32ZBB-NEXT: sltu a2, a6, a7 ; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t6 -; RV32ZBB-NEXT: sub a2, a2, a7 ; RV32ZBB-NEXT: sub a3, a3, a5 ; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, a4, t5 +; RV32ZBB-NEXT: sub a1, t0, a2 +; RV32ZBB-NEXT: sltu a5, a6, t6 +; RV32ZBB-NEXT: sub a2, a4, t5 +; RV32ZBB-NEXT: sub a1, a1, a5 ; RV32ZBB-NEXT: sub a4, a6, t6 ; RV32ZBB-NEXT: .LBB17_13: ; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a1, 4(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -1326,8 +1326,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: abd_cmp_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -1338,8 +1338,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { ; RV64I-LABEL: abd_cmp_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 56 -; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: 
srai a1, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -1366,8 +1366,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_cmp_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -1378,8 +1378,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_cmp_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -1559,30 +1559,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub t0, t1, t0 ; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sltu a7, a6, t4 -; RV32I-NEXT: sub a2, a2, a7 ; RV32I-NEXT: sub a3, a5, a3 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a1, a1, t2 +; RV32I-NEXT: sub a4, t0, a2 +; RV32I-NEXT: sltu a5, a6, t4 +; RV32I-NEXT: sub a2, a1, t2 +; RV32I-NEXT: sub a1, a4, a5 ; RV32I-NEXT: sub a4, a6, t4 ; RV32I-NEXT: j .LBB22_13 ; RV32I-NEXT: .LBB22_12: ; RV32I-NEXT: sltu a2, a6, a7 ; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t6 -; RV32I-NEXT: sub a2, a2, a7 ; RV32I-NEXT: sub a3, a3, a5 ; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, a4, t5 +; RV32I-NEXT: sub a1, t0, a2 +; RV32I-NEXT: sltu a5, a6, t6 +; RV32I-NEXT: sub a2, a4, t5 +; RV32I-NEXT: sub a1, a1, a5 ; RV32I-NEXT: sub a4, a6, t6 ; RV32I-NEXT: .LBB22_13: ; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a2, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -1655,30 +1655,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub t0, t1, t0 ; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sltu a7, a6, t4 -; RV32ZBB-NEXT: sub a2, a2, a7 ; RV32ZBB-NEXT: sub a3, a5, a3 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a4, t0, a2 +; RV32ZBB-NEXT: sltu a5, a6, t4 +; RV32ZBB-NEXT: sub a2, a1, t2 +; RV32ZBB-NEXT: sub a1, a4, a5 ; RV32ZBB-NEXT: sub a4, a6, t4 ; RV32ZBB-NEXT: j .LBB22_13 ; RV32ZBB-NEXT: .LBB22_12: ; RV32ZBB-NEXT: sltu a2, a6, a7 ; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t6 -; RV32ZBB-NEXT: sub a2, a2, a7 ; RV32ZBB-NEXT: sub a3, a3, a5 ; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, a4, t5 +; RV32ZBB-NEXT: sub a1, t0, a2 +; RV32ZBB-NEXT: sltu a5, a6, t6 +; RV32ZBB-NEXT: sub a2, a4, t5 +; RV32ZBB-NEXT: sub a1, a1, a5 ; RV32ZBB-NEXT: sub a4, a6, t6 ; RV32ZBB-NEXT: .LBB22_13: ; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a1, 4(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -2045,47 +2045,47 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a4, 0(a2) ; RV32I-NEXT: lw a3, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) -; RV32I-NEXT: lw t0, 
12(a2) -; RV32I-NEXT: lw a2, 8(a1) -; RV32I-NEXT: lw t1, 12(a1) -; RV32I-NEXT: lw a5, 0(a1) -; RV32I-NEXT: lw a7, 4(a1) -; RV32I-NEXT: sltu a1, a2, a6 -; RV32I-NEXT: sub t1, t1, t0 -; RV32I-NEXT: sltu t0, a5, a4 -; RV32I-NEXT: sub a1, t1, a1 -; RV32I-NEXT: mv t1, t0 -; RV32I-NEXT: beq a7, a3, .LBB31_2 +; RV32I-NEXT: lw a5, 8(a2) +; RV32I-NEXT: lw a6, 12(a2) +; RV32I-NEXT: lw a7, 8(a1) +; RV32I-NEXT: lw t0, 12(a1) +; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: sltu t1, a7, a5 +; RV32I-NEXT: sub t0, t0, a6 +; RV32I-NEXT: sltu a6, a2, a4 +; RV32I-NEXT: sub t0, t0, t1 +; RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: beq a1, a3, .LBB31_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a7, a3 +; RV32I-NEXT: sltu t1, a1, a3 ; RV32I-NEXT: .LBB31_2: -; RV32I-NEXT: sub a2, a2, a6 -; RV32I-NEXT: sltu a6, a2, t1 -; RV32I-NEXT: sub a1, a1, a6 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sub a3, a7, a3 -; RV32I-NEXT: sub a3, a3, t0 -; RV32I-NEXT: sub a4, a5, a4 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: sub a3, a1, a3 +; RV32I-NEXT: sltu a1, a5, t1 +; RV32I-NEXT: sub a5, a5, t1 +; RV32I-NEXT: sub a1, t0, a1 +; RV32I-NEXT: sub a3, a3, a6 +; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: bgez a1, .LBB31_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: snez a5, a3 -; RV32I-NEXT: snez a6, a4 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: neg a7, a2 -; RV32I-NEXT: sltu t0, a7, a5 -; RV32I-NEXT: snez a2, a2 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a1, a1, t0 -; RV32I-NEXT: sub a2, a7, a5 -; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: snez a4, a3 +; RV32I-NEXT: snez a6, a2 +; RV32I-NEXT: neg a7, a5 +; RV32I-NEXT: snez a5, a5 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: add a1, a1, a5 ; RV32I-NEXT: add a3, a3, a6 +; RV32I-NEXT: sltu a6, a7, a4 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a5, a7, a4 +; RV32I-NEXT: sub a1, a1, a6 ; RV32I-NEXT: neg a3, a3 ; RV32I-NEXT: .LBB31_4: -; RV32I-NEXT: sw a4, 0(a0) +; RV32I-NEXT: sw a2, 0(a0) ; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) +; RV32I-NEXT: sw a5, 8(a0) ; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; @@ -2108,47 +2108,47 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a4, 0(a2) ; RV32ZBB-NEXT: lw a3, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) -; RV32ZBB-NEXT: lw t0, 12(a2) -; RV32ZBB-NEXT: lw a2, 8(a1) -; RV32ZBB-NEXT: lw t1, 12(a1) -; RV32ZBB-NEXT: lw a5, 0(a1) -; RV32ZBB-NEXT: lw a7, 4(a1) -; RV32ZBB-NEXT: sltu a1, a2, a6 -; RV32ZBB-NEXT: sub t1, t1, t0 -; RV32ZBB-NEXT: sltu t0, a5, a4 -; RV32ZBB-NEXT: sub a1, t1, a1 -; RV32ZBB-NEXT: mv t1, t0 -; RV32ZBB-NEXT: beq a7, a3, .LBB31_2 +; RV32ZBB-NEXT: lw a5, 8(a2) +; RV32ZBB-NEXT: lw a6, 12(a2) +; RV32ZBB-NEXT: lw a7, 8(a1) +; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw a2, 0(a1) +; RV32ZBB-NEXT: lw a1, 4(a1) +; RV32ZBB-NEXT: sltu t1, a7, a5 +; RV32ZBB-NEXT: sub t0, t0, a6 +; RV32ZBB-NEXT: sltu a6, a2, a4 +; RV32ZBB-NEXT: sub t0, t0, t1 +; RV32ZBB-NEXT: mv t1, a6 +; RV32ZBB-NEXT: beq a1, a3, .LBB31_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a7, a3 +; RV32ZBB-NEXT: sltu t1, a1, a3 ; RV32ZBB-NEXT: .LBB31_2: -; RV32ZBB-NEXT: sub a2, a2, a6 -; RV32ZBB-NEXT: sltu a6, a2, t1 -; RV32ZBB-NEXT: sub a1, a1, a6 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: sub a3, a7, a3 -; RV32ZBB-NEXT: sub a3, a3, t0 -; RV32ZBB-NEXT: sub a4, a5, a4 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: sub a3, a1, a3 +; RV32ZBB-NEXT: sltu a1, a5, t1 +; RV32ZBB-NEXT: sub a5, a5, t1 +; RV32ZBB-NEXT: sub a1, t0, a1 +; 
RV32ZBB-NEXT: sub a3, a3, a6 +; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: bgez a1, .LBB31_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: snez a5, a3 -; RV32ZBB-NEXT: snez a6, a4 -; RV32ZBB-NEXT: or a5, a6, a5 -; RV32ZBB-NEXT: neg a7, a2 -; RV32ZBB-NEXT: sltu t0, a7, a5 -; RV32ZBB-NEXT: snez a2, a2 -; RV32ZBB-NEXT: add a1, a1, a2 -; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a1, a1, t0 -; RV32ZBB-NEXT: sub a2, a7, a5 -; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: snez a4, a3 +; RV32ZBB-NEXT: snez a6, a2 +; RV32ZBB-NEXT: neg a7, a5 +; RV32ZBB-NEXT: snez a5, a5 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: or a4, a6, a4 +; RV32ZBB-NEXT: add a1, a1, a5 ; RV32ZBB-NEXT: add a3, a3, a6 +; RV32ZBB-NEXT: sltu a6, a7, a4 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sub a5, a7, a4 +; RV32ZBB-NEXT: sub a1, a1, a6 ; RV32ZBB-NEXT: neg a3, a3 ; RV32ZBB-NEXT: .LBB31_4: -; RV32ZBB-NEXT: sw a4, 0(a0) +; RV32ZBB-NEXT: sw a2, 0(a0) ; RV32ZBB-NEXT: sw a3, 4(a0) -; RV32ZBB-NEXT: sw a2, 8(a0) +; RV32ZBB-NEXT: sw a5, 8(a0) ; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: ret ; @@ -2176,47 +2176,47 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a4, 0(a2) ; RV32I-NEXT: lw a3, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) -; RV32I-NEXT: lw t0, 12(a2) -; RV32I-NEXT: lw a2, 8(a1) -; RV32I-NEXT: lw t1, 12(a1) -; RV32I-NEXT: lw a5, 0(a1) -; RV32I-NEXT: lw a7, 4(a1) -; RV32I-NEXT: sltu a1, a2, a6 -; RV32I-NEXT: sub t1, t1, t0 -; RV32I-NEXT: sltu t0, a5, a4 -; RV32I-NEXT: sub a1, t1, a1 -; RV32I-NEXT: mv t1, t0 -; RV32I-NEXT: beq a7, a3, .LBB32_2 +; RV32I-NEXT: lw a5, 8(a2) +; RV32I-NEXT: lw a6, 12(a2) +; RV32I-NEXT: lw a7, 8(a1) +; RV32I-NEXT: lw t0, 12(a1) +; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: sltu t1, a7, a5 +; RV32I-NEXT: sub t0, t0, a6 +; RV32I-NEXT: sltu a6, a2, a4 +; RV32I-NEXT: sub t0, t0, t1 +; RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: beq a1, a3, .LBB32_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a7, a3 +; RV32I-NEXT: sltu t1, a1, a3 ; RV32I-NEXT: .LBB32_2: -; RV32I-NEXT: sub a2, a2, a6 -; RV32I-NEXT: sltu a6, a2, t1 -; RV32I-NEXT: sub a1, a1, a6 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sub a3, a7, a3 -; RV32I-NEXT: sub a3, a3, t0 -; RV32I-NEXT: sub a4, a5, a4 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: sub a3, a1, a3 +; RV32I-NEXT: sltu a1, a5, t1 +; RV32I-NEXT: sub a5, a5, t1 +; RV32I-NEXT: sub a1, t0, a1 +; RV32I-NEXT: sub a3, a3, a6 +; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: bgez a1, .LBB32_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: snez a5, a3 -; RV32I-NEXT: snez a6, a4 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: neg a7, a2 -; RV32I-NEXT: sltu t0, a7, a5 -; RV32I-NEXT: snez a2, a2 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a1, a1, t0 -; RV32I-NEXT: sub a2, a7, a5 -; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: snez a4, a3 +; RV32I-NEXT: snez a6, a2 +; RV32I-NEXT: neg a7, a5 +; RV32I-NEXT: snez a5, a5 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: add a1, a1, a5 ; RV32I-NEXT: add a3, a3, a6 +; RV32I-NEXT: sltu a6, a7, a4 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a5, a7, a4 +; RV32I-NEXT: sub a1, a1, a6 ; RV32I-NEXT: neg a3, a3 ; RV32I-NEXT: .LBB32_4: -; RV32I-NEXT: sw a4, 0(a0) +; RV32I-NEXT: sw a2, 0(a0) ; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) +; RV32I-NEXT: sw a5, 8(a0) ; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; @@ -2239,47 +2239,47 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a4, 0(a2) ; RV32ZBB-NEXT: 
lw a3, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) -; RV32ZBB-NEXT: lw t0, 12(a2) -; RV32ZBB-NEXT: lw a2, 8(a1) -; RV32ZBB-NEXT: lw t1, 12(a1) -; RV32ZBB-NEXT: lw a5, 0(a1) -; RV32ZBB-NEXT: lw a7, 4(a1) -; RV32ZBB-NEXT: sltu a1, a2, a6 -; RV32ZBB-NEXT: sub t1, t1, t0 -; RV32ZBB-NEXT: sltu t0, a5, a4 -; RV32ZBB-NEXT: sub a1, t1, a1 -; RV32ZBB-NEXT: mv t1, t0 -; RV32ZBB-NEXT: beq a7, a3, .LBB32_2 +; RV32ZBB-NEXT: lw a5, 8(a2) +; RV32ZBB-NEXT: lw a6, 12(a2) +; RV32ZBB-NEXT: lw a7, 8(a1) +; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw a2, 0(a1) +; RV32ZBB-NEXT: lw a1, 4(a1) +; RV32ZBB-NEXT: sltu t1, a7, a5 +; RV32ZBB-NEXT: sub t0, t0, a6 +; RV32ZBB-NEXT: sltu a6, a2, a4 +; RV32ZBB-NEXT: sub t0, t0, t1 +; RV32ZBB-NEXT: mv t1, a6 +; RV32ZBB-NEXT: beq a1, a3, .LBB32_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a7, a3 +; RV32ZBB-NEXT: sltu t1, a1, a3 ; RV32ZBB-NEXT: .LBB32_2: -; RV32ZBB-NEXT: sub a2, a2, a6 -; RV32ZBB-NEXT: sltu a6, a2, t1 -; RV32ZBB-NEXT: sub a1, a1, a6 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: sub a3, a7, a3 -; RV32ZBB-NEXT: sub a3, a3, t0 -; RV32ZBB-NEXT: sub a4, a5, a4 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: sub a3, a1, a3 +; RV32ZBB-NEXT: sltu a1, a5, t1 +; RV32ZBB-NEXT: sub a5, a5, t1 +; RV32ZBB-NEXT: sub a1, t0, a1 +; RV32ZBB-NEXT: sub a3, a3, a6 +; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: bgez a1, .LBB32_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: snez a5, a3 -; RV32ZBB-NEXT: snez a6, a4 -; RV32ZBB-NEXT: or a5, a6, a5 -; RV32ZBB-NEXT: neg a7, a2 -; RV32ZBB-NEXT: sltu t0, a7, a5 -; RV32ZBB-NEXT: snez a2, a2 -; RV32ZBB-NEXT: add a1, a1, a2 -; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a1, a1, t0 -; RV32ZBB-NEXT: sub a2, a7, a5 -; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: snez a4, a3 +; RV32ZBB-NEXT: snez a6, a2 +; RV32ZBB-NEXT: neg a7, a5 +; RV32ZBB-NEXT: snez a5, a5 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: or a4, a6, a4 +; RV32ZBB-NEXT: add a1, a1, a5 ; RV32ZBB-NEXT: add a3, a3, a6 +; RV32ZBB-NEXT: sltu a6, a7, a4 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sub a5, a7, a4 +; RV32ZBB-NEXT: sub a1, a1, a6 ; RV32ZBB-NEXT: neg a3, a3 ; RV32ZBB-NEXT: .LBB32_4: -; RV32ZBB-NEXT: sw a4, 0(a0) +; RV32ZBB-NEXT: sw a2, 0(a0) ; RV32ZBB-NEXT: sw a3, 4(a0) -; RV32ZBB-NEXT: sw a2, 8(a0) +; RV32ZBB-NEXT: sw a5, 8(a0) ; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: ret ; @@ -2349,8 +2349,8 @@ define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: abd_select_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -2361,8 +2361,8 @@ define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { ; RV64I-LABEL: abd_select_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 56 -; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -2389,8 +2389,8 @@ define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_select_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: srai a1, a0, 31 @@ -2401,8 +2401,8 @@ define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_select_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 48 
+; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -2582,30 +2582,30 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub t0, t1, t0 ; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sltu a7, a6, t4 -; RV32I-NEXT: sub a2, a2, a7 ; RV32I-NEXT: sub a3, a5, a3 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a1, a1, t2 +; RV32I-NEXT: sub a4, t0, a2 +; RV32I-NEXT: sltu a5, a6, t4 +; RV32I-NEXT: sub a2, a1, t2 +; RV32I-NEXT: sub a1, a4, a5 ; RV32I-NEXT: sub a4, a6, t4 ; RV32I-NEXT: j .LBB38_13 ; RV32I-NEXT: .LBB38_12: ; RV32I-NEXT: sltu a2, a6, a7 ; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t6 -; RV32I-NEXT: sub a2, a2, a7 ; RV32I-NEXT: sub a3, a3, a5 ; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, a4, t5 +; RV32I-NEXT: sub a1, t0, a2 +; RV32I-NEXT: sltu a5, a6, t6 +; RV32I-NEXT: sub a2, a4, t5 +; RV32I-NEXT: sub a1, a1, a5 ; RV32I-NEXT: sub a4, a6, t6 ; RV32I-NEXT: .LBB38_13: ; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a2, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -2678,30 +2678,30 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub t0, t1, t0 ; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sltu a7, a6, t4 -; RV32ZBB-NEXT: sub a2, a2, a7 ; RV32ZBB-NEXT: sub a3, a5, a3 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a4, t0, a2 +; RV32ZBB-NEXT: sltu a5, a6, t4 +; RV32ZBB-NEXT: sub a2, a1, t2 +; RV32ZBB-NEXT: sub a1, a4, a5 ; RV32ZBB-NEXT: sub a4, a6, t4 ; RV32ZBB-NEXT: j .LBB38_13 ; RV32ZBB-NEXT: .LBB38_12: ; RV32ZBB-NEXT: sltu a2, a6, a7 ; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t6 -; RV32ZBB-NEXT: sub a2, a2, a7 ; RV32ZBB-NEXT: sub a3, a3, a5 ; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, a4, t5 +; RV32ZBB-NEXT: sub a1, t0, a2 +; RV32ZBB-NEXT: sltu a5, a6, t6 +; RV32ZBB-NEXT: sub a2, a4, t5 +; RV32ZBB-NEXT: sub a1, a1, a5 ; RV32ZBB-NEXT: sub a4, a6, t6 ; RV32ZBB-NEXT: .LBB38_13: ; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a1, 4(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/abdu-neg.ll b/llvm/test/CodeGen/RISCV/abdu-neg.ll index b39285c3d343f..9e41cde7ae181 100644 --- a/llvm/test/CodeGen/RISCV/abdu-neg.ll +++ b/llvm/test/CodeGen/RISCV/abdu-neg.ll @@ -220,8 +220,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind { ; RV64I-LABEL: abd_ext_i16_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 48 -; RV64I-NEXT: srli a0, a0, 48 ; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli a0, a0, 48 ; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -322,8 +322,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; RV64I-LABEL: abd_ext_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: sub a0, a0, a1 ; 
RV64I-NEXT: srai a1, a0, 63 @@ -341,8 +341,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; RV64ZBB-LABEL: abd_ext_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a0, a0, 32 -; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: slli a1, a1, 32 +; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: sub a0, a0, a1 ; RV64ZBB-NEXT: neg a1, a0 @@ -375,8 +375,8 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_ext_i32_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: srli a1, a1, 48 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -426,8 +426,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; RV64I-LABEL: abd_ext_i32_undef: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -445,8 +445,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; RV64ZBB-LABEL: abd_ext_i32_undef: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a0, a0, 32 -; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: slli a1, a1, 32 +; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: sub a0, a0, a1 ; RV64ZBB-NEXT: neg a1, a0 @@ -477,13 +477,13 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: .LBB9_3: ; RV32I-NEXT: neg a1, a0 ; RV32I-NEXT: xor a2, a2, a1 -; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: xor a1, a3, a1 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: xor a3, a3, a1 +; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: add a3, a3, a0 ; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: snez a2, a0 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: sub a3, a3, a1 +; RV32I-NEXT: snez a1, a0 +; RV32I-NEXT: add a1, a3, a1 ; RV32I-NEXT: neg a1, a1 ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: ret @@ -515,13 +515,13 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; RV32ZBB-NEXT: .LBB9_3: ; RV32ZBB-NEXT: neg a1, a0 ; RV32ZBB-NEXT: xor a2, a2, a1 -; RV32ZBB-NEXT: sltu a4, a2, a1 -; RV32ZBB-NEXT: xor a1, a3, a1 -; RV32ZBB-NEXT: add a1, a1, a0 -; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: xor a3, a3, a1 +; RV32ZBB-NEXT: sltu a1, a2, a1 +; RV32ZBB-NEXT: add a3, a3, a0 ; RV32ZBB-NEXT: add a0, a2, a0 -; RV32ZBB-NEXT: snez a2, a0 -; RV32ZBB-NEXT: add a1, a1, a2 +; RV32ZBB-NEXT: sub a3, a3, a1 +; RV32ZBB-NEXT: snez a1, a0 +; RV32ZBB-NEXT: add a1, a3, a1 ; RV32ZBB-NEXT: neg a1, a1 ; RV32ZBB-NEXT: neg a0, a0 ; RV32ZBB-NEXT: ret @@ -557,13 +557,13 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: .LBB10_3: ; RV32I-NEXT: neg a1, a0 ; RV32I-NEXT: xor a2, a2, a1 -; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: xor a1, a3, a1 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: xor a3, a3, a1 +; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: add a3, a3, a0 ; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: snez a2, a0 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: sub a3, a3, a1 +; RV32I-NEXT: snez a1, a0 +; RV32I-NEXT: add a1, a3, a1 ; RV32I-NEXT: neg a1, a1 ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: ret @@ -595,13 +595,13 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; RV32ZBB-NEXT: .LBB10_3: ; RV32ZBB-NEXT: neg a1, a0 ; RV32ZBB-NEXT: xor a2, a2, a1 -; RV32ZBB-NEXT: sltu a4, a2, a1 -; RV32ZBB-NEXT: xor a1, a3, a1 -; RV32ZBB-NEXT: add a1, a1, a0 -; RV32ZBB-NEXT: sub a1, a1, a4 +; 
RV32ZBB-NEXT: xor a3, a3, a1 +; RV32ZBB-NEXT: sltu a1, a2, a1 +; RV32ZBB-NEXT: add a3, a3, a0 ; RV32ZBB-NEXT: add a0, a2, a0 -; RV32ZBB-NEXT: snez a2, a0 -; RV32ZBB-NEXT: add a1, a1, a2 +; RV32ZBB-NEXT: sub a3, a3, a1 +; RV32ZBB-NEXT: snez a1, a0 +; RV32ZBB-NEXT: add a1, a3, a1 ; RV32ZBB-NEXT: neg a1, a1 ; RV32ZBB-NEXT: neg a0, a0 ; RV32ZBB-NEXT: ret @@ -624,87 +624,87 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_ext_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a4, 0(a2) -; RV32I-NEXT: lw a6, 4(a2) -; RV32I-NEXT: lw t1, 8(a2) -; RV32I-NEXT: lw a2, 12(a2) -; RV32I-NEXT: lw a3, 8(a1) -; RV32I-NEXT: lw a5, 12(a1) -; RV32I-NEXT: lw a7, 0(a1) +; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a7, 4(a2) +; RV32I-NEXT: lw a3, 8(a2) +; RV32I-NEXT: lw t1, 12(a2) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a6, 12(a1) +; RV32I-NEXT: lw a2, 0(a1) ; RV32I-NEXT: lw t0, 4(a1) -; RV32I-NEXT: sltu a1, a3, t1 -; RV32I-NEXT: sub a2, a5, a2 -; RV32I-NEXT: sltu t2, a7, a4 -; RV32I-NEXT: sub a1, a2, a1 -; RV32I-NEXT: mv a2, t2 -; RV32I-NEXT: beq t0, a6, .LBB11_2 +; RV32I-NEXT: sltu a1, a4, a3 +; RV32I-NEXT: sub t1, a6, t1 +; RV32I-NEXT: sltu t2, a2, a5 +; RV32I-NEXT: sub a1, t1, a1 +; RV32I-NEXT: mv t1, t2 +; RV32I-NEXT: beq t0, a7, .LBB11_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu a2, t0, a6 +; RV32I-NEXT: sltu t1, t0, a7 ; RV32I-NEXT: .LBB11_2: -; RV32I-NEXT: sub t1, a3, t1 -; RV32I-NEXT: sltu t3, t1, a2 +; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: sltu t3, a3, t1 ; RV32I-NEXT: sub a1, a1, t3 -; RV32I-NEXT: sub a2, t1, a2 -; RV32I-NEXT: beq a1, a5, .LBB11_4 +; RV32I-NEXT: sub a3, a3, t1 +; RV32I-NEXT: beq a1, a6, .LBB11_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t1, a5, a1 +; RV32I-NEXT: sltu t1, a6, a1 ; RV32I-NEXT: j .LBB11_5 ; RV32I-NEXT: .LBB11_4: -; RV32I-NEXT: sltu t1, a3, a2 +; RV32I-NEXT: sltu t1, a4, a3 ; RV32I-NEXT: .LBB11_5: -; RV32I-NEXT: sub a6, t0, a6 -; RV32I-NEXT: sub a6, a6, t2 -; RV32I-NEXT: sub t2, a7, a4 -; RV32I-NEXT: beq a6, t0, .LBB11_7 +; RV32I-NEXT: sub a7, t0, a7 +; RV32I-NEXT: sub a7, a7, t2 +; RV32I-NEXT: sub a5, a2, a5 +; RV32I-NEXT: beq a7, t0, .LBB11_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: sltu a4, t0, a6 +; RV32I-NEXT: sltu a2, t0, a7 ; RV32I-NEXT: j .LBB11_8 ; RV32I-NEXT: .LBB11_7: -; RV32I-NEXT: sltu a4, a7, t2 +; RV32I-NEXT: sltu a2, a2, a5 ; RV32I-NEXT: .LBB11_8: -; RV32I-NEXT: xor a5, a1, a5 -; RV32I-NEXT: xor a3, a2, a3 -; RV32I-NEXT: or a3, a3, a5 -; RV32I-NEXT: beqz a3, .LBB11_10 +; RV32I-NEXT: xor a6, a1, a6 +; RV32I-NEXT: xor a4, a3, a4 +; RV32I-NEXT: or a4, a4, a6 +; RV32I-NEXT: beqz a4, .LBB11_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a4, t1 +; RV32I-NEXT: mv a2, t1 ; RV32I-NEXT: .LBB11_10: -; RV32I-NEXT: neg t0, a4 -; RV32I-NEXT: xor a5, t2, t0 -; RV32I-NEXT: sltu t2, a5, t0 -; RV32I-NEXT: xor t3, a6, t0 -; RV32I-NEXT: add a3, t3, a4 -; RV32I-NEXT: sub a3, a3, t2 -; RV32I-NEXT: snez t1, a3 -; RV32I-NEXT: add a5, a5, a4 -; RV32I-NEXT: snez a7, a5 -; RV32I-NEXT: or t1, a7, t1 -; RV32I-NEXT: beqz a6, .LBB11_12 +; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: xor t0, a5, a4 +; RV32I-NEXT: xor t3, a7, a4 +; RV32I-NEXT: sltu a5, t0, a4 +; RV32I-NEXT: add a6, t3, a2 +; RV32I-NEXT: add t0, t0, a2 +; RV32I-NEXT: sub t1, a6, a5 +; RV32I-NEXT: snez a6, t1 +; RV32I-NEXT: snez t2, t0 +; RV32I-NEXT: or a6, t2, a6 +; RV32I-NEXT: beqz a7, .LBB11_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu t2, t3, t0 +; RV32I-NEXT: sltu a5, t3, a4 ; RV32I-NEXT: .LBB11_12: -; RV32I-NEXT: xor a2, a2, t0 -; RV32I-NEXT: 
add a6, a2, a4 -; RV32I-NEXT: sub t3, a6, t2 -; RV32I-NEXT: neg t4, t3 -; RV32I-NEXT: sltu t5, t4, t1 -; RV32I-NEXT: sltu a2, a2, t0 -; RV32I-NEXT: xor a1, a1, t0 -; RV32I-NEXT: add a1, a1, a4 -; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: sltu a2, a6, t2 -; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: snez a2, t3 +; RV32I-NEXT: xor a3, a3, a4 +; RV32I-NEXT: xor a1, a1, a4 +; RV32I-NEXT: add t1, t1, t2 +; RV32I-NEXT: neg a7, t0 +; RV32I-NEXT: add t0, a3, a2 +; RV32I-NEXT: sltu a3, a3, a4 ; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: neg a2, t1 +; RV32I-NEXT: sub a4, t0, a5 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sltu a3, t0, a5 +; RV32I-NEXT: neg a5, a4 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: snez a3, a4 +; RV32I-NEXT: sltu a4, a5, a6 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: sub a3, a5, a6 ; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a1, a1, t5 -; RV32I-NEXT: sub a2, t4, t1 -; RV32I-NEXT: add a3, a3, a7 -; RV32I-NEXT: neg a3, a3 -; RV32I-NEXT: neg a4, a5 -; RV32I-NEXT: sw a4, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) +; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: sw a7, 0(a0) +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a3, 8(a0) ; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; @@ -723,100 +723,100 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: .LBB11_3: ; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: xor a2, a2, a1 -; RV64I-NEXT: sltu a4, a2, a1 -; RV64I-NEXT: xor a1, a3, a1 -; RV64I-NEXT: add a1, a1, a0 -; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: xor a3, a3, a1 +; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: add a3, a3, a0 ; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: snez a2, a0 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: sub a3, a3, a1 +; RV64I-NEXT: snez a1, a0 +; RV64I-NEXT: add a1, a3, a1 ; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: neg a0, a0 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_ext_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a4, 0(a2) -; RV32ZBB-NEXT: lw a6, 4(a2) -; RV32ZBB-NEXT: lw t1, 8(a2) -; RV32ZBB-NEXT: lw a2, 12(a2) -; RV32ZBB-NEXT: lw a3, 8(a1) -; RV32ZBB-NEXT: lw a5, 12(a1) -; RV32ZBB-NEXT: lw a7, 0(a1) +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a7, 4(a2) +; RV32ZBB-NEXT: lw a3, 8(a2) +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a6, 12(a1) +; RV32ZBB-NEXT: lw a2, 0(a1) ; RV32ZBB-NEXT: lw t0, 4(a1) -; RV32ZBB-NEXT: sltu a1, a3, t1 -; RV32ZBB-NEXT: sub a2, a5, a2 -; RV32ZBB-NEXT: sltu t2, a7, a4 -; RV32ZBB-NEXT: sub a1, a2, a1 -; RV32ZBB-NEXT: mv a2, t2 -; RV32ZBB-NEXT: beq t0, a6, .LBB11_2 +; RV32ZBB-NEXT: sltu a1, a4, a3 +; RV32ZBB-NEXT: sub t1, a6, t1 +; RV32ZBB-NEXT: sltu t2, a2, a5 +; RV32ZBB-NEXT: sub a1, t1, a1 +; RV32ZBB-NEXT: mv t1, t2 +; RV32ZBB-NEXT: beq t0, a7, .LBB11_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu a2, t0, a6 +; RV32ZBB-NEXT: sltu t1, t0, a7 ; RV32ZBB-NEXT: .LBB11_2: -; RV32ZBB-NEXT: sub t1, a3, t1 -; RV32ZBB-NEXT: sltu t3, t1, a2 +; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: sltu t3, a3, t1 ; RV32ZBB-NEXT: sub a1, a1, t3 -; RV32ZBB-NEXT: sub a2, t1, a2 -; RV32ZBB-NEXT: beq a1, a5, .LBB11_4 +; RV32ZBB-NEXT: sub a3, a3, t1 +; RV32ZBB-NEXT: beq a1, a6, .LBB11_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t1, a5, a1 +; RV32ZBB-NEXT: sltu t1, a6, a1 ; RV32ZBB-NEXT: j .LBB11_5 ; RV32ZBB-NEXT: .LBB11_4: -; RV32ZBB-NEXT: sltu t1, a3, a2 +; RV32ZBB-NEXT: sltu t1, a4, a3 ; RV32ZBB-NEXT: .LBB11_5: -; RV32ZBB-NEXT: sub a6, t0, a6 -; RV32ZBB-NEXT: sub a6, a6, t2 -; RV32ZBB-NEXT: sub t2, a7, a4 -; RV32ZBB-NEXT: beq a6, t0, .LBB11_7 +; RV32ZBB-NEXT: sub a7, t0, a7 +; 
RV32ZBB-NEXT: sub a7, a7, t2 +; RV32ZBB-NEXT: sub a5, a2, a5 +; RV32ZBB-NEXT: beq a7, t0, .LBB11_7 ; RV32ZBB-NEXT: # %bb.6: -; RV32ZBB-NEXT: sltu a4, t0, a6 +; RV32ZBB-NEXT: sltu a2, t0, a7 ; RV32ZBB-NEXT: j .LBB11_8 ; RV32ZBB-NEXT: .LBB11_7: -; RV32ZBB-NEXT: sltu a4, a7, t2 +; RV32ZBB-NEXT: sltu a2, a2, a5 ; RV32ZBB-NEXT: .LBB11_8: -; RV32ZBB-NEXT: xor a5, a1, a5 -; RV32ZBB-NEXT: xor a3, a2, a3 -; RV32ZBB-NEXT: or a3, a3, a5 -; RV32ZBB-NEXT: beqz a3, .LBB11_10 +; RV32ZBB-NEXT: xor a6, a1, a6 +; RV32ZBB-NEXT: xor a4, a3, a4 +; RV32ZBB-NEXT: or a4, a4, a6 +; RV32ZBB-NEXT: beqz a4, .LBB11_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: mv a4, t1 +; RV32ZBB-NEXT: mv a2, t1 ; RV32ZBB-NEXT: .LBB11_10: -; RV32ZBB-NEXT: neg t0, a4 -; RV32ZBB-NEXT: xor a5, t2, t0 -; RV32ZBB-NEXT: sltu t2, a5, t0 -; RV32ZBB-NEXT: xor t3, a6, t0 -; RV32ZBB-NEXT: add a3, t3, a4 -; RV32ZBB-NEXT: sub a3, a3, t2 -; RV32ZBB-NEXT: snez t1, a3 -; RV32ZBB-NEXT: add a5, a5, a4 -; RV32ZBB-NEXT: snez a7, a5 -; RV32ZBB-NEXT: or t1, a7, t1 -; RV32ZBB-NEXT: beqz a6, .LBB11_12 +; RV32ZBB-NEXT: neg a4, a2 +; RV32ZBB-NEXT: xor t0, a5, a4 +; RV32ZBB-NEXT: xor t3, a7, a4 +; RV32ZBB-NEXT: sltu a5, t0, a4 +; RV32ZBB-NEXT: add a6, t3, a2 +; RV32ZBB-NEXT: add t0, t0, a2 +; RV32ZBB-NEXT: sub t1, a6, a5 +; RV32ZBB-NEXT: snez a6, t1 +; RV32ZBB-NEXT: snez t2, t0 +; RV32ZBB-NEXT: or a6, t2, a6 +; RV32ZBB-NEXT: beqz a7, .LBB11_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu t2, t3, t0 +; RV32ZBB-NEXT: sltu a5, t3, a4 ; RV32ZBB-NEXT: .LBB11_12: -; RV32ZBB-NEXT: xor a2, a2, t0 -; RV32ZBB-NEXT: add a6, a2, a4 -; RV32ZBB-NEXT: sub t3, a6, t2 -; RV32ZBB-NEXT: neg t4, t3 -; RV32ZBB-NEXT: sltu t5, t4, t1 -; RV32ZBB-NEXT: sltu a2, a2, t0 -; RV32ZBB-NEXT: xor a1, a1, t0 -; RV32ZBB-NEXT: add a1, a1, a4 -; RV32ZBB-NEXT: sub a1, a1, a2 -; RV32ZBB-NEXT: sltu a2, a6, t2 -; RV32ZBB-NEXT: sub a1, a1, a2 -; RV32ZBB-NEXT: snez a2, t3 +; RV32ZBB-NEXT: xor a3, a3, a4 +; RV32ZBB-NEXT: xor a1, a1, a4 +; RV32ZBB-NEXT: add t1, t1, t2 +; RV32ZBB-NEXT: neg a7, t0 +; RV32ZBB-NEXT: add t0, a3, a2 +; RV32ZBB-NEXT: sltu a3, a3, a4 ; RV32ZBB-NEXT: add a1, a1, a2 +; RV32ZBB-NEXT: neg a2, t1 +; RV32ZBB-NEXT: sub a4, t0, a5 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sltu a3, t0, a5 +; RV32ZBB-NEXT: neg a5, a4 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: snez a3, a4 +; RV32ZBB-NEXT: sltu a4, a5, a6 +; RV32ZBB-NEXT: add a1, a1, a3 +; RV32ZBB-NEXT: sub a3, a5, a6 ; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a1, a1, t5 -; RV32ZBB-NEXT: sub a2, t4, t1 -; RV32ZBB-NEXT: add a3, a3, a7 -; RV32ZBB-NEXT: neg a3, a3 -; RV32ZBB-NEXT: neg a4, a5 -; RV32ZBB-NEXT: sw a4, 0(a0) -; RV32ZBB-NEXT: sw a3, 4(a0) -; RV32ZBB-NEXT: sw a2, 8(a0) +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sw a7, 0(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) +; RV32ZBB-NEXT: sw a3, 8(a0) ; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: ret ; @@ -835,13 +835,13 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV64ZBB-NEXT: .LBB11_3: ; RV64ZBB-NEXT: neg a1, a0 ; RV64ZBB-NEXT: xor a2, a2, a1 -; RV64ZBB-NEXT: sltu a4, a2, a1 -; RV64ZBB-NEXT: xor a1, a3, a1 -; RV64ZBB-NEXT: add a1, a1, a0 -; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: xor a3, a3, a1 +; RV64ZBB-NEXT: sltu a1, a2, a1 +; RV64ZBB-NEXT: add a3, a3, a0 ; RV64ZBB-NEXT: add a0, a2, a0 -; RV64ZBB-NEXT: snez a2, a0 -; RV64ZBB-NEXT: add a1, a1, a2 +; RV64ZBB-NEXT: sub a3, a3, a1 +; RV64ZBB-NEXT: snez a1, a0 +; RV64ZBB-NEXT: add a1, a3, a1 ; RV64ZBB-NEXT: neg a1, a1 ; RV64ZBB-NEXT: neg a0, a0 ; RV64ZBB-NEXT: ret @@ -857,87 +857,87 @@ define i128 
@abd_ext_i128(i128 %a, i128 %b) nounwind { define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_ext_i128_undef: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a4, 0(a2) -; RV32I-NEXT: lw a6, 4(a2) -; RV32I-NEXT: lw t1, 8(a2) -; RV32I-NEXT: lw a2, 12(a2) -; RV32I-NEXT: lw a3, 8(a1) -; RV32I-NEXT: lw a5, 12(a1) -; RV32I-NEXT: lw a7, 0(a1) +; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a7, 4(a2) +; RV32I-NEXT: lw a3, 8(a2) +; RV32I-NEXT: lw t1, 12(a2) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a6, 12(a1) +; RV32I-NEXT: lw a2, 0(a1) ; RV32I-NEXT: lw t0, 4(a1) -; RV32I-NEXT: sltu a1, a3, t1 -; RV32I-NEXT: sub a2, a5, a2 -; RV32I-NEXT: sltu t2, a7, a4 -; RV32I-NEXT: sub a1, a2, a1 -; RV32I-NEXT: mv a2, t2 -; RV32I-NEXT: beq t0, a6, .LBB12_2 +; RV32I-NEXT: sltu a1, a4, a3 +; RV32I-NEXT: sub t1, a6, t1 +; RV32I-NEXT: sltu t2, a2, a5 +; RV32I-NEXT: sub a1, t1, a1 +; RV32I-NEXT: mv t1, t2 +; RV32I-NEXT: beq t0, a7, .LBB12_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu a2, t0, a6 +; RV32I-NEXT: sltu t1, t0, a7 ; RV32I-NEXT: .LBB12_2: -; RV32I-NEXT: sub t1, a3, t1 -; RV32I-NEXT: sltu t3, t1, a2 +; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: sltu t3, a3, t1 ; RV32I-NEXT: sub a1, a1, t3 -; RV32I-NEXT: sub a2, t1, a2 -; RV32I-NEXT: beq a1, a5, .LBB12_4 +; RV32I-NEXT: sub a3, a3, t1 +; RV32I-NEXT: beq a1, a6, .LBB12_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t1, a5, a1 +; RV32I-NEXT: sltu t1, a6, a1 ; RV32I-NEXT: j .LBB12_5 ; RV32I-NEXT: .LBB12_4: -; RV32I-NEXT: sltu t1, a3, a2 +; RV32I-NEXT: sltu t1, a4, a3 ; RV32I-NEXT: .LBB12_5: -; RV32I-NEXT: sub a6, t0, a6 -; RV32I-NEXT: sub a6, a6, t2 -; RV32I-NEXT: sub t2, a7, a4 -; RV32I-NEXT: beq a6, t0, .LBB12_7 +; RV32I-NEXT: sub a7, t0, a7 +; RV32I-NEXT: sub a7, a7, t2 +; RV32I-NEXT: sub a5, a2, a5 +; RV32I-NEXT: beq a7, t0, .LBB12_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: sltu a4, t0, a6 +; RV32I-NEXT: sltu a2, t0, a7 ; RV32I-NEXT: j .LBB12_8 ; RV32I-NEXT: .LBB12_7: -; RV32I-NEXT: sltu a4, a7, t2 +; RV32I-NEXT: sltu a2, a2, a5 ; RV32I-NEXT: .LBB12_8: -; RV32I-NEXT: xor a5, a1, a5 -; RV32I-NEXT: xor a3, a2, a3 -; RV32I-NEXT: or a3, a3, a5 -; RV32I-NEXT: beqz a3, .LBB12_10 +; RV32I-NEXT: xor a6, a1, a6 +; RV32I-NEXT: xor a4, a3, a4 +; RV32I-NEXT: or a4, a4, a6 +; RV32I-NEXT: beqz a4, .LBB12_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a4, t1 +; RV32I-NEXT: mv a2, t1 ; RV32I-NEXT: .LBB12_10: -; RV32I-NEXT: neg t0, a4 -; RV32I-NEXT: xor a5, t2, t0 -; RV32I-NEXT: sltu t2, a5, t0 -; RV32I-NEXT: xor t3, a6, t0 -; RV32I-NEXT: add a3, t3, a4 -; RV32I-NEXT: sub a3, a3, t2 -; RV32I-NEXT: snez t1, a3 -; RV32I-NEXT: add a5, a5, a4 -; RV32I-NEXT: snez a7, a5 -; RV32I-NEXT: or t1, a7, t1 -; RV32I-NEXT: beqz a6, .LBB12_12 +; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: xor t0, a5, a4 +; RV32I-NEXT: xor t3, a7, a4 +; RV32I-NEXT: sltu a5, t0, a4 +; RV32I-NEXT: add a6, t3, a2 +; RV32I-NEXT: add t0, t0, a2 +; RV32I-NEXT: sub t1, a6, a5 +; RV32I-NEXT: snez a6, t1 +; RV32I-NEXT: snez t2, t0 +; RV32I-NEXT: or a6, t2, a6 +; RV32I-NEXT: beqz a7, .LBB12_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu t2, t3, t0 +; RV32I-NEXT: sltu a5, t3, a4 ; RV32I-NEXT: .LBB12_12: -; RV32I-NEXT: xor a2, a2, t0 -; RV32I-NEXT: add a6, a2, a4 -; RV32I-NEXT: sub t3, a6, t2 -; RV32I-NEXT: neg t4, t3 -; RV32I-NEXT: sltu t5, t4, t1 -; RV32I-NEXT: sltu a2, a2, t0 -; RV32I-NEXT: xor a1, a1, t0 -; RV32I-NEXT: add a1, a1, a4 -; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: sltu a2, a6, t2 -; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: snez a2, t3 +; RV32I-NEXT: xor a3, a3, a4 +; RV32I-NEXT: xor a1, a1, a4 +; 
RV32I-NEXT: add t1, t1, t2 +; RV32I-NEXT: neg a7, t0 +; RV32I-NEXT: add t0, a3, a2 +; RV32I-NEXT: sltu a3, a3, a4 ; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: neg a2, t1 +; RV32I-NEXT: sub a4, t0, a5 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sltu a3, t0, a5 +; RV32I-NEXT: neg a5, a4 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: snez a3, a4 +; RV32I-NEXT: sltu a4, a5, a6 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: sub a3, a5, a6 ; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a1, a1, t5 -; RV32I-NEXT: sub a2, t4, t1 -; RV32I-NEXT: add a3, a3, a7 -; RV32I-NEXT: neg a3, a3 -; RV32I-NEXT: neg a4, a5 -; RV32I-NEXT: sw a4, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) +; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: sw a7, 0(a0) +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a3, 8(a0) ; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; @@ -956,100 +956,100 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: .LBB12_3: ; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: xor a2, a2, a1 -; RV64I-NEXT: sltu a4, a2, a1 -; RV64I-NEXT: xor a1, a3, a1 -; RV64I-NEXT: add a1, a1, a0 -; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: xor a3, a3, a1 +; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: add a3, a3, a0 ; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: snez a2, a0 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: sub a3, a3, a1 +; RV64I-NEXT: snez a1, a0 +; RV64I-NEXT: add a1, a3, a1 ; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: neg a0, a0 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_ext_i128_undef: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a4, 0(a2) -; RV32ZBB-NEXT: lw a6, 4(a2) -; RV32ZBB-NEXT: lw t1, 8(a2) -; RV32ZBB-NEXT: lw a2, 12(a2) -; RV32ZBB-NEXT: lw a3, 8(a1) -; RV32ZBB-NEXT: lw a5, 12(a1) -; RV32ZBB-NEXT: lw a7, 0(a1) +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a7, 4(a2) +; RV32ZBB-NEXT: lw a3, 8(a2) +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a6, 12(a1) +; RV32ZBB-NEXT: lw a2, 0(a1) ; RV32ZBB-NEXT: lw t0, 4(a1) -; RV32ZBB-NEXT: sltu a1, a3, t1 -; RV32ZBB-NEXT: sub a2, a5, a2 -; RV32ZBB-NEXT: sltu t2, a7, a4 -; RV32ZBB-NEXT: sub a1, a2, a1 -; RV32ZBB-NEXT: mv a2, t2 -; RV32ZBB-NEXT: beq t0, a6, .LBB12_2 +; RV32ZBB-NEXT: sltu a1, a4, a3 +; RV32ZBB-NEXT: sub t1, a6, t1 +; RV32ZBB-NEXT: sltu t2, a2, a5 +; RV32ZBB-NEXT: sub a1, t1, a1 +; RV32ZBB-NEXT: mv t1, t2 +; RV32ZBB-NEXT: beq t0, a7, .LBB12_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu a2, t0, a6 +; RV32ZBB-NEXT: sltu t1, t0, a7 ; RV32ZBB-NEXT: .LBB12_2: -; RV32ZBB-NEXT: sub t1, a3, t1 -; RV32ZBB-NEXT: sltu t3, t1, a2 +; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: sltu t3, a3, t1 ; RV32ZBB-NEXT: sub a1, a1, t3 -; RV32ZBB-NEXT: sub a2, t1, a2 -; RV32ZBB-NEXT: beq a1, a5, .LBB12_4 +; RV32ZBB-NEXT: sub a3, a3, t1 +; RV32ZBB-NEXT: beq a1, a6, .LBB12_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t1, a5, a1 +; RV32ZBB-NEXT: sltu t1, a6, a1 ; RV32ZBB-NEXT: j .LBB12_5 ; RV32ZBB-NEXT: .LBB12_4: -; RV32ZBB-NEXT: sltu t1, a3, a2 +; RV32ZBB-NEXT: sltu t1, a4, a3 ; RV32ZBB-NEXT: .LBB12_5: -; RV32ZBB-NEXT: sub a6, t0, a6 -; RV32ZBB-NEXT: sub a6, a6, t2 -; RV32ZBB-NEXT: sub t2, a7, a4 -; RV32ZBB-NEXT: beq a6, t0, .LBB12_7 +; RV32ZBB-NEXT: sub a7, t0, a7 +; RV32ZBB-NEXT: sub a7, a7, t2 +; RV32ZBB-NEXT: sub a5, a2, a5 +; RV32ZBB-NEXT: beq a7, t0, .LBB12_7 ; RV32ZBB-NEXT: # %bb.6: -; RV32ZBB-NEXT: sltu a4, t0, a6 +; RV32ZBB-NEXT: sltu a2, t0, a7 ; RV32ZBB-NEXT: j .LBB12_8 ; RV32ZBB-NEXT: .LBB12_7: -; RV32ZBB-NEXT: sltu a4, a7, t2 +; RV32ZBB-NEXT: sltu a2, a2, a5 ; RV32ZBB-NEXT: .LBB12_8: -; RV32ZBB-NEXT: xor a5, a1, a5 
-; RV32ZBB-NEXT: xor a3, a2, a3 -; RV32ZBB-NEXT: or a3, a3, a5 -; RV32ZBB-NEXT: beqz a3, .LBB12_10 +; RV32ZBB-NEXT: xor a6, a1, a6 +; RV32ZBB-NEXT: xor a4, a3, a4 +; RV32ZBB-NEXT: or a4, a4, a6 +; RV32ZBB-NEXT: beqz a4, .LBB12_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: mv a4, t1 +; RV32ZBB-NEXT: mv a2, t1 ; RV32ZBB-NEXT: .LBB12_10: -; RV32ZBB-NEXT: neg t0, a4 -; RV32ZBB-NEXT: xor a5, t2, t0 -; RV32ZBB-NEXT: sltu t2, a5, t0 -; RV32ZBB-NEXT: xor t3, a6, t0 -; RV32ZBB-NEXT: add a3, t3, a4 -; RV32ZBB-NEXT: sub a3, a3, t2 -; RV32ZBB-NEXT: snez t1, a3 -; RV32ZBB-NEXT: add a5, a5, a4 -; RV32ZBB-NEXT: snez a7, a5 -; RV32ZBB-NEXT: or t1, a7, t1 -; RV32ZBB-NEXT: beqz a6, .LBB12_12 +; RV32ZBB-NEXT: neg a4, a2 +; RV32ZBB-NEXT: xor t0, a5, a4 +; RV32ZBB-NEXT: xor t3, a7, a4 +; RV32ZBB-NEXT: sltu a5, t0, a4 +; RV32ZBB-NEXT: add a6, t3, a2 +; RV32ZBB-NEXT: add t0, t0, a2 +; RV32ZBB-NEXT: sub t1, a6, a5 +; RV32ZBB-NEXT: snez a6, t1 +; RV32ZBB-NEXT: snez t2, t0 +; RV32ZBB-NEXT: or a6, t2, a6 +; RV32ZBB-NEXT: beqz a7, .LBB12_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu t2, t3, t0 +; RV32ZBB-NEXT: sltu a5, t3, a4 ; RV32ZBB-NEXT: .LBB12_12: -; RV32ZBB-NEXT: xor a2, a2, t0 -; RV32ZBB-NEXT: add a6, a2, a4 -; RV32ZBB-NEXT: sub t3, a6, t2 -; RV32ZBB-NEXT: neg t4, t3 -; RV32ZBB-NEXT: sltu t5, t4, t1 -; RV32ZBB-NEXT: sltu a2, a2, t0 -; RV32ZBB-NEXT: xor a1, a1, t0 -; RV32ZBB-NEXT: add a1, a1, a4 -; RV32ZBB-NEXT: sub a1, a1, a2 -; RV32ZBB-NEXT: sltu a2, a6, t2 -; RV32ZBB-NEXT: sub a1, a1, a2 -; RV32ZBB-NEXT: snez a2, t3 +; RV32ZBB-NEXT: xor a3, a3, a4 +; RV32ZBB-NEXT: xor a1, a1, a4 +; RV32ZBB-NEXT: add t1, t1, t2 +; RV32ZBB-NEXT: neg a7, t0 +; RV32ZBB-NEXT: add t0, a3, a2 +; RV32ZBB-NEXT: sltu a3, a3, a4 ; RV32ZBB-NEXT: add a1, a1, a2 +; RV32ZBB-NEXT: neg a2, t1 +; RV32ZBB-NEXT: sub a4, t0, a5 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sltu a3, t0, a5 +; RV32ZBB-NEXT: neg a5, a4 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: snez a3, a4 +; RV32ZBB-NEXT: sltu a4, a5, a6 +; RV32ZBB-NEXT: add a1, a1, a3 +; RV32ZBB-NEXT: sub a3, a5, a6 ; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a1, a1, t5 -; RV32ZBB-NEXT: sub a2, t4, t1 -; RV32ZBB-NEXT: add a3, a3, a7 -; RV32ZBB-NEXT: neg a3, a3 -; RV32ZBB-NEXT: neg a4, a5 -; RV32ZBB-NEXT: sw a4, 0(a0) -; RV32ZBB-NEXT: sw a3, 4(a0) -; RV32ZBB-NEXT: sw a2, 8(a0) +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sw a7, 0(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) +; RV32ZBB-NEXT: sw a3, 8(a0) ; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: ret ; @@ -1068,13 +1068,13 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV64ZBB-NEXT: .LBB12_3: ; RV64ZBB-NEXT: neg a1, a0 ; RV64ZBB-NEXT: xor a2, a2, a1 -; RV64ZBB-NEXT: sltu a4, a2, a1 -; RV64ZBB-NEXT: xor a1, a3, a1 -; RV64ZBB-NEXT: add a1, a1, a0 -; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: xor a3, a3, a1 +; RV64ZBB-NEXT: sltu a1, a2, a1 +; RV64ZBB-NEXT: add a3, a3, a0 ; RV64ZBB-NEXT: add a0, a2, a0 -; RV64ZBB-NEXT: snez a2, a0 -; RV64ZBB-NEXT: add a1, a1, a2 +; RV64ZBB-NEXT: sub a3, a3, a1 +; RV64ZBB-NEXT: snez a1, a0 +; RV64ZBB-NEXT: add a1, a3, a1 ; RV64ZBB-NEXT: neg a1, a1 ; RV64ZBB-NEXT: neg a0, a0 ; RV64ZBB-NEXT: ret @@ -1402,26 +1402,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: mv a5, t0 ; RV32I-NEXT: mv a4, a7 ; RV32I-NEXT: .LBB17_19: -; RV32I-NEXT: sltu a6, t3, a4 -; RV32I-NEXT: sub a7, t4, a5 -; RV32I-NEXT: sltu a5, a2, a1 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: mv a7, a5 +; RV32I-NEXT: sltu a7, t3, a4 +; RV32I-NEXT: sub a5, t4, a5 +; RV32I-NEXT: sltu a6, a2, a1 +; 
RV32I-NEXT: sub a5, a5, a7 +; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beq t1, a3, .LBB17_21 ; RV32I-NEXT: # %bb.20: ; RV32I-NEXT: sltu a7, t1, a3 ; RV32I-NEXT: .LBB17_21: ; RV32I-NEXT: sub a4, t3, a4 -; RV32I-NEXT: sltu t0, a4, a7 -; RV32I-NEXT: sub a6, a6, t0 -; RV32I-NEXT: sub a4, a4, a7 ; RV32I-NEXT: sub a3, t1, a3 -; RV32I-NEXT: sub a3, a3, a5 ; RV32I-NEXT: sub a2, a2, a1 +; RV32I-NEXT: sltu a1, a4, a7 +; RV32I-NEXT: sub a4, a4, a7 +; RV32I-NEXT: sub a3, a3, a6 +; RV32I-NEXT: sub a5, a5, a1 ; RV32I-NEXT: sw a2, 0(a0) ; RV32I-NEXT: sw a3, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a6, 12(a0) +; RV32I-NEXT: sw a5, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -1529,26 +1529,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: mv a5, t0 ; RV32ZBB-NEXT: mv a4, a7 ; RV32ZBB-NEXT: .LBB17_19: -; RV32ZBB-NEXT: sltu a6, t3, a4 -; RV32ZBB-NEXT: sub a7, t4, a5 -; RV32ZBB-NEXT: sltu a5, a2, a1 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: mv a7, a5 +; RV32ZBB-NEXT: sltu a7, t3, a4 +; RV32ZBB-NEXT: sub a5, t4, a5 +; RV32ZBB-NEXT: sltu a6, a2, a1 +; RV32ZBB-NEXT: sub a5, a5, a7 +; RV32ZBB-NEXT: mv a7, a6 ; RV32ZBB-NEXT: beq t1, a3, .LBB17_21 ; RV32ZBB-NEXT: # %bb.20: ; RV32ZBB-NEXT: sltu a7, t1, a3 ; RV32ZBB-NEXT: .LBB17_21: ; RV32ZBB-NEXT: sub a4, t3, a4 -; RV32ZBB-NEXT: sltu t0, a4, a7 -; RV32ZBB-NEXT: sub a6, a6, t0 -; RV32ZBB-NEXT: sub a4, a4, a7 ; RV32ZBB-NEXT: sub a3, t1, a3 -; RV32ZBB-NEXT: sub a3, a3, a5 ; RV32ZBB-NEXT: sub a2, a2, a1 +; RV32ZBB-NEXT: sltu a1, a4, a7 +; RV32ZBB-NEXT: sub a4, a4, a7 +; RV32ZBB-NEXT: sub a3, a3, a6 +; RV32ZBB-NEXT: sub a5, a5, a1 ; RV32ZBB-NEXT: sw a2, 0(a0) ; RV32ZBB-NEXT: sw a3, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a6, 12(a0) +; RV32ZBB-NEXT: sw a5, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -1835,30 +1835,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: sltu t1, a5, a6 ; RV32I-NEXT: sub a7, a7, t0 -; RV32I-NEXT: sub a7, a7, t1 -; RV32I-NEXT: sub a6, a5, a6 -; RV32I-NEXT: sltu a5, a6, t5 -; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: sub a6, a6, t5 +; RV32I-NEXT: sub a5, a5, a6 ; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, a4, t4 +; RV32I-NEXT: sub a6, a7, t1 +; RV32I-NEXT: sltu a7, a5, t5 +; RV32I-NEXT: sub a1, a5, t5 +; RV32I-NEXT: sub a5, a4, t4 +; RV32I-NEXT: sub a4, a6, a7 ; RV32I-NEXT: sub a2, a3, a2 ; RV32I-NEXT: j .LBB22_11 ; RV32I-NEXT: .LBB22_10: ; RV32I-NEXT: sub a7, t0, a7 -; RV32I-NEXT: sub a6, a6, a5 -; RV32I-NEXT: sub a5, a7, t1 -; RV32I-NEXT: sltu a7, a6, t3 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a5, a5, a7 -; RV32I-NEXT: sub a6, a6, t3 -; RV32I-NEXT: sub a1, a1, t2 +; RV32I-NEXT: sub a5, a6, a5 +; RV32I-NEXT: sub a4, a1, a4 +; RV32I-NEXT: sub a6, a7, t1 +; RV32I-NEXT: sltu a7, a5, t3 +; RV32I-NEXT: sub a1, a5, t3 +; RV32I-NEXT: sub a5, a4, t2 +; RV32I-NEXT: sub a4, a6, a7 ; RV32I-NEXT: sub a2, a2, a3 ; RV32I-NEXT: .LBB22_11: ; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a1, 4(a0) -; RV32I-NEXT: sw a6, 8(a0) -; RV32I-NEXT: sw a5, 12(a0) +; RV32I-NEXT: sw a5, 4(a0) +; RV32I-NEXT: sw a1, 8(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: @@ -1922,30 +1922,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: sltu t1, a5, a6 ; RV32ZBB-NEXT: sub a7, a7, t0 -; RV32ZBB-NEXT: sub a7, a7, t1 -; RV32ZBB-NEXT: sub a6, a5, a6 -; 
RV32ZBB-NEXT: sltu a5, a6, t5 -; RV32ZBB-NEXT: sub a5, a7, a5 -; RV32ZBB-NEXT: sub a6, a6, t5 +; RV32ZBB-NEXT: sub a5, a5, a6 ; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, a4, t4 +; RV32ZBB-NEXT: sub a6, a7, t1 +; RV32ZBB-NEXT: sltu a7, a5, t5 +; RV32ZBB-NEXT: sub a1, a5, t5 +; RV32ZBB-NEXT: sub a5, a4, t4 +; RV32ZBB-NEXT: sub a4, a6, a7 ; RV32ZBB-NEXT: sub a2, a3, a2 ; RV32ZBB-NEXT: j .LBB22_11 ; RV32ZBB-NEXT: .LBB22_10: ; RV32ZBB-NEXT: sub a7, t0, a7 -; RV32ZBB-NEXT: sub a6, a6, a5 -; RV32ZBB-NEXT: sub a5, a7, t1 -; RV32ZBB-NEXT: sltu a7, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a5, a5, a7 -; RV32ZBB-NEXT: sub a6, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a5, a6, a5 +; RV32ZBB-NEXT: sub a4, a1, a4 +; RV32ZBB-NEXT: sub a6, a7, t1 +; RV32ZBB-NEXT: sltu a7, a5, t3 +; RV32ZBB-NEXT: sub a1, a5, t3 +; RV32ZBB-NEXT: sub a5, a4, t2 +; RV32ZBB-NEXT: sub a4, a6, a7 ; RV32ZBB-NEXT: sub a2, a2, a3 ; RV32ZBB-NEXT: .LBB22_11: ; RV32ZBB-NEXT: sw a2, 0(a0) -; RV32ZBB-NEXT: sw a1, 4(a0) -; RV32ZBB-NEXT: sw a6, 8(a0) -; RV32ZBB-NEXT: sw a5, 12(a0) +; RV32ZBB-NEXT: sw a5, 4(a0) +; RV32ZBB-NEXT: sw a1, 8(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: diff --git a/llvm/test/CodeGen/RISCV/abdu.ll b/llvm/test/CodeGen/RISCV/abdu.ll index 814bca98523ce..7c8638cb461e2 100644 --- a/llvm/test/CodeGen/RISCV/abdu.ll +++ b/llvm/test/CodeGen/RISCV/abdu.ll @@ -178,8 +178,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind { ; RV64I-LABEL: abd_ext_i16_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: srli a0, a0, 48 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -198,8 +198,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind { ; RV64ZBB-LABEL: abd_ext_i16_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a1, a1, 32 -; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: zext.h a0, a0 +; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: minu a2, a0, a1 ; RV64ZBB-NEXT: maxu a0, a0, a1 ; RV64ZBB-NEXT: sub a0, a0, a2 @@ -267,8 +267,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; RV64I-LABEL: abd_ext_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -286,8 +286,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; RV64ZBB-LABEL: abd_ext_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a1, a1, 32 -; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: slli a0, a0, 32 +; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: minu a2, a0, a1 ; RV64ZBB-NEXT: maxu a0, a0, a1 @@ -317,8 +317,8 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; RV64I-LABEL: abd_ext_i32_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: srli a1, a1, 48 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -337,8 +337,8 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; RV64ZBB-LABEL: abd_ext_i32_i16: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a0, a0, 32 -; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: zext.h a1, a1 +; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: minu a2, a0, a1 ; RV64ZBB-NEXT: maxu a0, a0, a1 ; RV64ZBB-NEXT: sub a0, a0, a2 @@ -365,8 +365,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) 
nounwind { ; RV64I-LABEL: abd_ext_i32_undef: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -384,8 +384,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; RV64ZBB-LABEL: abd_ext_i32_undef: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a1, a1, 32 -; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: slli a0, a0, 32 +; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: minu a2, a0, a1 ; RV64ZBB-NEXT: maxu a0, a0, a1 @@ -415,10 +415,10 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: .LBB9_3: ; RV32I-NEXT: neg a1, a0 ; RV32I-NEXT: xor a2, a2, a1 -; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: xor a1, a3, a1 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: xor a3, a3, a1 +; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: add a3, a3, a0 +; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: ret ; @@ -447,10 +447,10 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; RV32ZBB-NEXT: .LBB9_3: ; RV32ZBB-NEXT: neg a1, a0 ; RV32ZBB-NEXT: xor a2, a2, a1 -; RV32ZBB-NEXT: sltu a4, a2, a1 -; RV32ZBB-NEXT: xor a1, a3, a1 -; RV32ZBB-NEXT: add a1, a1, a0 -; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: xor a3, a3, a1 +; RV32ZBB-NEXT: sltu a1, a2, a1 +; RV32ZBB-NEXT: add a3, a3, a0 +; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: add a0, a2, a0 ; RV32ZBB-NEXT: ret ; @@ -484,10 +484,10 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: .LBB10_3: ; RV32I-NEXT: neg a1, a0 ; RV32I-NEXT: xor a2, a2, a1 -; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: xor a1, a3, a1 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: xor a3, a3, a1 +; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: add a3, a3, a0 +; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: ret ; @@ -516,10 +516,10 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; RV32ZBB-NEXT: .LBB10_3: ; RV32ZBB-NEXT: neg a1, a0 ; RV32ZBB-NEXT: xor a2, a2, a1 -; RV32ZBB-NEXT: sltu a4, a2, a1 -; RV32ZBB-NEXT: xor a1, a3, a1 -; RV32ZBB-NEXT: add a1, a1, a0 -; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: xor a3, a3, a1 +; RV32ZBB-NEXT: sltu a1, a2, a1 +; RV32ZBB-NEXT: add a3, a3, a0 +; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: add a0, a2, a0 ; RV32ZBB-NEXT: ret ; @@ -587,29 +587,29 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB11_10: ; RV32I-NEXT: neg t0, a1 ; RV32I-NEXT: xor a2, a7, t0 -; RV32I-NEXT: sltu a4, a2, t0 ; RV32I-NEXT: xor a6, a6, t0 -; RV32I-NEXT: add a6, a6, a1 -; RV32I-NEXT: sub a4, a6, a4 -; RV32I-NEXT: xor a3, a3, t0 -; RV32I-NEXT: sltu a6, a3, t0 -; RV32I-NEXT: xor a7, a5, t0 -; RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: xor a4, a3, t0 +; RV32I-NEXT: sltu a3, a2, t0 +; RV32I-NEXT: add a7, a6, a1 +; RV32I-NEXT: sltu a6, a4, t0 +; RV32I-NEXT: sub a3, a7, a3 +; RV32I-NEXT: xor t1, a5, t0 +; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beqz a5, .LBB11_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu t1, a7, t0 +; RV32I-NEXT: sltu a7, t1, t0 ; RV32I-NEXT: .LBB11_12: ; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: sltu a5, a2, t1 -; RV32I-NEXT: sub a4, a4, a5 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: add a7, a7, a1 -; RV32I-NEXT: sub a5, a7, a6 -; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add t1, t1, a1 +; RV32I-NEXT: add a1, a4, a1 +; RV32I-NEXT: sltu a4, a2, a7 +; RV32I-NEXT: sub a2, a2, a7 +; RV32I-NEXT: 
sub a5, t1, a6 +; RV32I-NEXT: sub a3, a3, a4 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) ; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_ext_i128: @@ -627,10 +627,10 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: .LBB11_3: ; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: xor a2, a2, a1 -; RV64I-NEXT: sltu a4, a2, a1 -; RV64I-NEXT: xor a1, a3, a1 -; RV64I-NEXT: add a1, a1, a0 -; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: xor a3, a3, a1 +; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: add a0, a2, a0 ; RV64I-NEXT: ret ; @@ -683,29 +683,29 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB11_10: ; RV32ZBB-NEXT: neg t0, a1 ; RV32ZBB-NEXT: xor a2, a7, t0 -; RV32ZBB-NEXT: sltu a4, a2, t0 ; RV32ZBB-NEXT: xor a6, a6, t0 -; RV32ZBB-NEXT: add a6, a6, a1 -; RV32ZBB-NEXT: sub a4, a6, a4 -; RV32ZBB-NEXT: xor a3, a3, t0 -; RV32ZBB-NEXT: sltu a6, a3, t0 -; RV32ZBB-NEXT: xor a7, a5, t0 -; RV32ZBB-NEXT: mv t1, a6 +; RV32ZBB-NEXT: xor a4, a3, t0 +; RV32ZBB-NEXT: sltu a3, a2, t0 +; RV32ZBB-NEXT: add a7, a6, a1 +; RV32ZBB-NEXT: sltu a6, a4, t0 +; RV32ZBB-NEXT: sub a3, a7, a3 +; RV32ZBB-NEXT: xor t1, a5, t0 +; RV32ZBB-NEXT: mv a7, a6 ; RV32ZBB-NEXT: beqz a5, .LBB11_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu t1, a7, t0 +; RV32ZBB-NEXT: sltu a7, t1, t0 ; RV32ZBB-NEXT: .LBB11_12: ; RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: sltu a5, a2, t1 -; RV32ZBB-NEXT: sub a4, a4, a5 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: add a7, a7, a1 -; RV32ZBB-NEXT: sub a5, a7, a6 -; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: add t1, t1, a1 +; RV32ZBB-NEXT: add a1, a4, a1 +; RV32ZBB-NEXT: sltu a4, a2, a7 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a5, t1, a6 +; RV32ZBB-NEXT: sub a3, a3, a4 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) ; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: sw a3, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_ext_i128: @@ -723,10 +723,10 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV64ZBB-NEXT: .LBB11_3: ; RV64ZBB-NEXT: neg a1, a0 ; RV64ZBB-NEXT: xor a2, a2, a1 -; RV64ZBB-NEXT: sltu a4, a2, a1 -; RV64ZBB-NEXT: xor a1, a3, a1 -; RV64ZBB-NEXT: add a1, a1, a0 -; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: xor a3, a3, a1 +; RV64ZBB-NEXT: sltu a1, a2, a1 +; RV64ZBB-NEXT: add a3, a3, a0 +; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: add a0, a2, a0 ; RV64ZBB-NEXT: ret %aext = zext i128 %a to i256 @@ -787,29 +787,29 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB12_10: ; RV32I-NEXT: neg t0, a1 ; RV32I-NEXT: xor a2, a7, t0 -; RV32I-NEXT: sltu a4, a2, t0 ; RV32I-NEXT: xor a6, a6, t0 -; RV32I-NEXT: add a6, a6, a1 -; RV32I-NEXT: sub a4, a6, a4 -; RV32I-NEXT: xor a3, a3, t0 -; RV32I-NEXT: sltu a6, a3, t0 -; RV32I-NEXT: xor a7, a5, t0 -; RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: xor a4, a3, t0 +; RV32I-NEXT: sltu a3, a2, t0 +; RV32I-NEXT: add a7, a6, a1 +; RV32I-NEXT: sltu a6, a4, t0 +; RV32I-NEXT: sub a3, a7, a3 +; RV32I-NEXT: xor t1, a5, t0 +; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beqz a5, .LBB12_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu t1, a7, t0 +; RV32I-NEXT: sltu a7, t1, t0 ; RV32I-NEXT: .LBB12_12: ; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: sltu a5, a2, t1 -; RV32I-NEXT: sub a4, a4, a5 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: add a7, a7, a1 -; RV32I-NEXT: sub a5, a7, a6 -; RV32I-NEXT: add a1, a3, a1 +; 
RV32I-NEXT: add t1, t1, a1 +; RV32I-NEXT: add a1, a4, a1 +; RV32I-NEXT: sltu a4, a2, a7 +; RV32I-NEXT: sub a2, a2, a7 +; RV32I-NEXT: sub a5, t1, a6 +; RV32I-NEXT: sub a3, a3, a4 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) ; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_ext_i128_undef: @@ -827,10 +827,10 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: .LBB12_3: ; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: xor a2, a2, a1 -; RV64I-NEXT: sltu a4, a2, a1 -; RV64I-NEXT: xor a1, a3, a1 -; RV64I-NEXT: add a1, a1, a0 -; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: xor a3, a3, a1 +; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: add a0, a2, a0 ; RV64I-NEXT: ret ; @@ -883,29 +883,29 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB12_10: ; RV32ZBB-NEXT: neg t0, a1 ; RV32ZBB-NEXT: xor a2, a7, t0 -; RV32ZBB-NEXT: sltu a4, a2, t0 ; RV32ZBB-NEXT: xor a6, a6, t0 -; RV32ZBB-NEXT: add a6, a6, a1 -; RV32ZBB-NEXT: sub a4, a6, a4 -; RV32ZBB-NEXT: xor a3, a3, t0 -; RV32ZBB-NEXT: sltu a6, a3, t0 -; RV32ZBB-NEXT: xor a7, a5, t0 -; RV32ZBB-NEXT: mv t1, a6 +; RV32ZBB-NEXT: xor a4, a3, t0 +; RV32ZBB-NEXT: sltu a3, a2, t0 +; RV32ZBB-NEXT: add a7, a6, a1 +; RV32ZBB-NEXT: sltu a6, a4, t0 +; RV32ZBB-NEXT: sub a3, a7, a3 +; RV32ZBB-NEXT: xor t1, a5, t0 +; RV32ZBB-NEXT: mv a7, a6 ; RV32ZBB-NEXT: beqz a5, .LBB12_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu t1, a7, t0 +; RV32ZBB-NEXT: sltu a7, t1, t0 ; RV32ZBB-NEXT: .LBB12_12: ; RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: sltu a5, a2, t1 -; RV32ZBB-NEXT: sub a4, a4, a5 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: add a7, a7, a1 -; RV32ZBB-NEXT: sub a5, a7, a6 -; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: add t1, t1, a1 +; RV32ZBB-NEXT: add a1, a4, a1 +; RV32ZBB-NEXT: sltu a4, a2, a7 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a5, t1, a6 +; RV32ZBB-NEXT: sub a3, a3, a4 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) ; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: sw a3, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_ext_i128_undef: @@ -923,10 +923,10 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV64ZBB-NEXT: .LBB12_3: ; RV64ZBB-NEXT: neg a1, a0 ; RV64ZBB-NEXT: xor a2, a2, a1 -; RV64ZBB-NEXT: sltu a4, a2, a1 -; RV64ZBB-NEXT: xor a1, a3, a1 -; RV64ZBB-NEXT: add a1, a1, a0 -; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: xor a3, a3, a1 +; RV64ZBB-NEXT: sltu a1, a2, a1 +; RV64ZBB-NEXT: add a3, a3, a0 +; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: add a0, a2, a0 ; RV64ZBB-NEXT: ret %aext = zext i128 %a to i256 @@ -1029,8 +1029,8 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; RV64I-LABEL: abd_minmax_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -1048,8 +1048,8 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; RV64ZBB-LABEL: abd_minmax_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a1, a1, 32 -; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: slli a0, a0, 32 +; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: minu a2, a0, a1 ; RV64ZBB-NEXT: maxu a0, a0, a1 @@ -1077,10 +1077,10 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: .LBB16_3: ; 
RV32I-NEXT: neg a1, a0 ; RV32I-NEXT: xor a2, a2, a1 -; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: xor a1, a3, a1 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: xor a3, a3, a1 +; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: add a3, a3, a0 +; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: ret ; @@ -1109,10 +1109,10 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { ; RV32ZBB-NEXT: .LBB16_3: ; RV32ZBB-NEXT: neg a1, a0 ; RV32ZBB-NEXT: xor a2, a2, a1 -; RV32ZBB-NEXT: sltu a4, a2, a1 -; RV32ZBB-NEXT: xor a1, a3, a1 -; RV32ZBB-NEXT: add a1, a1, a0 -; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: xor a3, a3, a1 +; RV32ZBB-NEXT: sltu a1, a2, a1 +; RV32ZBB-NEXT: add a3, a3, a0 +; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: add a0, a2, a0 ; RV32ZBB-NEXT: ret ; @@ -1178,29 +1178,29 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB17_10: ; RV32I-NEXT: neg t0, a1 ; RV32I-NEXT: xor a2, a7, t0 -; RV32I-NEXT: sltu a4, a2, t0 ; RV32I-NEXT: xor a6, a6, t0 -; RV32I-NEXT: add a6, a6, a1 -; RV32I-NEXT: sub a4, a6, a4 -; RV32I-NEXT: xor a3, a3, t0 -; RV32I-NEXT: sltu a6, a3, t0 -; RV32I-NEXT: xor a7, a5, t0 -; RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: xor a4, a3, t0 +; RV32I-NEXT: sltu a3, a2, t0 +; RV32I-NEXT: add a7, a6, a1 +; RV32I-NEXT: sltu a6, a4, t0 +; RV32I-NEXT: sub a3, a7, a3 +; RV32I-NEXT: xor t1, a5, t0 +; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beqz a5, .LBB17_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu t1, a7, t0 +; RV32I-NEXT: sltu a7, t1, t0 ; RV32I-NEXT: .LBB17_12: ; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: sltu a5, a2, t1 -; RV32I-NEXT: sub a4, a4, a5 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: add a7, a7, a1 -; RV32I-NEXT: sub a5, a7, a6 -; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add t1, t1, a1 +; RV32I-NEXT: add a1, a4, a1 +; RV32I-NEXT: sltu a4, a2, a7 +; RV32I-NEXT: sub a2, a2, a7 +; RV32I-NEXT: sub a5, t1, a6 +; RV32I-NEXT: sub a3, a3, a4 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) ; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_minmax_i128: @@ -1218,10 +1218,10 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: .LBB17_3: ; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: xor a2, a2, a1 -; RV64I-NEXT: sltu a4, a2, a1 -; RV64I-NEXT: xor a1, a3, a1 -; RV64I-NEXT: add a1, a1, a0 -; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: xor a3, a3, a1 +; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: add a0, a2, a0 ; RV64I-NEXT: ret ; @@ -1274,29 +1274,29 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB17_10: ; RV32ZBB-NEXT: neg t0, a1 ; RV32ZBB-NEXT: xor a2, a7, t0 -; RV32ZBB-NEXT: sltu a4, a2, t0 ; RV32ZBB-NEXT: xor a6, a6, t0 -; RV32ZBB-NEXT: add a6, a6, a1 -; RV32ZBB-NEXT: sub a4, a6, a4 -; RV32ZBB-NEXT: xor a3, a3, t0 -; RV32ZBB-NEXT: sltu a6, a3, t0 -; RV32ZBB-NEXT: xor a7, a5, t0 -; RV32ZBB-NEXT: mv t1, a6 +; RV32ZBB-NEXT: xor a4, a3, t0 +; RV32ZBB-NEXT: sltu a3, a2, t0 +; RV32ZBB-NEXT: add a7, a6, a1 +; RV32ZBB-NEXT: sltu a6, a4, t0 +; RV32ZBB-NEXT: sub a3, a7, a3 +; RV32ZBB-NEXT: xor t1, a5, t0 +; RV32ZBB-NEXT: mv a7, a6 ; RV32ZBB-NEXT: beqz a5, .LBB17_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu t1, a7, t0 +; RV32ZBB-NEXT: sltu a7, t1, t0 ; RV32ZBB-NEXT: .LBB17_12: ; RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: sltu a5, a2, t1 -; RV32ZBB-NEXT: sub a4, a4, a5 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: add a7, a7, a1 -; 
RV32ZBB-NEXT: sub a5, a7, a6 -; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: add t1, t1, a1 +; RV32ZBB-NEXT: add a1, a4, a1 +; RV32ZBB-NEXT: sltu a4, a2, a7 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a5, t1, a6 +; RV32ZBB-NEXT: sub a3, a3, a4 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) ; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: sw a3, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_minmax_i128: @@ -1314,10 +1314,10 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV64ZBB-NEXT: .LBB17_3: ; RV64ZBB-NEXT: neg a1, a0 ; RV64ZBB-NEXT: xor a2, a2, a1 -; RV64ZBB-NEXT: sltu a4, a2, a1 -; RV64ZBB-NEXT: xor a1, a3, a1 -; RV64ZBB-NEXT: add a1, a1, a0 -; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: xor a3, a3, a1 +; RV64ZBB-NEXT: sltu a1, a2, a1 +; RV64ZBB-NEXT: add a3, a3, a0 +; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: add a0, a2, a0 ; RV64ZBB-NEXT: ret %min = call i128 @llvm.umin.i128(i128 %a, i128 %b) @@ -1420,8 +1420,8 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; RV64I-LABEL: abd_cmp_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -1439,8 +1439,8 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; RV64ZBB-LABEL: abd_cmp_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a1, a1, 32 -; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: slli a0, a0, 32 +; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: minu a2, a0, a1 ; RV64ZBB-NEXT: maxu a0, a0, a1 @@ -1469,10 +1469,10 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: .LBB21_3: ; RV32I-NEXT: neg a1, a0 ; RV32I-NEXT: xor a2, a2, a1 -; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: xor a1, a3, a1 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: xor a3, a3, a1 +; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: add a3, a3, a0 +; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: ret ; @@ -1501,10 +1501,10 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32ZBB-NEXT: .LBB21_3: ; RV32ZBB-NEXT: neg a1, a0 ; RV32ZBB-NEXT: xor a2, a2, a1 -; RV32ZBB-NEXT: sltu a4, a2, a1 -; RV32ZBB-NEXT: xor a1, a3, a1 -; RV32ZBB-NEXT: add a1, a1, a0 -; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: xor a3, a3, a1 +; RV32ZBB-NEXT: sltu a1, a2, a1 +; RV32ZBB-NEXT: add a3, a3, a0 +; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: add a0, a2, a0 ; RV32ZBB-NEXT: ret ; @@ -1571,29 +1571,29 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB22_10: ; RV32I-NEXT: neg t0, a1 ; RV32I-NEXT: xor a2, a7, t0 -; RV32I-NEXT: sltu a4, a2, t0 ; RV32I-NEXT: xor a6, a6, t0 -; RV32I-NEXT: add a6, a6, a1 -; RV32I-NEXT: sub a4, a6, a4 -; RV32I-NEXT: xor a3, a3, t0 -; RV32I-NEXT: sltu a6, a3, t0 -; RV32I-NEXT: xor a7, a5, t0 -; RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: xor a4, a3, t0 +; RV32I-NEXT: sltu a3, a2, t0 +; RV32I-NEXT: add a7, a6, a1 +; RV32I-NEXT: sltu a6, a4, t0 +; RV32I-NEXT: sub a3, a7, a3 +; RV32I-NEXT: xor t1, a5, t0 +; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beqz a5, .LBB22_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu t1, a7, t0 +; RV32I-NEXT: sltu a7, t1, t0 ; RV32I-NEXT: .LBB22_12: ; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: sltu a5, a2, t1 -; RV32I-NEXT: sub a4, a4, a5 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: add a7, a7, a1 -; RV32I-NEXT: sub a5, a7, a6 -; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add 
t1, t1, a1 +; RV32I-NEXT: add a1, a4, a1 +; RV32I-NEXT: sltu a4, a2, a7 +; RV32I-NEXT: sub a2, a2, a7 +; RV32I-NEXT: sub a5, t1, a6 +; RV32I-NEXT: sub a3, a3, a4 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) ; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: @@ -1611,10 +1611,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: .LBB22_3: ; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: xor a2, a2, a1 -; RV64I-NEXT: sltu a4, a2, a1 -; RV64I-NEXT: xor a1, a3, a1 -; RV64I-NEXT: add a1, a1, a0 -; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: xor a3, a3, a1 +; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: add a0, a2, a0 ; RV64I-NEXT: ret ; @@ -1667,29 +1667,29 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB22_10: ; RV32ZBB-NEXT: neg t0, a1 ; RV32ZBB-NEXT: xor a2, a7, t0 -; RV32ZBB-NEXT: sltu a4, a2, t0 ; RV32ZBB-NEXT: xor a6, a6, t0 -; RV32ZBB-NEXT: add a6, a6, a1 -; RV32ZBB-NEXT: sub a4, a6, a4 -; RV32ZBB-NEXT: xor a3, a3, t0 -; RV32ZBB-NEXT: sltu a6, a3, t0 -; RV32ZBB-NEXT: xor a7, a5, t0 -; RV32ZBB-NEXT: mv t1, a6 +; RV32ZBB-NEXT: xor a4, a3, t0 +; RV32ZBB-NEXT: sltu a3, a2, t0 +; RV32ZBB-NEXT: add a7, a6, a1 +; RV32ZBB-NEXT: sltu a6, a4, t0 +; RV32ZBB-NEXT: sub a3, a7, a3 +; RV32ZBB-NEXT: xor t1, a5, t0 +; RV32ZBB-NEXT: mv a7, a6 ; RV32ZBB-NEXT: beqz a5, .LBB22_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu t1, a7, t0 +; RV32ZBB-NEXT: sltu a7, t1, t0 ; RV32ZBB-NEXT: .LBB22_12: ; RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: sltu a5, a2, t1 -; RV32ZBB-NEXT: sub a4, a4, a5 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: add a7, a7, a1 -; RV32ZBB-NEXT: sub a5, a7, a6 -; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: add t1, t1, a1 +; RV32ZBB-NEXT: add a1, a4, a1 +; RV32ZBB-NEXT: sltu a4, a2, a7 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a5, t1, a6 +; RV32ZBB-NEXT: sub a3, a3, a4 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) ; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: sw a3, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: @@ -1707,10 +1707,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV64ZBB-NEXT: .LBB22_3: ; RV64ZBB-NEXT: neg a1, a0 ; RV64ZBB-NEXT: xor a2, a2, a1 -; RV64ZBB-NEXT: sltu a4, a2, a1 -; RV64ZBB-NEXT: xor a1, a3, a1 -; RV64ZBB-NEXT: add a1, a1, a0 -; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: xor a3, a3, a1 +; RV64ZBB-NEXT: sltu a1, a2, a1 +; RV64ZBB-NEXT: add a3, a3, a0 +; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: add a0, a2, a0 ; RV64ZBB-NEXT: ret %cmp = icmp uge i128 %a, %b @@ -1814,8 +1814,8 @@ define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { ; RV64I-LABEL: abd_select_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srai a1, a0, 63 @@ -1833,8 +1833,8 @@ define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { ; RV64ZBB-LABEL: abd_select_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: slli a1, a1, 32 -; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: slli a0, a0, 32 +; RV64ZBB-NEXT: srli a1, a1, 32 ; RV64ZBB-NEXT: srli a0, a0, 32 ; RV64ZBB-NEXT: minu a2, a0, a1 ; RV64ZBB-NEXT: maxu a0, a0, a1 @@ -1863,10 +1863,10 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: .LBB26_3: ; RV32I-NEXT: neg a1, a0 ; RV32I-NEXT: xor a2, a2, a1 
-; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: xor a1, a3, a1 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: xor a3, a3, a1 +; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: add a3, a3, a0 +; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: ret ; @@ -1895,10 +1895,10 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { ; RV32ZBB-NEXT: .LBB26_3: ; RV32ZBB-NEXT: neg a1, a0 ; RV32ZBB-NEXT: xor a2, a2, a1 -; RV32ZBB-NEXT: sltu a4, a2, a1 -; RV32ZBB-NEXT: xor a1, a3, a1 -; RV32ZBB-NEXT: add a1, a1, a0 -; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: xor a3, a3, a1 +; RV32ZBB-NEXT: sltu a1, a2, a1 +; RV32ZBB-NEXT: add a3, a3, a0 +; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: add a0, a2, a0 ; RV32ZBB-NEXT: ret ; @@ -1965,29 +1965,29 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB27_10: ; RV32I-NEXT: neg t0, a1 ; RV32I-NEXT: xor a2, a7, t0 -; RV32I-NEXT: sltu a4, a2, t0 ; RV32I-NEXT: xor a6, a6, t0 -; RV32I-NEXT: add a6, a6, a1 -; RV32I-NEXT: sub a4, a6, a4 -; RV32I-NEXT: xor a3, a3, t0 -; RV32I-NEXT: sltu a6, a3, t0 -; RV32I-NEXT: xor a7, a5, t0 -; RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: xor a4, a3, t0 +; RV32I-NEXT: sltu a3, a2, t0 +; RV32I-NEXT: add a7, a6, a1 +; RV32I-NEXT: sltu a6, a4, t0 +; RV32I-NEXT: sub a3, a7, a3 +; RV32I-NEXT: xor t1, a5, t0 +; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beqz a5, .LBB27_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu t1, a7, t0 +; RV32I-NEXT: sltu a7, t1, t0 ; RV32I-NEXT: .LBB27_12: ; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: sltu a5, a2, t1 -; RV32I-NEXT: sub a4, a4, a5 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: add a7, a7, a1 -; RV32I-NEXT: sub a5, a7, a6 -; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add t1, t1, a1 +; RV32I-NEXT: add a1, a4, a1 +; RV32I-NEXT: sltu a4, a2, a7 +; RV32I-NEXT: sub a2, a2, a7 +; RV32I-NEXT: sub a5, t1, a6 +; RV32I-NEXT: sub a3, a3, a4 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) ; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_select_i128: @@ -2005,10 +2005,10 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: .LBB27_3: ; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: xor a2, a2, a1 -; RV64I-NEXT: sltu a4, a2, a1 -; RV64I-NEXT: xor a1, a3, a1 -; RV64I-NEXT: add a1, a1, a0 -; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: xor a3, a3, a1 +; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: add a0, a2, a0 ; RV64I-NEXT: ret ; @@ -2061,29 +2061,29 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB27_10: ; RV32ZBB-NEXT: neg t0, a1 ; RV32ZBB-NEXT: xor a2, a7, t0 -; RV32ZBB-NEXT: sltu a4, a2, t0 ; RV32ZBB-NEXT: xor a6, a6, t0 -; RV32ZBB-NEXT: add a6, a6, a1 -; RV32ZBB-NEXT: sub a4, a6, a4 -; RV32ZBB-NEXT: xor a3, a3, t0 -; RV32ZBB-NEXT: sltu a6, a3, t0 -; RV32ZBB-NEXT: xor a7, a5, t0 -; RV32ZBB-NEXT: mv t1, a6 +; RV32ZBB-NEXT: xor a4, a3, t0 +; RV32ZBB-NEXT: sltu a3, a2, t0 +; RV32ZBB-NEXT: add a7, a6, a1 +; RV32ZBB-NEXT: sltu a6, a4, t0 +; RV32ZBB-NEXT: sub a3, a7, a3 +; RV32ZBB-NEXT: xor t1, a5, t0 +; RV32ZBB-NEXT: mv a7, a6 ; RV32ZBB-NEXT: beqz a5, .LBB27_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu t1, a7, t0 +; RV32ZBB-NEXT: sltu a7, t1, t0 ; RV32ZBB-NEXT: .LBB27_12: ; RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: sltu a5, a2, t1 -; RV32ZBB-NEXT: sub a4, a4, a5 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: add a7, a7, a1 -; RV32ZBB-NEXT: sub a5, a7, a6 -; RV32ZBB-NEXT: add 
a1, a3, a1 +; RV32ZBB-NEXT: add t1, t1, a1 +; RV32ZBB-NEXT: add a1, a4, a1 +; RV32ZBB-NEXT: sltu a4, a2, a7 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a5, t1, a6 +; RV32ZBB-NEXT: sub a3, a3, a4 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) ; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: sw a3, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_select_i128: @@ -2101,10 +2101,10 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV64ZBB-NEXT: .LBB27_3: ; RV64ZBB-NEXT: neg a1, a0 ; RV64ZBB-NEXT: xor a2, a2, a1 -; RV64ZBB-NEXT: sltu a4, a2, a1 -; RV64ZBB-NEXT: xor a1, a3, a1 -; RV64ZBB-NEXT: add a1, a1, a0 -; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: xor a3, a3, a1 +; RV64ZBB-NEXT: sltu a1, a2, a1 +; RV64ZBB-NEXT: add a3, a3, a0 +; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: add a0, a2, a0 ; RV64ZBB-NEXT: ret %cmp = icmp ult i128 %a, %b diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll index db7498340d395..5d4478f9d4b5f 100644 --- a/llvm/test/CodeGen/RISCV/add-before-shl.ll +++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll @@ -171,16 +171,16 @@ define i128 @add_wide_operand(i128 %a) nounwind { ; RV32I-NEXT: lw a1, 12(a1) ; RV32I-NEXT: srli a5, a2, 29 ; RV32I-NEXT: slli a6, a3, 3 -; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: srli a3, a3, 29 +; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: slli a6, a4, 3 ; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: lui a6, 128 ; RV32I-NEXT: srli a4, a4, 29 ; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a2, a2, 3 -; RV32I-NEXT: lui a4, 128 -; RV32I-NEXT: add a1, a1, a4 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: add a1, a1, a6 ; RV32I-NEXT: sw a2, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) ; RV32I-NEXT: sw a3, 8(a0) @@ -191,8 +191,8 @@ define i128 @add_wide_operand(i128 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: srli a2, a0, 61 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: or a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: or a1, a1, a2 ; RV64I-NEXT: addi a2, zero, 1 ; RV64I-NEXT: slli a2, a2, 51 ; RV64I-NEXT: add a1, a1, a2 @@ -200,23 +200,23 @@ define i128 @add_wide_operand(i128 %a) nounwind { ; ; RV32C-LABEL: add_wide_operand: ; RV32C: # %bb.0: -; RV32C-NEXT: c.lw a2, 12(a1) -; RV32C-NEXT: lw a6, 0(a1) -; RV32C-NEXT: c.lw a3, 4(a1) +; RV32C-NEXT: c.lw a4, 12(a1) +; RV32C-NEXT: c.lw a3, 0(a1) +; RV32C-NEXT: c.lw a2, 4(a1) ; RV32C-NEXT: c.lw a1, 8(a1) ; RV32C-NEXT: c.lui a5, 16 -; RV32C-NEXT: c.add a2, a5 -; RV32C-NEXT: c.slli a2, 3 -; RV32C-NEXT: srli a5, a1, 29 -; RV32C-NEXT: c.or a2, a5 -; RV32C-NEXT: srli a5, a6, 29 -; RV32C-NEXT: slli a4, a3, 3 +; RV32C-NEXT: add a6, a4, a5 +; RV32C-NEXT: srli a5, a3, 29 +; RV32C-NEXT: slli a4, a2, 3 ; RV32C-NEXT: c.or a4, a5 -; RV32C-NEXT: c.srli a3, 29 +; RV32C-NEXT: srli a5, a1, 29 +; RV32C-NEXT: c.srli a2, 29 ; RV32C-NEXT: c.slli a1, 3 -; RV32C-NEXT: c.or a1, a3 +; RV32C-NEXT: c.slli a3, 3 ; RV32C-NEXT: c.slli a6, 3 -; RV32C-NEXT: sw a6, 0(a0) +; RV32C-NEXT: c.or a1, a2 +; RV32C-NEXT: or a2, a6, a5 +; RV32C-NEXT: c.sw a3, 0(a0) ; RV32C-NEXT: c.sw a4, 4(a0) ; RV32C-NEXT: c.sw a1, 8(a0) ; RV32C-NEXT: c.sw a2, 12(a0) @@ -226,8 +226,8 @@ define i128 @add_wide_operand(i128 %a) nounwind { ; RV64C: # %bb.0: ; RV64C-NEXT: srli a2, a0, 61 ; RV64C-NEXT: c.slli a1, 3 -; RV64C-NEXT: c.or a1, a2 ; RV64C-NEXT: c.slli a0, 3 +; RV64C-NEXT: c.or a1, a2 ; RV64C-NEXT: c.li a2, 1 ; RV64C-NEXT: c.slli a2, 51 ; RV64C-NEXT: c.add a1, a2 diff --git a/llvm/test/CodeGen/RISCV/add-imm.ll 
b/llvm/test/CodeGen/RISCV/add-imm.ll index 52751f1c22421..84deb4c00ac8d 100644 --- a/llvm/test/CodeGen/RISCV/add-imm.ll +++ b/llvm/test/CodeGen/RISCV/add-imm.ll @@ -213,29 +213,29 @@ define void @add32_reject() nounwind { ; RV32I-LABEL: add32_reject: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, %hi(ga) -; RV32I-NEXT: lw a1, %lo(ga)(a0) -; RV32I-NEXT: lui a2, %hi(gb) -; RV32I-NEXT: lw a3, %lo(gb)(a2) +; RV32I-NEXT: lui a1, %hi(gb) +; RV32I-NEXT: lw a2, %lo(ga)(a0) +; RV32I-NEXT: lw a3, %lo(gb)(a1) ; RV32I-NEXT: lui a4, 1 ; RV32I-NEXT: addi a4, a4, -1096 -; RV32I-NEXT: add a1, a1, a4 +; RV32I-NEXT: add a2, a2, a4 ; RV32I-NEXT: add a3, a3, a4 -; RV32I-NEXT: sw a1, %lo(ga)(a0) -; RV32I-NEXT: sw a3, %lo(gb)(a2) +; RV32I-NEXT: sw a2, %lo(ga)(a0) +; RV32I-NEXT: sw a3, %lo(gb)(a1) ; RV32I-NEXT: ret ; ; RV64I-LABEL: add32_reject: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a0, %hi(ga) -; RV64I-NEXT: lw a1, %lo(ga)(a0) -; RV64I-NEXT: lui a2, %hi(gb) -; RV64I-NEXT: lw a3, %lo(gb)(a2) +; RV64I-NEXT: lui a1, %hi(gb) +; RV64I-NEXT: lw a2, %lo(ga)(a0) +; RV64I-NEXT: lw a3, %lo(gb)(a1) ; RV64I-NEXT: lui a4, 1 ; RV64I-NEXT: addi a4, a4, -1096 -; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: add a2, a2, a4 ; RV64I-NEXT: add a3, a3, a4 -; RV64I-NEXT: sw a1, %lo(ga)(a0) -; RV64I-NEXT: sw a3, %lo(gb)(a2) +; RV64I-NEXT: sw a2, %lo(ga)(a0) +; RV64I-NEXT: sw a3, %lo(gb)(a1) ; RV64I-NEXT: ret %1 = load i32, ptr @ga, align 4 %2 = load i32, ptr @gb, align 4 diff --git a/llvm/test/CodeGen/RISCV/addcarry.ll b/llvm/test/CodeGen/RISCV/addcarry.ll index 3a4163a8bb50f..ff0d1e75c746c 100644 --- a/llvm/test/CodeGen/RISCV/addcarry.ll +++ b/llvm/test/CodeGen/RISCV/addcarry.ll @@ -12,16 +12,16 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind { ; RISCV32: # %bb.0: ; RISCV32-NEXT: mul a4, a0, a3 ; RISCV32-NEXT: mulhu a5, a0, a2 -; RISCV32-NEXT: add a6, a5, a4 -; RISCV32-NEXT: mul a4, a1, a2 -; RISCV32-NEXT: add a4, a6, a4 -; RISCV32-NEXT: sltu a7, a4, a6 -; RISCV32-NEXT: sltu a5, a6, a5 -; RISCV32-NEXT: mulhu a6, a0, a3 +; RISCV32-NEXT: mul a6, a1, a2 +; RISCV32-NEXT: mulhu a7, a0, a3 ; RISCV32-NEXT: mulhu t0, a1, a2 -; RISCV32-NEXT: add a6, a6, t0 -; RISCV32-NEXT: add a5, a6, a5 -; RISCV32-NEXT: add a5, a5, a7 +; RISCV32-NEXT: add t1, a5, a4 +; RISCV32-NEXT: add a7, a7, t0 +; RISCV32-NEXT: add a4, t1, a6 +; RISCV32-NEXT: sltu a5, t1, a5 +; RISCV32-NEXT: sltu a6, a4, t1 +; RISCV32-NEXT: add a5, a7, a5 +; RISCV32-NEXT: add a5, a5, a6 ; RISCV32-NEXT: mul a6, a1, a3 ; RISCV32-NEXT: add a5, a5, a6 ; RISCV32-NEXT: bgez a1, .LBB0_2 @@ -34,9 +34,9 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind { ; RISCV32-NEXT: .LBB0_4: ; RISCV32-NEXT: slli a5, a5, 30 ; RISCV32-NEXT: srli a1, a4, 2 -; RISCV32-NEXT: or a1, a5, a1 ; RISCV32-NEXT: slli a4, a4, 30 ; RISCV32-NEXT: mul a0, a0, a2 +; RISCV32-NEXT: or a1, a5, a1 ; RISCV32-NEXT: srli a0, a0, 2 ; RISCV32-NEXT: or a0, a4, a0 ; RISCV32-NEXT: ret @@ -49,8 +49,8 @@ define { i32, i32, i1 } @addcarry_2x32(i32 %x0, i32 %x1, i32 %y0, i32 %y1) nounw ; RISCV32-LABEL: addcarry_2x32: ; RISCV32: # %bb.0: ; RISCV32-NEXT: add a3, a1, a3 -; RISCV32-NEXT: sltu a1, a3, a1 ; RISCV32-NEXT: add a4, a2, a4 +; RISCV32-NEXT: sltu a1, a3, a1 ; RISCV32-NEXT: sltu a2, a4, a2 ; RISCV32-NEXT: add a1, a4, a1 ; RISCV32-NEXT: sltu a4, a1, a4 diff --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll index a18526718461e..8e445511b6119 100644 --- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll +++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll @@ -53,16 +53,16 @@ define i64 @add_mul_combine_accept_a3(i64 %x) { ; 
RV32IMB-LABEL: add_mul_combine_accept_a3: ; RV32IMB: # %bb.0: ; RV32IMB-NEXT: li a2, 29 -; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: sh1add a3, a1, a1 ; RV32IMB-NEXT: slli a1, a1, 5 ; RV32IMB-NEXT: sub a1, a1, a3 -; RV32IMB-NEXT: add a1, a2, a1 -; RV32IMB-NEXT: sh1add a2, a0, a0 +; RV32IMB-NEXT: sh1add a3, a0, a0 +; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: slli a0, a0, 5 -; RV32IMB-NEXT: sub a2, a0, a2 -; RV32IMB-NEXT: addi a0, a2, 1073 -; RV32IMB-NEXT: sltu a2, a0, a2 +; RV32IMB-NEXT: sub a3, a0, a3 +; RV32IMB-NEXT: add a1, a2, a1 +; RV32IMB-NEXT: addi a0, a3, 1073 +; RV32IMB-NEXT: sltu a2, a0, a3 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret ; @@ -132,18 +132,18 @@ define i64 @add_mul_combine_accept_b3(i64 %x) { ; RV32IMB-LABEL: add_mul_combine_accept_b3: ; RV32IMB: # %bb.0: ; RV32IMB-NEXT: li a2, 23 -; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: sh3add a3, a1, a1 ; RV32IMB-NEXT: slli a1, a1, 5 ; RV32IMB-NEXT: sub a1, a1, a3 -; RV32IMB-NEXT: add a1, a2, a1 -; RV32IMB-NEXT: sh3add a2, a0, a0 +; RV32IMB-NEXT: sh3add a3, a0, a0 +; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: slli a0, a0, 5 -; RV32IMB-NEXT: sub a2, a0, a2 +; RV32IMB-NEXT: sub a3, a0, a3 ; RV32IMB-NEXT: lui a0, 50 ; RV32IMB-NEXT: addi a0, a0, 1119 -; RV32IMB-NEXT: add a0, a2, a0 -; RV32IMB-NEXT: sltu a2, a0, a2 +; RV32IMB-NEXT: add a1, a2, a1 +; RV32IMB-NEXT: add a0, a3, a0 +; RV32IMB-NEXT: sltu a2, a0, a3 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret ; @@ -207,18 +207,18 @@ define i64 @add_mul_combine_reject_a3(i64 %x) { ; RV32IMB-LABEL: add_mul_combine_reject_a3: ; RV32IMB: # %bb.0: ; RV32IMB-NEXT: li a2, 29 -; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: sh1add a3, a1, a1 ; RV32IMB-NEXT: slli a1, a1, 5 ; RV32IMB-NEXT: sub a1, a1, a3 -; RV32IMB-NEXT: add a1, a2, a1 -; RV32IMB-NEXT: sh1add a2, a0, a0 +; RV32IMB-NEXT: sh1add a3, a0, a0 +; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: slli a0, a0, 5 -; RV32IMB-NEXT: sub a2, a0, a2 +; RV32IMB-NEXT: sub a3, a0, a3 ; RV32IMB-NEXT: lui a0, 14 ; RV32IMB-NEXT: addi a0, a0, -185 -; RV32IMB-NEXT: add a0, a2, a0 -; RV32IMB-NEXT: sltu a2, a0, a2 +; RV32IMB-NEXT: add a1, a2, a1 +; RV32IMB-NEXT: add a0, a3, a0 +; RV32IMB-NEXT: sltu a2, a0, a3 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret ; @@ -278,16 +278,16 @@ define i64 @add_mul_combine_reject_c3(i64 %x) { ; RV32IMB-LABEL: add_mul_combine_reject_c3: ; RV32IMB: # %bb.0: ; RV32IMB-NEXT: li a2, 73 -; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: sh3add a3, a1, a1 ; RV32IMB-NEXT: sh3add a1, a3, a1 -; RV32IMB-NEXT: add a1, a2, a1 -; RV32IMB-NEXT: sh3add a2, a0, a0 -; RV32IMB-NEXT: sh3add a2, a2, a0 +; RV32IMB-NEXT: sh3add a3, a0, a0 +; RV32IMB-NEXT: mulhu a2, a0, a2 +; RV32IMB-NEXT: sh3add a3, a3, a0 ; RV32IMB-NEXT: lui a0, 18 ; RV32IMB-NEXT: addi a0, a0, -728 -; RV32IMB-NEXT: add a0, a2, a0 -; RV32IMB-NEXT: sltu a2, a0, a2 +; RV32IMB-NEXT: add a1, a2, a1 +; RV32IMB-NEXT: add a0, a3, a0 +; RV32IMB-NEXT: sltu a2, a0, a3 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret ; @@ -344,16 +344,16 @@ define i64 @add_mul_combine_reject_d3(i64 %x) { ; RV32IMB-LABEL: add_mul_combine_reject_d3: ; RV32IMB: # %bb.0: ; RV32IMB-NEXT: li a2, 192 -; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: sh1add a1, a1, a1 +; RV32IMB-NEXT: mulhu a2, a0, a2 +; RV32IMB-NEXT: sh1add a0, a0, a0 ; RV32IMB-NEXT: slli a1, a1, 6 ; RV32IMB-NEXT: add a1, a2, a1 -; RV32IMB-NEXT: sh1add a0, a0, a0 -; RV32IMB-NEXT: slli a2, a0, 6 -; RV32IMB-NEXT: lui a0, 47 -; RV32IMB-NEXT: addi a0, a0, -512 -; RV32IMB-NEXT: add a0, a2, a0 -; RV32IMB-NEXT: sltu 
a2, a0, a2 +; RV32IMB-NEXT: lui a2, 47 +; RV32IMB-NEXT: slli a3, a0, 6 +; RV32IMB-NEXT: addi a0, a2, -512 +; RV32IMB-NEXT: add a0, a3, a0 +; RV32IMB-NEXT: sltu a2, a0, a3 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret ; @@ -414,18 +414,18 @@ define i64 @add_mul_combine_reject_e3(i64 %x) { ; RV32IMB-LABEL: add_mul_combine_reject_e3: ; RV32IMB: # %bb.0: ; RV32IMB-NEXT: li a2, 29 -; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: sh1add a3, a1, a1 ; RV32IMB-NEXT: slli a1, a1, 5 ; RV32IMB-NEXT: sub a1, a1, a3 -; RV32IMB-NEXT: add a1, a2, a1 -; RV32IMB-NEXT: sh1add a2, a0, a0 +; RV32IMB-NEXT: sh1add a3, a0, a0 +; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: slli a0, a0, 5 -; RV32IMB-NEXT: sub a2, a0, a2 +; RV32IMB-NEXT: sub a3, a0, a3 ; RV32IMB-NEXT: lui a0, 14 ; RV32IMB-NEXT: addi a0, a0, -185 -; RV32IMB-NEXT: add a0, a2, a0 -; RV32IMB-NEXT: sltu a2, a0, a2 +; RV32IMB-NEXT: add a1, a2, a1 +; RV32IMB-NEXT: add a0, a3, a0 +; RV32IMB-NEXT: sltu a2, a0, a3 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret ; @@ -491,18 +491,18 @@ define i64 @add_mul_combine_reject_f3(i64 %x) { ; RV32IMB-LABEL: add_mul_combine_reject_f3: ; RV32IMB: # %bb.0: ; RV32IMB-NEXT: li a2, 29 -; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: sh1add a3, a1, a1 ; RV32IMB-NEXT: slli a1, a1, 5 ; RV32IMB-NEXT: sub a1, a1, a3 -; RV32IMB-NEXT: add a1, a2, a1 -; RV32IMB-NEXT: sh1add a2, a0, a0 +; RV32IMB-NEXT: sh1add a3, a0, a0 +; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: slli a0, a0, 5 -; RV32IMB-NEXT: sub a2, a0, a2 +; RV32IMB-NEXT: sub a3, a0, a3 ; RV32IMB-NEXT: lui a0, 14 ; RV32IMB-NEXT: addi a0, a0, -145 -; RV32IMB-NEXT: add a0, a2, a0 -; RV32IMB-NEXT: sltu a2, a0, a2 +; RV32IMB-NEXT: add a1, a2, a1 +; RV32IMB-NEXT: add a0, a3, a0 +; RV32IMB-NEXT: sltu a2, a0, a3 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret ; @@ -565,16 +565,16 @@ define i64 @add_mul_combine_reject_g3(i64 %x) { ; RV32IMB-LABEL: add_mul_combine_reject_g3: ; RV32IMB: # %bb.0: ; RV32IMB-NEXT: li a2, 73 -; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: sh3add a3, a1, a1 ; RV32IMB-NEXT: sh3add a1, a3, a1 -; RV32IMB-NEXT: add a1, a2, a1 -; RV32IMB-NEXT: sh3add a2, a0, a0 -; RV32IMB-NEXT: sh3add a2, a2, a0 +; RV32IMB-NEXT: sh3add a3, a0, a0 +; RV32IMB-NEXT: mulhu a2, a0, a2 +; RV32IMB-NEXT: sh3add a3, a3, a0 ; RV32IMB-NEXT: lui a0, 2 ; RV32IMB-NEXT: addi a0, a0, -882 -; RV32IMB-NEXT: add a0, a2, a0 -; RV32IMB-NEXT: sltu a2, a0, a2 +; RV32IMB-NEXT: add a1, a2, a1 +; RV32IMB-NEXT: add a0, a3, a0 +; RV32IMB-NEXT: sltu a2, a0, a3 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret ; @@ -595,15 +595,15 @@ define i64 @add_mul_combine_infinite_loop(i64 %x) { ; RV32IMB-LABEL: add_mul_combine_infinite_loop: ; RV32IMB: # %bb.0: ; RV32IMB-NEXT: li a2, 24 -; RV32IMB-NEXT: mulhu a2, a0, a2 ; RV32IMB-NEXT: sh1add a1, a1, a1 -; RV32IMB-NEXT: sh3add a1, a1, a2 -; RV32IMB-NEXT: sh1add a0, a0, a0 -; RV32IMB-NEXT: slli a2, a0, 3 -; RV32IMB-NEXT: li a3, 1 -; RV32IMB-NEXT: slli a3, a3, 11 -; RV32IMB-NEXT: sh3add a0, a0, a3 -; RV32IMB-NEXT: sltu a2, a0, a2 +; RV32IMB-NEXT: sh1add a3, a0, a0 +; RV32IMB-NEXT: mulhu a0, a0, a2 +; RV32IMB-NEXT: li a2, 1 +; RV32IMB-NEXT: sh3add a1, a1, a0 +; RV32IMB-NEXT: slli a4, a3, 3 +; RV32IMB-NEXT: slli a2, a2, 11 +; RV32IMB-NEXT: sh3add a0, a3, a2 +; RV32IMB-NEXT: sltu a2, a0, a4 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret ; @@ -672,10 +672,10 @@ define i64 @mul3000_add8990_c(i64 %x) { ; RV32IMB-NEXT: addi a2, a2, -1096 ; RV32IMB-NEXT: mul a1, a1, a2 ; RV32IMB-NEXT: mulhu a3, a0, a2 -; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: mul 
a2, a0, a2 ; RV32IMB-NEXT: lui a0, 2 ; RV32IMB-NEXT: addi a0, a0, 798 +; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: add a0, a2, a0 ; RV32IMB-NEXT: sltu a2, a0, a2 ; RV32IMB-NEXT: add a1, a1, a2 @@ -747,10 +747,10 @@ define i64 @mul3000_sub8990_c(i64 %x) { ; RV32IMB-NEXT: addi a2, a2, -1096 ; RV32IMB-NEXT: mul a1, a1, a2 ; RV32IMB-NEXT: mulhu a3, a0, a2 -; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: mul a2, a0, a2 ; RV32IMB-NEXT: lui a0, 1048574 ; RV32IMB-NEXT: addi a0, a0, -798 +; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: add a0, a2, a0 ; RV32IMB-NEXT: sltu a2, a0, a2 ; RV32IMB-NEXT: add a1, a1, a2 @@ -823,12 +823,12 @@ define i64 @mulneg3000_add8990_c(i64 %x) { ; RV32IMB-NEXT: addi a2, a2, 1096 ; RV32IMB-NEXT: mul a1, a1, a2 ; RV32IMB-NEXT: mulhu a3, a0, a2 -; RV32IMB-NEXT: sub a3, a3, a0 -; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: mul a2, a0, a2 +; RV32IMB-NEXT: sub a3, a3, a0 ; RV32IMB-NEXT: lui a0, 2 ; RV32IMB-NEXT: addi a0, a0, 798 ; RV32IMB-NEXT: add a0, a2, a0 +; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: sltu a2, a0, a2 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret @@ -899,12 +899,12 @@ define i64 @mulneg3000_sub8990_c(i64 %x) { ; RV32IMB-NEXT: addi a2, a2, 1096 ; RV32IMB-NEXT: mul a1, a1, a2 ; RV32IMB-NEXT: mulhu a3, a0, a2 -; RV32IMB-NEXT: sub a3, a3, a0 -; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: mul a2, a0, a2 +; RV32IMB-NEXT: sub a3, a3, a0 ; RV32IMB-NEXT: lui a0, 1048574 ; RV32IMB-NEXT: addi a0, a0, -798 ; RV32IMB-NEXT: add a0, a2, a0 +; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: sltu a2, a0, a2 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: addi a1, a1, -1 diff --git a/llvm/test/CodeGen/RISCV/alu16.ll b/llvm/test/CodeGen/RISCV/alu16.ll index cb28ccdda0a54..41f26526ef03e 100644 --- a/llvm/test/CodeGen/RISCV/alu16.ll +++ b/llvm/test/CodeGen/RISCV/alu16.ll @@ -254,8 +254,8 @@ define i16 @slt(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: slt: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: slt a0, a0, a1 ; RV32I-NEXT: ret @@ -263,8 +263,8 @@ define i16 @slt(i16 %a, i16 %b) nounwind { ; RV64I-LABEL: slt: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: slt a0, a0, a1 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/alu8.ll b/llvm/test/CodeGen/RISCV/alu8.ll index d563525be7a35..6ae96e7c9deae 100644 --- a/llvm/test/CodeGen/RISCV/alu8.ll +++ b/llvm/test/CodeGen/RISCV/alu8.ll @@ -252,8 +252,8 @@ define i8 @slt(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: slt: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: slt a0, a0, a1 ; RV32I-NEXT: ret @@ -261,8 +261,8 @@ define i8 @slt(i8 %a, i8 %b) nounwind { ; RV64I-LABEL: slt: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 56 -; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: slt a0, a0, a1 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/and.ll b/llvm/test/CodeGen/RISCV/and.ll index 79e3b954c50d8..31c63c7f9b18f 100644 --- a/llvm/test/CodeGen/RISCV/and.ll +++ b/llvm/test/CodeGen/RISCV/and.ll @@ -124,8 +124,8 @@ define i64 @and64_0x7ffffffffffff000(i64 %x) { ; RV32I-LABEL: and64_0x7ffffffffffff000: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 
1048575 -; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll index 234a956be809e..741860db13957 100644 --- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll +++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll @@ -99,10 +99,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v ; RV32IA-NEXT: andi a3, a0, -4 ; RV32IA-NEXT: slli a4, a0, 3 ; RV32IA-NEXT: li a0, 255 -; RV32IA-NEXT: sll a0, a0, a4 ; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a4 ; RV32IA-NEXT: andi a2, a2, 255 +; RV32IA-NEXT: sll a0, a0, a4 +; RV32IA-NEXT: sll a1, a1, a4 ; RV32IA-NEXT: sll a2, a2, a4 ; RV32IA-NEXT: .LBB2_1: # %do_cmpxchg ; RV32IA-NEXT: # =>This Loop Header: Depth=1 @@ -129,10 +129,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v ; RV32IA-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-ZACAS-NEXT: slli a4, a0, 3 ; RV32IA-ZACAS-NEXT: li a0, 255 -; RV32IA-ZACAS-NEXT: sll a0, a0, a4 ; RV32IA-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-ZACAS-NEXT: sll a1, a1, a4 ; RV32IA-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-ZACAS-NEXT: sll a0, a0, a4 +; RV32IA-ZACAS-NEXT: sll a1, a1, a4 ; RV32IA-ZACAS-NEXT: sll a2, a2, a4 ; RV32IA-ZACAS-NEXT: .LBB2_1: # %do_cmpxchg ; RV32IA-ZACAS-NEXT: # =>This Loop Header: Depth=1 @@ -159,10 +159,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v ; RV64IA-NEXT: andi a3, a0, -4 ; RV64IA-NEXT: slli a4, a0, 3 ; RV64IA-NEXT: li a0, 255 -; RV64IA-NEXT: sllw a0, a0, a4 ; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a4 ; RV64IA-NEXT: andi a2, a2, 255 +; RV64IA-NEXT: sllw a0, a0, a4 +; RV64IA-NEXT: sllw a1, a1, a4 ; RV64IA-NEXT: sllw a2, a2, a4 ; RV64IA-NEXT: .LBB2_1: # %do_cmpxchg ; RV64IA-NEXT: # =>This Loop Header: Depth=1 @@ -189,10 +189,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v ; RV64IA-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-ZACAS-NEXT: slli a4, a0, 3 ; RV64IA-ZACAS-NEXT: li a0, 255 -; RV64IA-ZACAS-NEXT: sllw a0, a0, a4 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-ZACAS-NEXT: sllw a1, a1, a4 ; RV64IA-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-ZACAS-NEXT: sllw a0, a0, a4 +; RV64IA-ZACAS-NEXT: sllw a1, a1, a4 ; RV64IA-ZACAS-NEXT: sllw a2, a2, a4 ; RV64IA-ZACAS-NEXT: .LBB2_1: # %do_cmpxchg ; RV64IA-ZACAS-NEXT: # =>This Loop Header: Depth=1 @@ -240,10 +240,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v ; RV32IA-NEXT: andi a3, a0, -4 ; RV32IA-NEXT: slli a4, a0, 3 ; RV32IA-NEXT: li a0, 255 -; RV32IA-NEXT: sll a0, a0, a4 ; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a4 ; RV32IA-NEXT: andi a2, a2, 255 +; RV32IA-NEXT: sll a0, a0, a4 +; RV32IA-NEXT: sll a1, a1, a4 ; RV32IA-NEXT: sll a2, a2, a4 ; RV32IA-NEXT: .LBB3_1: # %do_cmpxchg ; RV32IA-NEXT: # =>This Loop Header: Depth=1 @@ -273,10 +273,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v ; RV32IA-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-ZACAS-NEXT: slli a4, a0, 3 ; RV32IA-ZACAS-NEXT: li a0, 255 -; RV32IA-ZACAS-NEXT: sll a0, a0, a4 ; RV32IA-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-ZACAS-NEXT: sll a1, a1, a4 ; RV32IA-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-ZACAS-NEXT: sll a0, a0, a4 +; RV32IA-ZACAS-NEXT: sll a1, a1, a4 ; RV32IA-ZACAS-NEXT: sll a2, a2, a4 ; RV32IA-ZACAS-NEXT: 
.LBB3_1: # %do_cmpxchg ; RV32IA-ZACAS-NEXT: # =>This Loop Header: Depth=1 @@ -306,10 +306,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v ; RV64IA-NEXT: andi a3, a0, -4 ; RV64IA-NEXT: slli a4, a0, 3 ; RV64IA-NEXT: li a0, 255 -; RV64IA-NEXT: sllw a0, a0, a4 ; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a4 ; RV64IA-NEXT: andi a2, a2, 255 +; RV64IA-NEXT: sllw a0, a0, a4 +; RV64IA-NEXT: sllw a1, a1, a4 ; RV64IA-NEXT: sllw a2, a2, a4 ; RV64IA-NEXT: .LBB3_1: # %do_cmpxchg ; RV64IA-NEXT: # =>This Loop Header: Depth=1 @@ -339,10 +339,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v ; RV64IA-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-ZACAS-NEXT: slli a4, a0, 3 ; RV64IA-ZACAS-NEXT: li a0, 255 -; RV64IA-ZACAS-NEXT: sllw a0, a0, a4 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-ZACAS-NEXT: sllw a1, a1, a4 ; RV64IA-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-ZACAS-NEXT: sllw a0, a0, a4 +; RV64IA-ZACAS-NEXT: sllw a1, a1, a4 ; RV64IA-ZACAS-NEXT: sllw a2, a2, a4 ; RV64IA-ZACAS-NEXT: .LBB3_1: # %do_cmpxchg ; RV64IA-ZACAS-NEXT: # =>This Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll index a87b49e61a8db..c3b972840377f 100644 --- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll @@ -43,10 +43,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind ; RV32IA-NEXT: andi a3, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 ; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: andi a2, a2, 255 +; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a0, a2, a0 ; RV32IA-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a2, (a3) @@ -79,10 +79,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind ; RV64IA-WMO-NEXT: andi a3, a0, -4 ; RV64IA-WMO-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NEXT: li a4, 255 -; RV64IA-WMO-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-NEXT: andi a1, a1, 255 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: andi a2, a2, 255 +; RV64IA-WMO-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w a2, (a3) @@ -102,10 +102,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind ; RV64IA-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a4, 255 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-ZACAS-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w a2, (a3) @@ -130,10 +130,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind ; RV64IA-TSO-NEXT: andi a3, a0, -4 ; RV64IA-TSO-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NEXT: li a4, 255 -; RV64IA-TSO-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-NEXT: andi a1, a1, 255 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: andi a2, a2, 255 +; RV64IA-TSO-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -170,10 
+170,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-NEXT: andi a3, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: andi a2, a2, 255 +; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -193,10 +193,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-WMO-ZACAS-NEXT: li a4, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -216,10 +216,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-NEXT: andi a3, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 -; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: andi a2, a2, 255 +; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -239,10 +239,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-TSO-ZACAS-NEXT: li a4, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-TSO-ZACAS-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -275,10 +275,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-NEXT: andi a3, a0, -4 ; RV64IA-WMO-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NEXT: li a4, 255 -; RV64IA-WMO-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-NEXT: andi a1, a1, 255 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: andi a2, a2, 255 +; RV64IA-WMO-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -298,10 +298,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -326,10 +326,10 @@ define void 
@cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-NEXT: andi a3, a0, -4 ; RV64IA-TSO-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NEXT: li a4, 255 -; RV64IA-TSO-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-NEXT: andi a1, a1, 255 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: andi a2, a2, 255 +; RV64IA-TSO-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -349,10 +349,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -394,10 +394,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-NEXT: andi a3, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: andi a2, a2, 255 +; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -417,10 +417,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-WMO-ZACAS-NEXT: li a4, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -440,10 +440,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-NEXT: andi a3, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 -; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: andi a2, a2, 255 +; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -463,10 +463,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-TSO-ZACAS-NEXT: li a4, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-TSO-ZACAS-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -499,10 +499,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) 
nounwind { ; RV64IA-WMO-NEXT: andi a3, a0, -4 ; RV64IA-WMO-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NEXT: li a4, 255 -; RV64IA-WMO-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-NEXT: andi a1, a1, 255 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: andi a2, a2, 255 +; RV64IA-WMO-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -522,10 +522,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -550,10 +550,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-NEXT: andi a3, a0, -4 ; RV64IA-TSO-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NEXT: li a4, 255 -; RV64IA-TSO-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-NEXT: andi a1, a1, 255 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: andi a2, a2, 255 +; RV64IA-TSO-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -573,10 +573,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -618,10 +618,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-NEXT: andi a3, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: andi a2, a2, 255 +; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w a2, (a3) @@ -641,10 +641,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-WMO-ZACAS-NEXT: li a4, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w a2, (a3) @@ -664,10 +664,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-NEXT: andi a3, a0, -4 ; 
RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 -; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: andi a2, a2, 255 +; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -687,10 +687,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-TSO-ZACAS-NEXT: li a4, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-TSO-ZACAS-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -723,10 +723,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-NEXT: andi a3, a0, -4 ; RV64IA-WMO-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NEXT: li a4, 255 -; RV64IA-WMO-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-NEXT: andi a1, a1, 255 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: andi a2, a2, 255 +; RV64IA-WMO-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w a2, (a3) @@ -746,10 +746,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w a2, (a3) @@ -774,10 +774,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-NEXT: andi a3, a0, -4 ; RV64IA-TSO-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NEXT: li a4, 255 -; RV64IA-TSO-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-NEXT: andi a1, a1, 255 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: andi a2, a2, 255 +; RV64IA-TSO-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -797,10 +797,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -842,10 +842,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-NEXT: andi a3, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: 
li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: andi a2, a2, 255 +; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -865,10 +865,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-WMO-ZACAS-NEXT: li a4, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -888,10 +888,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-NEXT: andi a3, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 -; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: andi a2, a2, 255 +; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -911,10 +911,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-TSO-ZACAS-NEXT: li a4, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-TSO-ZACAS-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -947,10 +947,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-NEXT: andi a3, a0, -4 ; RV64IA-WMO-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NEXT: li a4, 255 -; RV64IA-WMO-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-NEXT: andi a1, a1, 255 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: andi a2, a2, 255 +; RV64IA-WMO-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -970,10 +970,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -998,10 +998,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-NEXT: andi a3, a0, -4 ; RV64IA-TSO-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NEXT: li a4, 255 -; RV64IA-TSO-NEXT: sllw a4, a4, a0 ; 
RV64IA-TSO-NEXT: andi a1, a1, 255 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: andi a2, a2, 255 +; RV64IA-TSO-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -1021,10 +1021,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -1066,10 +1066,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-NEXT: andi a3, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: andi a2, a2, 255 +; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -1089,10 +1089,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-WMO-ZACAS-NEXT: li a4, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -1112,10 +1112,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-NEXT: andi a3, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 -; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: andi a2, a2, 255 +; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -1135,10 +1135,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-TSO-ZACAS-NEXT: li a4, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-TSO-ZACAS-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -1171,10 +1171,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-NEXT: andi a3, a0, -4 ; RV64IA-WMO-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NEXT: li a4, 255 -; RV64IA-WMO-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-NEXT: andi a1, a1, 255 -; 
RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: andi a2, a2, 255 +; RV64IA-WMO-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -1194,10 +1194,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -1222,10 +1222,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-NEXT: andi a3, a0, -4 ; RV64IA-TSO-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NEXT: li a4, 255 -; RV64IA-TSO-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-NEXT: andi a1, a1, 255 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: andi a2, a2, 255 +; RV64IA-TSO-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -1245,10 +1245,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -1290,10 +1290,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-NEXT: andi a3, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: andi a2, a2, 255 +; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -1313,10 +1313,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-WMO-ZACAS-NEXT: li a4, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-WMO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -1336,10 +1336,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-NEXT: andi a3, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 -; RV32IA-TSO-NEXT: sll a1, a1, a0 
; RV32IA-TSO-NEXT: andi a2, a2, 255 +; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -1359,10 +1359,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV32IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV32IA-TSO-ZACAS-NEXT: li a4, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV32IA-TSO-ZACAS-NEXT: sll a4, a4, a0 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-TSO-ZACAS-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -1395,10 +1395,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-NEXT: andi a3, a0, -4 ; RV64IA-WMO-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NEXT: li a4, 255 -; RV64IA-WMO-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-NEXT: andi a1, a1, 255 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: andi a2, a2, 255 +; RV64IA-WMO-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -1418,10 +1418,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -1446,10 +1446,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-NEXT: andi a3, a0, -4 ; RV64IA-TSO-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NEXT: li a4, 255 -; RV64IA-TSO-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-NEXT: andi a1, a1, 255 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: andi a2, a2, 255 +; RV64IA-TSO-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -1469,10 +1469,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -1514,10 +1514,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-NEXT: andi a3, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 ; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: andi a2, a2, 255 +; RV32IA-NEXT: sll a4, a4, a0 
+; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a0, a2, a0 ; RV32IA-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a2, (a3) @@ -1550,10 +1550,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-NEXT: andi a3, a0, -4 ; RV64IA-WMO-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NEXT: li a4, 255 -; RV64IA-WMO-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-NEXT: andi a1, a1, 255 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: andi a2, a2, 255 +; RV64IA-WMO-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aqrl a2, (a3) @@ -1573,10 +1573,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a4, 255 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-ZACAS-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a2, (a3) @@ -1601,10 +1601,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-NEXT: andi a3, a0, -4 ; RV64IA-TSO-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NEXT: li a4, 255 -; RV64IA-TSO-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-NEXT: andi a1, a1, 255 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: andi a2, a2, 255 +; RV64IA-TSO-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w.aqrl a2, (a3) @@ -1646,10 +1646,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-NEXT: andi a3, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 ; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: andi a2, a2, 255 +; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a0, a2, a0 ; RV32IA-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a2, (a3) @@ -1682,10 +1682,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-NEXT: andi a3, a0, -4 ; RV64IA-WMO-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NEXT: li a4, 255 -; RV64IA-WMO-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-NEXT: andi a1, a1, 255 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: andi a2, a2, 255 +; RV64IA-WMO-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aqrl a2, (a3) @@ -1705,10 +1705,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a4, 255 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-ZACAS-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a2, (a3) @@ -1733,10 +1733,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr 
%ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-NEXT: andi a3, a0, -4 ; RV64IA-TSO-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NEXT: li a4, 255 -; RV64IA-TSO-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-NEXT: andi a1, a1, 255 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: andi a2, a2, 255 +; RV64IA-TSO-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w.aqrl a2, (a3) @@ -1778,10 +1778,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV32IA-NEXT: andi a3, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 ; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: andi a2, a2, 255 +; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a0, a2, a0 ; RV32IA-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a2, (a3) @@ -1814,10 +1814,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-WMO-NEXT: andi a3, a0, -4 ; RV64IA-WMO-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NEXT: li a4, 255 -; RV64IA-WMO-NEXT: sllw a4, a4, a0 ; RV64IA-WMO-NEXT: andi a1, a1, 255 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: andi a2, a2, 255 +; RV64IA-WMO-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aqrl a2, (a3) @@ -1837,10 +1837,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-ZACAS-NEXT: andi a3, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a4, 255 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 -; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: andi a2, a2, 255 +; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-ZACAS-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a2, (a3) @@ -1866,10 +1866,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind { ; RV64IA-TSO-NEXT: andi a3, a0, -4 ; RV64IA-TSO-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NEXT: li a4, 255 -; RV64IA-TSO-NEXT: sllw a4, a4, a0 ; RV64IA-TSO-NEXT: andi a1, a1, 255 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: andi a2, a2, 255 +; RV64IA-TSO-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w.aqrl a2, (a3) @@ -1915,8 +1915,8 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw ; RV32IA-NEXT: addi a4, a4, -1 ; RV32IA-NEXT: sll a5, a4, a0 ; RV32IA-NEXT: and a1, a1, a4 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: and a2, a2, a4 +; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a0, a2, a0 ; RV32IA-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a2, (a3) @@ -1952,8 +1952,8 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw ; RV64IA-WMO-NEXT: addi a4, a4, -1 ; RV64IA-WMO-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-NEXT: and a1, a1, a4 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: and a2, a2, a4 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w a2, (a3) @@ -1976,8 +1976,8 @@ 
define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw ; RV64IA-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-ZACAS-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w a2, (a3) @@ -2005,8 +2005,8 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw ; RV64IA-TSO-NEXT: addi a4, a4, -1 ; RV64IA-TSO-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-NEXT: and a1, a1, a4 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: and a2, a2, a4 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -2046,8 +2046,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-WMO-NEXT: addi a4, a4, -1 ; RV32IA-WMO-NEXT: sll a5, a4, a0 ; RV32IA-WMO-NEXT: and a1, a1, a4 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: and a2, a2, a4 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -2070,8 +2070,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV32IA-WMO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -2094,8 +2094,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-TSO-NEXT: addi a4, a4, -1 ; RV32IA-TSO-NEXT: sll a5, a4, a0 ; RV32IA-TSO-NEXT: and a1, a1, a4 -; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: and a2, a2, a4 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -2118,8 +2118,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; RV32IA-TSO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-TSO-ZACAS-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -2155,8 +2155,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-WMO-NEXT: addi a4, a4, -1 ; RV64IA-WMO-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-NEXT: and a1, a1, a4 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: and a2, a2, a4 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -2179,8 +2179,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-WMO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-WMO-ZACAS-NEXT: 
sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -2208,8 +2208,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-TSO-NEXT: addi a4, a4, -1 ; RV64IA-TSO-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-NEXT: and a1, a1, a4 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: and a2, a2, a4 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -2232,8 +2232,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-TSO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -2278,8 +2278,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-WMO-NEXT: addi a4, a4, -1 ; RV32IA-WMO-NEXT: sll a5, a4, a0 ; RV32IA-WMO-NEXT: and a1, a1, a4 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: and a2, a2, a4 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -2302,8 +2302,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV32IA-WMO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -2326,8 +2326,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-TSO-NEXT: addi a4, a4, -1 ; RV32IA-TSO-NEXT: sll a5, a4, a0 ; RV32IA-TSO-NEXT: and a1, a1, a4 -; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: and a2, a2, a4 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -2350,8 +2350,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; RV32IA-TSO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-TSO-ZACAS-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -2387,8 +2387,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-WMO-NEXT: addi a4, a4, -1 ; RV64IA-WMO-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-NEXT: and a1, a1, a4 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: and a2, a2, a4 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -2411,8 +2411,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; 
RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-WMO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -2440,8 +2440,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-TSO-NEXT: addi a4, a4, -1 ; RV64IA-TSO-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-NEXT: and a1, a1, a4 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: and a2, a2, a4 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -2464,8 +2464,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-TSO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -2510,8 +2510,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-WMO-NEXT: addi a4, a4, -1 ; RV32IA-WMO-NEXT: sll a5, a4, a0 ; RV32IA-WMO-NEXT: and a1, a1, a4 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: and a2, a2, a4 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w a2, (a3) @@ -2534,8 +2534,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV32IA-WMO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w a2, (a3) @@ -2558,8 +2558,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-TSO-NEXT: addi a4, a4, -1 ; RV32IA-TSO-NEXT: sll a5, a4, a0 ; RV32IA-TSO-NEXT: and a1, a1, a4 -; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: and a2, a2, a4 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -2582,8 +2582,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; RV32IA-TSO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-TSO-ZACAS-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -2619,8 +2619,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-WMO-NEXT: addi a4, a4, -1 ; RV64IA-WMO-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-NEXT: and a1, a1, a4 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: and a2, a2, a4 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw 
a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w a2, (a3) @@ -2643,8 +2643,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-WMO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w a2, (a3) @@ -2672,8 +2672,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-TSO-NEXT: addi a4, a4, -1 ; RV64IA-TSO-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-NEXT: and a1, a1, a4 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: and a2, a2, a4 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -2696,8 +2696,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-TSO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -2742,8 +2742,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-WMO-NEXT: addi a4, a4, -1 ; RV32IA-WMO-NEXT: sll a5, a4, a0 ; RV32IA-WMO-NEXT: and a1, a1, a4 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: and a2, a2, a4 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -2766,8 +2766,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV32IA-WMO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -2790,8 +2790,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-TSO-NEXT: addi a4, a4, -1 ; RV32IA-TSO-NEXT: sll a5, a4, a0 ; RV32IA-TSO-NEXT: and a1, a1, a4 -; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: and a2, a2, a4 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -2814,8 +2814,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; RV32IA-TSO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-TSO-ZACAS-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -2851,8 +2851,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-WMO-NEXT: 
addi a4, a4, -1 ; RV64IA-WMO-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-NEXT: and a1, a1, a4 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: and a2, a2, a4 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -2875,8 +2875,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-WMO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -2904,8 +2904,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-TSO-NEXT: addi a4, a4, -1 ; RV64IA-TSO-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-NEXT: and a1, a1, a4 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: and a2, a2, a4 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -2928,8 +2928,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-TSO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -2974,8 +2974,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-WMO-NEXT: addi a4, a4, -1 ; RV32IA-WMO-NEXT: sll a5, a4, a0 ; RV32IA-WMO-NEXT: and a1, a1, a4 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: and a2, a2, a4 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -2998,8 +2998,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV32IA-WMO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -3022,8 +3022,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-TSO-NEXT: addi a4, a4, -1 ; RV32IA-TSO-NEXT: sll a5, a4, a0 ; RV32IA-TSO-NEXT: and a1, a1, a4 -; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: and a2, a2, a4 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -3046,8 +3046,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; RV32IA-TSO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; 
RV32IA-TSO-ZACAS-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -3083,8 +3083,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-WMO-NEXT: addi a4, a4, -1 ; RV64IA-WMO-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-NEXT: and a1, a1, a4 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: and a2, a2, a4 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -3107,8 +3107,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-WMO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -3136,8 +3136,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-TSO-NEXT: addi a4, a4, -1 ; RV64IA-TSO-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-NEXT: and a1, a1, a4 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: and a2, a2, a4 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -3160,8 +3160,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-TSO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -3206,8 +3206,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-WMO-NEXT: addi a4, a4, -1 ; RV32IA-WMO-NEXT: sll a5, a4, a0 ; RV32IA-WMO-NEXT: and a1, a1, a4 -; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: and a2, a2, a4 +; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: sll a0, a2, a0 ; RV32IA-WMO-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -3230,8 +3230,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV32IA-WMO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-WMO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-WMO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-WMO-ZACAS-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -3254,8 +3254,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-TSO-NEXT: addi a4, a4, -1 ; RV32IA-TSO-NEXT: sll a5, a4, a0 ; RV32IA-TSO-NEXT: and a1, a1, a4 -; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: and a2, a2, a4 +; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: sll a0, a2, a0 ; RV32IA-TSO-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a2, (a3) @@ -3278,8 +3278,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; 
RV32IA-TSO-ZACAS-NEXT: sll a5, a4, a0 ; RV32IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV32IA-TSO-ZACAS-NEXT: sll a1, a1, a0 ; RV32IA-TSO-ZACAS-NEXT: sll a0, a2, a0 ; RV32IA-TSO-ZACAS-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -3315,8 +3315,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-WMO-NEXT: addi a4, a4, -1 ; RV64IA-WMO-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-NEXT: and a1, a1, a4 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: and a2, a2, a4 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aq a2, (a3) @@ -3339,8 +3339,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-WMO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a2, (a3) @@ -3368,8 +3368,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-TSO-NEXT: addi a4, a4, -1 ; RV64IA-TSO-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-NEXT: and a1, a1, a4 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: and a2, a2, a4 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w a2, (a3) @@ -3392,8 +3392,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-TSO-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a2, (a3) @@ -3438,8 +3438,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV32IA-NEXT: addi a4, a4, -1 ; RV32IA-NEXT: sll a5, a4, a0 ; RV32IA-NEXT: and a1, a1, a4 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: and a2, a2, a4 +; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a0, a2, a0 ; RV32IA-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a2, (a3) @@ -3475,8 +3475,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-WMO-NEXT: addi a4, a4, -1 ; RV64IA-WMO-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-NEXT: and a1, a1, a4 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: and a2, a2, a4 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aqrl a2, (a3) @@ -3499,8 +3499,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-ZACAS-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ; 
RV64IA-ZACAS-NEXT: lr.w.aqrl a2, (a3) @@ -3528,8 +3528,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin ; RV64IA-TSO-NEXT: addi a4, a4, -1 ; RV64IA-TSO-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-NEXT: and a1, a1, a4 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: and a2, a2, a4 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w.aqrl a2, (a3) @@ -3574,8 +3574,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-NEXT: addi a4, a4, -1 ; RV32IA-NEXT: sll a5, a4, a0 ; RV32IA-NEXT: and a1, a1, a4 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: and a2, a2, a4 +; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a0, a2, a0 ; RV32IA-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a2, (a3) @@ -3611,8 +3611,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-WMO-NEXT: addi a4, a4, -1 ; RV64IA-WMO-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-NEXT: and a1, a1, a4 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: and a2, a2, a4 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aqrl a2, (a3) @@ -3635,8 +3635,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-ZACAS-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a2, (a3) @@ -3664,8 +3664,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-TSO-NEXT: addi a4, a4, -1 ; RV64IA-TSO-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-NEXT: and a1, a1, a4 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: and a2, a2, a4 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w.aqrl a2, (a3) @@ -3710,8 +3710,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV32IA-NEXT: addi a4, a4, -1 ; RV32IA-NEXT: sll a5, a4, a0 ; RV32IA-NEXT: and a1, a1, a4 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: and a2, a2, a4 +; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a0, a2, a0 ; RV32IA-NEXT: .LBB19_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a2, (a3) @@ -3747,8 +3747,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-WMO-NEXT: addi a4, a4, -1 ; RV64IA-WMO-NEXT: sllw a5, a4, a0 ; RV64IA-WMO-NEXT: and a1, a1, a4 -; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: and a2, a2, a4 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NEXT: sllw a0, a2, a0 ; RV64IA-WMO-NEXT: .LBB19_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NEXT: lr.w.aqrl a2, (a3) @@ -3771,8 +3771,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-ZACAS-NEXT: addi a4, a4, -1 ; RV64IA-ZACAS-NEXT: sllw a5, a4, a0 ; RV64IA-ZACAS-NEXT: and a1, a1, a4 -; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: and a2, a2, a4 +; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: sllw a0, a2, a0 ; RV64IA-ZACAS-NEXT: .LBB19_1: # =>This Inner Loop Header: Depth=1 
; RV64IA-ZACAS-NEXT: lr.w.aqrl a2, (a3) @@ -3801,8 +3801,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind ; RV64IA-TSO-NEXT: addi a4, a4, -1 ; RV64IA-TSO-NEXT: sllw a5, a4, a0 ; RV64IA-TSO-NEXT: and a1, a1, a4 -; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: and a2, a2, a4 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NEXT: sllw a0, a2, a0 ; RV64IA-TSO-NEXT: .LBB19_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NEXT: lr.w.aqrl a2, (a3) diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll index 35a1227b86b3a..8534ad379ebab 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll @@ -191,10 +191,10 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: mv s0, a0 +; RV32-NEXT: mv s0, a2 +; RV32-NEXT: mv s1, a0 ; RV32-NEXT: lw a4, 0(a0) ; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: mv s1, a2 ; RV32-NEXT: mv s2, a1 ; RV32-NEXT: j .LBB11_2 ; RV32-NEXT: .LBB11_1: # %atomicrmw.start @@ -204,17 +204,17 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 -; RV32-NEXT: mv a0, s0 +; RV32-NEXT: mv a0, s1 ; RV32-NEXT: call __atomic_compare_exchange_8 ; RV32-NEXT: lw a4, 8(sp) ; RV32-NEXT: lw a5, 12(sp) ; RV32-NEXT: bnez a0, .LBB11_6 ; RV32-NEXT: .LBB11_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a5, s1, .LBB11_4 +; RV32-NEXT: beq a5, s0, .LBB11_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB11_2 Depth=1 -; RV32-NEXT: slt a0, s1, a5 +; RV32-NEXT: slt a0, s0, a5 ; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a3, a5 ; RV32-NEXT: bnez a0, .LBB11_1 @@ -227,7 +227,7 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: .LBB11_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB11_2 Depth=1 ; RV32-NEXT: mv a2, s2 -; RV32-NEXT: mv a3, s1 +; RV32-NEXT: mv a3, s0 ; RV32-NEXT: j .LBB11_1 ; RV32-NEXT: .LBB11_6: # %atomicrmw.end ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -267,10 +267,10 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: mv s0, a0 +; RV32-NEXT: mv s0, a2 +; RV32-NEXT: mv s1, a0 ; RV32-NEXT: lw a4, 0(a0) ; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: mv s1, a2 ; RV32-NEXT: mv s2, a1 ; RV32-NEXT: j .LBB13_2 ; RV32-NEXT: .LBB13_1: # %atomicrmw.start @@ -280,17 +280,17 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 -; RV32-NEXT: mv a0, s0 +; RV32-NEXT: mv a0, s1 ; RV32-NEXT: call __atomic_compare_exchange_8 ; RV32-NEXT: lw a4, 8(sp) ; RV32-NEXT: lw a5, 12(sp) ; RV32-NEXT: bnez a0, .LBB13_6 ; RV32-NEXT: .LBB13_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a5, s1, .LBB13_4 +; RV32-NEXT: beq a5, s0, .LBB13_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB13_2 Depth=1 -; RV32-NEXT: sltu a0, s1, a5 +; RV32-NEXT: sltu a0, s0, a5 ; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a3, a5 ; RV32-NEXT: bnez a0, .LBB13_1 @@ -303,7 +303,7 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: .LBB13_5: # 
%atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB13_2 Depth=1 ; RV32-NEXT: mv a2, s2 -; RV32-NEXT: mv a3, s1 +; RV32-NEXT: mv a3, s0 ; RV32-NEXT: j .LBB13_1 ; RV32-NEXT: .LBB13_6: # %atomicrmw.end ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -343,10 +343,10 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: mv s0, a0 +; RV32-NEXT: mv s0, a2 +; RV32-NEXT: mv s1, a0 ; RV32-NEXT: lw a4, 0(a0) ; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: mv s1, a2 ; RV32-NEXT: mv s2, a1 ; RV32-NEXT: j .LBB15_2 ; RV32-NEXT: .LBB15_1: # %atomicrmw.start @@ -356,17 +356,17 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 -; RV32-NEXT: mv a0, s0 +; RV32-NEXT: mv a0, s1 ; RV32-NEXT: call __atomic_compare_exchange_8 ; RV32-NEXT: lw a4, 8(sp) ; RV32-NEXT: lw a5, 12(sp) ; RV32-NEXT: bnez a0, .LBB15_6 ; RV32-NEXT: .LBB15_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a5, s1, .LBB15_4 +; RV32-NEXT: beq a5, s0, .LBB15_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB15_2 Depth=1 -; RV32-NEXT: slt a0, s1, a5 +; RV32-NEXT: slt a0, s0, a5 ; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a3, a5 ; RV32-NEXT: beqz a0, .LBB15_1 @@ -379,7 +379,7 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: .LBB15_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB15_2 Depth=1 ; RV32-NEXT: mv a2, s2 -; RV32-NEXT: mv a3, s1 +; RV32-NEXT: mv a3, s0 ; RV32-NEXT: j .LBB15_1 ; RV32-NEXT: .LBB15_6: # %atomicrmw.end ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -419,10 +419,10 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: mv s0, a0 +; RV32-NEXT: mv s0, a2 +; RV32-NEXT: mv s1, a0 ; RV32-NEXT: lw a4, 0(a0) ; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: mv s1, a2 ; RV32-NEXT: mv s2, a1 ; RV32-NEXT: j .LBB17_2 ; RV32-NEXT: .LBB17_1: # %atomicrmw.start @@ -432,17 +432,17 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 -; RV32-NEXT: mv a0, s0 +; RV32-NEXT: mv a0, s1 ; RV32-NEXT: call __atomic_compare_exchange_8 ; RV32-NEXT: lw a4, 8(sp) ; RV32-NEXT: lw a5, 12(sp) ; RV32-NEXT: bnez a0, .LBB17_6 ; RV32-NEXT: .LBB17_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a5, s1, .LBB17_4 +; RV32-NEXT: beq a5, s0, .LBB17_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB17_2 Depth=1 -; RV32-NEXT: sltu a0, s1, a5 +; RV32-NEXT: sltu a0, s0, a5 ; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a3, a5 ; RV32-NEXT: beqz a0, .LBB17_1 @@ -455,7 +455,7 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: .LBB17_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB17_2 Depth=1 ; RV32-NEXT: mv a2, s2 -; RV32-NEXT: mv a3, s1 +; RV32-NEXT: mv a3, s0 ; RV32-NEXT: j .LBB17_1 ; RV32-NEXT: .LBB17_6: # %atomicrmw.end ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll index 469edacb391df..81518541477a8 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll @@ -46,8 +46,8 @@ define i8 
@atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -76,8 +76,8 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w a4, (a2) @@ -96,8 +96,8 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w a4, (a2) @@ -140,8 +140,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -160,8 +160,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -190,8 +190,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -210,8 +210,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -230,8 +230,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq 
a4, (a2) @@ -250,8 +250,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -294,8 +294,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w a4, (a2) @@ -314,8 +314,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -344,8 +344,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w a4, (a2) @@ -364,8 +364,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -384,8 +384,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w a4, (a2) @@ -404,8 +404,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -448,8 +448,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; 
RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -468,8 +468,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -498,8 +498,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -518,8 +518,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -538,8 +538,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a4, (a2) @@ -558,8 +558,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -602,8 +602,8 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a4, (a2) @@ -632,8 +632,8 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -652,8 +652,8 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; 
RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -1636,8 +1636,8 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -1666,8 +1666,8 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w a4, (a2) @@ -1686,8 +1686,8 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w a4, (a2) @@ -1730,8 +1730,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -1750,8 +1750,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -1780,8 +1780,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -1800,8 +1800,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -1820,8 +1820,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; 
RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a4, (a2) @@ -1840,8 +1840,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -1884,8 +1884,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w a4, (a2) @@ -1904,8 +1904,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -1934,8 +1934,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w a4, (a2) @@ -1954,8 +1954,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -1974,8 +1974,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w a4, (a2) @@ -1994,8 +1994,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ; 
RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -2038,8 +2038,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -2058,8 +2058,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -2088,8 +2088,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -2108,8 +2108,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -2128,8 +2128,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a4, (a2) @@ -2148,8 +2148,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -2192,8 +2192,8 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB19_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a4, (a2) @@ -2222,8 +2222,8 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; 
RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB19_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -2242,8 +2242,8 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB19_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -2286,8 +2286,8 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB20_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -2316,8 +2316,8 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB20_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w a4, (a2) @@ -2336,8 +2336,8 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB20_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w a4, (a2) @@ -2382,8 +2382,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -2402,8 +2402,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -2432,8 +2432,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -2452,8 +2452,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: 
sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -2472,8 +2472,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a4, (a2) @@ -2492,8 +2492,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -2538,8 +2538,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w a4, (a2) @@ -2558,8 +2558,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -2588,8 +2588,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w a4, (a2) @@ -2608,8 +2608,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -2628,8 +2628,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w a4, (a2) @@ -2648,8 +2648,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: 
andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -2694,8 +2694,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB23_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -2714,8 +2714,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB23_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -2744,8 +2744,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB23_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -2764,8 +2764,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB23_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -2784,8 +2784,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB23_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a4, (a2) @@ -2804,8 +2804,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB23_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -2850,8 +2850,8 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB24_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a4, 
(a2) @@ -2880,8 +2880,8 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB24_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -2900,8 +2900,8 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB24_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -2946,9 +2946,9 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 +; RV32IA-NEXT: andi a1, a1, 255 ; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: not a3, a3 -; RV32IA-NEXT: andi a1, a1, 255 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: or a1, a1, a3 ; RV32IA-NEXT: amoand.w a1, a1, (a2) @@ -2970,9 +2970,9 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-NOZACAS-NEXT: li a3, 255 +; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: not a3, a3 -; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-NOZACAS-NEXT: amoand.w a1, a1, (a2) @@ -2984,9 +2984,9 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 +; RV64IA-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: not a3, a3 -; RV64IA-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-ZACAS-NEXT: amoand.w a1, a1, (a2) @@ -3022,9 +3022,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 +; RV32IA-WMO-NEXT: andi a1, a1, 255 ; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: not a3, a3 -; RV32IA-WMO-NEXT: andi a1, a1, 255 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: or a1, a1, a3 ; RV32IA-WMO-NEXT: amoand.w.aq a1, a1, (a2) @@ -3036,9 +3036,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 +; RV32IA-TSO-NEXT: andi a1, a1, 255 ; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: not a3, a3 -; RV32IA-TSO-NEXT: andi a1, a1, 255 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: or a1, a1, a3 ; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) @@ -3060,9 +3060,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 +; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: not a3, a3 -; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, 
a3 ; RV64IA-WMO-NOZACAS-NEXT: amoand.w.aq a1, a1, (a2) @@ -3074,9 +3074,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 +; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: not a3, a3 -; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-NOZACAS-NEXT: amoand.w a1, a1, (a2) @@ -3088,9 +3088,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 +; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: not a3, a3 -; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-ZACAS-NEXT: amoand.w.aq a1, a1, (a2) @@ -3102,9 +3102,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 +; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: not a3, a3 -; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-ZACAS-NEXT: amoand.w a1, a1, (a2) @@ -3140,9 +3140,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 +; RV32IA-WMO-NEXT: andi a1, a1, 255 ; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: not a3, a3 -; RV32IA-WMO-NEXT: andi a1, a1, 255 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: or a1, a1, a3 ; RV32IA-WMO-NEXT: amoand.w.rl a1, a1, (a2) @@ -3154,9 +3154,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 +; RV32IA-TSO-NEXT: andi a1, a1, 255 ; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: not a3, a3 -; RV32IA-TSO-NEXT: andi a1, a1, 255 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: or a1, a1, a3 ; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) @@ -3178,9 +3178,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 +; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: not a3, a3 -; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-NOZACAS-NEXT: amoand.w.rl a1, a1, (a2) @@ -3192,9 +3192,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 +; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: not a3, a3 -; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-NOZACAS-NEXT: amoand.w a1, a1, (a2) @@ -3206,9 +3206,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, 
a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 +; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: not a3, a3 -; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-ZACAS-NEXT: amoand.w.rl a1, a1, (a2) @@ -3220,9 +3220,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 +; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: not a3, a3 -; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-ZACAS-NEXT: amoand.w a1, a1, (a2) @@ -3258,9 +3258,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 +; RV32IA-WMO-NEXT: andi a1, a1, 255 ; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: not a3, a3 -; RV32IA-WMO-NEXT: andi a1, a1, 255 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: or a1, a1, a3 ; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) @@ -3272,9 +3272,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 +; RV32IA-TSO-NEXT: andi a1, a1, 255 ; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: not a3, a3 -; RV32IA-TSO-NEXT: andi a1, a1, 255 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: or a1, a1, a3 ; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) @@ -3296,9 +3296,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 +; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: not a3, a3 -; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-NOZACAS-NEXT: amoand.w.aqrl a1, a1, (a2) @@ -3310,9 +3310,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 +; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: not a3, a3 -; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-NOZACAS-NEXT: amoand.w a1, a1, (a2) @@ -3324,9 +3324,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 +; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: not a3, a3 -; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-ZACAS-NEXT: amoand.w.aqrl a1, a1, (a2) @@ -3338,9 +3338,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 +; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; 
RV64IA-TSO-ZACAS-NEXT: not a3, a3 -; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-ZACAS-NEXT: amoand.w a1, a1, (a2) @@ -3376,9 +3376,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 +; RV32IA-WMO-NEXT: andi a1, a1, 255 ; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: not a3, a3 -; RV32IA-WMO-NEXT: andi a1, a1, 255 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: or a1, a1, a3 ; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) @@ -3390,9 +3390,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 +; RV32IA-TSO-NEXT: andi a1, a1, 255 ; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: not a3, a3 -; RV32IA-TSO-NEXT: andi a1, a1, 255 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: or a1, a1, a3 ; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) @@ -3414,9 +3414,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 +; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: not a3, a3 -; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-NOZACAS-NEXT: amoand.w.aqrl a1, a1, (a2) @@ -3428,9 +3428,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 +; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: not a3, a3 -; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-NOZACAS-NEXT: amoand.w a1, a1, (a2) @@ -3442,9 +3442,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 +; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: not a3, a3 -; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-ZACAS-NEXT: amoand.w.aqrl a1, a1, (a2) @@ -3456,9 +3456,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 +; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: not a3, a3 -; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-ZACAS-NEXT: amoand.w a1, a1, (a2) @@ -3494,8 +3494,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB30_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -3525,8 +3525,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind 
{ ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB30_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w a4, (a2) @@ -3546,8 +3546,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB30_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w a4, (a2) @@ -3567,8 +3567,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: .LBB30_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: lr.w a4, (a2) @@ -3588,8 +3588,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: .LBB30_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: lr.w a4, (a2) @@ -3653,8 +3653,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB31_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -3674,8 +3674,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB31_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -3705,8 +3705,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB31_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -3726,8 +3726,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: 
andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB31_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -3747,8 +3747,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB31_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a4, (a2) @@ -3768,8 +3768,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB31_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -3789,8 +3789,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: .LBB31_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -3810,8 +3810,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: .LBB31_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: lr.w a4, (a2) @@ -3875,8 +3875,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w a4, (a2) @@ -3896,8 +3896,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -3927,8 +3927,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB32_1: # 
=>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w a4, (a2) @@ -3948,8 +3948,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -3969,8 +3969,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w a4, (a2) @@ -3990,8 +3990,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -4011,8 +4011,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: lr.w a4, (a2) @@ -4032,8 +4032,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: lr.w a4, (a2) @@ -4097,8 +4097,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -4118,8 +4118,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -4149,8 +4149,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) 
nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -4170,8 +4170,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -4191,8 +4191,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a4, (a2) @@ -4212,8 +4212,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -4233,8 +4233,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -4254,8 +4254,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: lr.w a4, (a2) @@ -4319,8 +4319,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB34_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a4, (a2) @@ -4350,8 +4350,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; 
RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB34_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -4371,8 +4371,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB34_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -4392,8 +4392,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: .LBB34_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -4413,8 +4413,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: .LBB34_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -4436,8 +4436,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZABHA-ZACAS-NEXT: .LBB34_1: # %atomicrmw.start ; RV64IA-WMO-ZABHA-ZACAS-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZABHA-ZACAS-NEXT: and a3, a0, a1 -; RV64IA-WMO-ZABHA-ZACAS-NEXT: not a3, a3 ; RV64IA-WMO-ZABHA-ZACAS-NEXT: fence rw, rw +; RV64IA-WMO-ZABHA-ZACAS-NEXT: not a3, a3 ; RV64IA-WMO-ZABHA-ZACAS-NEXT: slli a4, a0, 56 ; RV64IA-WMO-ZABHA-ZACAS-NEXT: amocas.b.aqrl a0, a3, (a2) ; RV64IA-WMO-ZABHA-ZACAS-NEXT: srai a4, a4, 56 @@ -4452,8 +4452,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZABHA-ZACAS-NEXT: .LBB34_1: # %atomicrmw.start ; RV64IA-TSO-ZABHA-ZACAS-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZABHA-ZACAS-NEXT: and a3, a0, a1 -; RV64IA-TSO-ZABHA-ZACAS-NEXT: not a3, a3 ; RV64IA-TSO-ZABHA-ZACAS-NEXT: fence rw, rw +; RV64IA-TSO-ZABHA-ZACAS-NEXT: not a3, a3 ; RV64IA-TSO-ZABHA-ZACAS-NEXT: slli a4, a0, 56 ; RV64IA-TSO-ZABHA-ZACAS-NEXT: amocas.b a0, a3, (a2) ; RV64IA-TSO-ZABHA-ZACAS-NEXT: srai a4, a4, 56 @@ -5391,23 +5391,23 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: li a3, 255 ; RV32IA-NEXT: slli a1, a1, 24 +; RV32IA-NEXT: andi a4, a0, 24 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: srai a1, a1, 24 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: xori a3, a3, 24 +; RV32IA-NEXT: xori a4, a4, 24 ; RV32IA-NEXT: .LBB45_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a5, (a2) -; RV32IA-NEXT: and a7, 
a5, a4 +; RV32IA-NEXT: and a7, a5, a3 ; RV32IA-NEXT: mv a6, a5 -; RV32IA-NEXT: sll a7, a7, a3 -; RV32IA-NEXT: sra a7, a7, a3 +; RV32IA-NEXT: sll a7, a7, a4 +; RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a7, a1, .LBB45_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB45_1 Depth=1 ; RV32IA-NEXT: xor a6, a5, a1 -; RV32IA-NEXT: and a6, a6, a4 +; RV32IA-NEXT: and a6, a6, a3 ; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB45_3: # in Loop: Header=BB45_1 Depth=1 ; RV32IA-NEXT: sc.w a6, a6, (a2) @@ -5462,23 +5462,23 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS: # %bb.0: ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-NOZACAS-NEXT: li a4, 255 -; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-NOZACAS-NEXT: li a3, 255 ; RV64IA-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-NOZACAS-NEXT: .LBB45_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w a5, (a2) -; RV64IA-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-NOZACAS-NEXT: mv a6, a5 -; RV64IA-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-NOZACAS-NEXT: bge a7, a1, .LBB45_3 ; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB45_1 Depth=1 ; RV64IA-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-NOZACAS-NEXT: .LBB45_3: # in Loop: Header=BB45_1 Depth=1 ; RV64IA-NOZACAS-NEXT: sc.w a6, a6, (a2) @@ -5491,23 +5491,23 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS: # %bb.0: ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-ZACAS-NEXT: li a4, 255 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: li a3, 255 ; RV64IA-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-ZACAS-NEXT: .LBB45_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w a5, (a2) -; RV64IA-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-ZACAS-NEXT: mv a6, a5 -; RV64IA-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-ZACAS-NEXT: bge a7, a1, .LBB45_3 ; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB45_1 Depth=1 ; RV64IA-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-ZACAS-NEXT: .LBB45_3: # in Loop: Header=BB45_1 Depth=1 ; RV64IA-ZACAS-NEXT: sc.w a6, a6, (a2) @@ -5576,23 +5576,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: li a3, 255 ; RV32IA-WMO-NEXT: slli a1, a1, 24 +; RV32IA-WMO-NEXT: andi a4, a0, 24 +; 
RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: srai a1, a1, 24 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: xori a3, a3, 24 +; RV32IA-WMO-NEXT: xori a4, a4, 24 ; RV32IA-WMO-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a5, (a2) -; RV32IA-WMO-NEXT: and a7, a5, a4 +; RV32IA-WMO-NEXT: and a7, a5, a3 ; RV32IA-WMO-NEXT: mv a6, a5 -; RV32IA-WMO-NEXT: sll a7, a7, a3 -; RV32IA-WMO-NEXT: sra a7, a7, a3 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a7, a1, .LBB46_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1 ; RV32IA-WMO-NEXT: xor a6, a5, a1 -; RV32IA-WMO-NEXT: and a6, a6, a4 +; RV32IA-WMO-NEXT: and a6, a6, a3 ; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w a6, a6, (a2) @@ -5605,23 +5605,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: li a3, 255 ; RV32IA-TSO-NEXT: slli a1, a1, 24 +; RV32IA-TSO-NEXT: andi a4, a0, 24 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: srai a1, a1, 24 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: xori a3, a3, 24 +; RV32IA-TSO-NEXT: xori a4, a4, 24 ; RV32IA-TSO-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a5, (a2) -; RV32IA-TSO-NEXT: and a7, a5, a4 +; RV32IA-TSO-NEXT: and a7, a5, a3 ; RV32IA-TSO-NEXT: mv a6, a5 -; RV32IA-TSO-NEXT: sll a7, a7, a3 -; RV32IA-TSO-NEXT: sra a7, a7, a3 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a7, a1, .LBB46_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1 ; RV32IA-TSO-NEXT: xor a6, a5, a1 -; RV32IA-TSO-NEXT: and a6, a6, a4 +; RV32IA-TSO-NEXT: and a6, a6, a3 ; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) @@ -5676,23 +5676,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-NOZACAS-NEXT: li a4, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-NOZACAS-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2) -; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB46_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: 
sc.w a6, a6, (a2) @@ -5705,23 +5705,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: li a4, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-TSO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-NOZACAS-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge a7, a1, .LBB46_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) @@ -5734,23 +5734,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: li a3, 255 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-ZACAS-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB46_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w a6, a6, (a2) @@ -5763,23 +5763,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS: # %bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: li a3, 255 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56 +; 
RV64IA-TSO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-ZACAS-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB46_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB46_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB46_3: # in Loop: Header=BB46_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) @@ -5848,23 +5848,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: li a3, 255 ; RV32IA-WMO-NEXT: slli a1, a1, 24 +; RV32IA-WMO-NEXT: andi a4, a0, 24 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: srai a1, a1, 24 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: xori a3, a3, 24 +; RV32IA-WMO-NEXT: xori a4, a4, 24 ; RV32IA-WMO-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w a5, (a2) -; RV32IA-WMO-NEXT: and a7, a5, a4 +; RV32IA-WMO-NEXT: and a7, a5, a3 ; RV32IA-WMO-NEXT: mv a6, a5 -; RV32IA-WMO-NEXT: sll a7, a7, a3 -; RV32IA-WMO-NEXT: sra a7, a7, a3 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a7, a1, .LBB47_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1 ; RV32IA-WMO-NEXT: xor a6, a5, a1 -; RV32IA-WMO-NEXT: and a6, a6, a4 +; RV32IA-WMO-NEXT: and a6, a6, a3 ; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2) @@ -5877,23 +5877,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: li a3, 255 ; RV32IA-TSO-NEXT: slli a1, a1, 24 +; RV32IA-TSO-NEXT: andi a4, a0, 24 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: srai a1, a1, 24 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: xori a3, a3, 24 +; RV32IA-TSO-NEXT: xori a4, a4, 24 ; RV32IA-TSO-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a5, (a2) -; RV32IA-TSO-NEXT: and a7, a5, a4 +; RV32IA-TSO-NEXT: and a7, a5, a3 ; RV32IA-TSO-NEXT: mv a6, a5 -; RV32IA-TSO-NEXT: sll a7, a7, a3 -; RV32IA-TSO-NEXT: sra a7, a7, a3 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a7, a1, .LBB47_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1 ; RV32IA-TSO-NEXT: xor a6, a5, a1 -; RV32IA-TSO-NEXT: and a6, a6, a4 +; RV32IA-TSO-NEXT: and a6, a6, a3 ; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) @@ -5948,23 +5948,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; 
RV64IA-WMO-NOZACAS-NEXT: li a4, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-NOZACAS-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w a5, (a2) -; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB47_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -5977,23 +5977,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: li a4, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-TSO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-NOZACAS-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge a7, a1, .LBB47_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) @@ -6006,23 +6006,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: li a3, 255 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-ZACAS-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w a5, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-ZACAS-NEXT: and a7, 
a5, a3 ; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB47_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -6035,23 +6035,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS: # %bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: li a3, 255 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-TSO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-ZACAS-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB47_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB47_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB47_3: # in Loop: Header=BB47_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) @@ -6120,23 +6120,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: li a3, 255 ; RV32IA-WMO-NEXT: slli a1, a1, 24 +; RV32IA-WMO-NEXT: andi a4, a0, 24 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: srai a1, a1, 24 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: xori a3, a3, 24 +; RV32IA-WMO-NEXT: xori a4, a4, 24 ; RV32IA-WMO-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a5, (a2) -; RV32IA-WMO-NEXT: and a7, a5, a4 +; RV32IA-WMO-NEXT: and a7, a5, a3 ; RV32IA-WMO-NEXT: mv a6, a5 -; RV32IA-WMO-NEXT: sll a7, a7, a3 -; RV32IA-WMO-NEXT: sra a7, a7, a3 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a7, a1, .LBB48_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1 ; RV32IA-WMO-NEXT: xor a6, a5, a1 -; RV32IA-WMO-NEXT: and a6, a6, a4 +; RV32IA-WMO-NEXT: and a6, a6, a3 ; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2) @@ -6149,23 +6149,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; 
RV32IA-TSO-NEXT: li a3, 255 ; RV32IA-TSO-NEXT: slli a1, a1, 24 +; RV32IA-TSO-NEXT: andi a4, a0, 24 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: srai a1, a1, 24 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: xori a3, a3, 24 +; RV32IA-TSO-NEXT: xori a4, a4, 24 ; RV32IA-TSO-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a5, (a2) -; RV32IA-TSO-NEXT: and a7, a5, a4 +; RV32IA-TSO-NEXT: and a7, a5, a3 ; RV32IA-TSO-NEXT: mv a6, a5 -; RV32IA-TSO-NEXT: sll a7, a7, a3 -; RV32IA-TSO-NEXT: sra a7, a7, a3 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a7, a1, .LBB48_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1 ; RV32IA-TSO-NEXT: xor a6, a5, a1 -; RV32IA-TSO-NEXT: and a6, a6, a4 +; RV32IA-TSO-NEXT: and a6, a6, a3 ; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) @@ -6220,23 +6220,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-NOZACAS-NEXT: li a4, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-NOZACAS-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2) -; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB48_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -6249,23 +6249,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: li a4, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-TSO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-NOZACAS-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge 
a7, a1, .LBB48_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) @@ -6278,23 +6278,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: li a3, 255 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-ZACAS-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB48_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -6307,23 +6307,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS: # %bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: li a3, 255 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-TSO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-ZACAS-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB48_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) @@ -6392,23 +6392,23 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: li a3, 255 ; RV32IA-NEXT: slli a1, a1, 24 +; RV32IA-NEXT: 
andi a4, a0, 24 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: srai a1, a1, 24 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: xori a3, a3, 24 +; RV32IA-NEXT: xori a4, a4, 24 ; RV32IA-NEXT: .LBB49_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a5, (a2) -; RV32IA-NEXT: and a7, a5, a4 +; RV32IA-NEXT: and a7, a5, a3 ; RV32IA-NEXT: mv a6, a5 -; RV32IA-NEXT: sll a7, a7, a3 -; RV32IA-NEXT: sra a7, a7, a3 +; RV32IA-NEXT: sll a7, a7, a4 +; RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a7, a1, .LBB49_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB49_1 Depth=1 ; RV32IA-NEXT: xor a6, a5, a1 -; RV32IA-NEXT: and a6, a6, a4 +; RV32IA-NEXT: and a6, a6, a3 ; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB49_3: # in Loop: Header=BB49_1 Depth=1 ; RV32IA-NEXT: sc.w.rl a6, a6, (a2) @@ -6463,23 +6463,23 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS: # %bb.0: ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-NOZACAS-NEXT: li a4, 255 -; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-NOZACAS-NEXT: li a3, 255 ; RV64IA-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-NOZACAS-NEXT: .LBB49_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w.aqrl a5, (a2) -; RV64IA-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-NOZACAS-NEXT: mv a6, a5 -; RV64IA-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-NOZACAS-NEXT: bge a7, a1, .LBB49_3 ; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB49_1 Depth=1 ; RV64IA-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-NOZACAS-NEXT: .LBB49_3: # in Loop: Header=BB49_1 Depth=1 ; RV64IA-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -6492,23 +6492,23 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS: # %bb.0: ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-ZACAS-NEXT: li a4, 255 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: li a3, 255 ; RV64IA-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-ZACAS-NEXT: .LBB49_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a5, (a2) -; RV64IA-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-ZACAS-NEXT: mv a6, a5 -; RV64IA-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-ZACAS-NEXT: bge a7, a1, .LBB49_3 ; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB49_1 Depth=1 ; RV64IA-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-ZACAS-NEXT: .LBB49_3: # in Loop: Header=BB49_1 Depth=1 ; RV64IA-ZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -6577,23 +6577,23 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind { ; 
RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: li a3, 255 ; RV32IA-NEXT: slli a1, a1, 24 +; RV32IA-NEXT: andi a4, a0, 24 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: srai a1, a1, 24 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: xori a3, a3, 24 +; RV32IA-NEXT: xori a4, a4, 24 ; RV32IA-NEXT: .LBB50_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a5, (a2) -; RV32IA-NEXT: and a7, a5, a4 +; RV32IA-NEXT: and a7, a5, a3 ; RV32IA-NEXT: mv a6, a5 -; RV32IA-NEXT: sll a7, a7, a3 -; RV32IA-NEXT: sra a7, a7, a3 +; RV32IA-NEXT: sll a7, a7, a4 +; RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a1, a7, .LBB50_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB50_1 Depth=1 ; RV32IA-NEXT: xor a6, a5, a1 -; RV32IA-NEXT: and a6, a6, a4 +; RV32IA-NEXT: and a6, a6, a3 ; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB50_3: # in Loop: Header=BB50_1 Depth=1 ; RV32IA-NEXT: sc.w a6, a6, (a2) @@ -6648,23 +6648,23 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS: # %bb.0: ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-NOZACAS-NEXT: li a4, 255 -; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-NOZACAS-NEXT: li a3, 255 ; RV64IA-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-NOZACAS-NEXT: .LBB50_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w a5, (a2) -; RV64IA-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-NOZACAS-NEXT: mv a6, a5 -; RV64IA-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-NOZACAS-NEXT: bge a1, a7, .LBB50_3 ; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB50_1 Depth=1 ; RV64IA-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-NOZACAS-NEXT: .LBB50_3: # in Loop: Header=BB50_1 Depth=1 ; RV64IA-NOZACAS-NEXT: sc.w a6, a6, (a2) @@ -6677,23 +6677,23 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS: # %bb.0: ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-ZACAS-NEXT: li a4, 255 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: li a3, 255 ; RV64IA-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-ZACAS-NEXT: .LBB50_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w a5, (a2) -; RV64IA-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-ZACAS-NEXT: mv a6, a5 -; RV64IA-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-ZACAS-NEXT: bge a1, a7, .LBB50_3 ; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB50_1 Depth=1 ; RV64IA-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-ZACAS-NEXT: and a6, a6, a3 ; 
RV64IA-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-ZACAS-NEXT: .LBB50_3: # in Loop: Header=BB50_1 Depth=1 ; RV64IA-ZACAS-NEXT: sc.w a6, a6, (a2) @@ -6762,23 +6762,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: li a3, 255 ; RV32IA-WMO-NEXT: slli a1, a1, 24 +; RV32IA-WMO-NEXT: andi a4, a0, 24 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: srai a1, a1, 24 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: xori a3, a3, 24 +; RV32IA-WMO-NEXT: xori a4, a4, 24 ; RV32IA-WMO-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a5, (a2) -; RV32IA-WMO-NEXT: and a7, a5, a4 +; RV32IA-WMO-NEXT: and a7, a5, a3 ; RV32IA-WMO-NEXT: mv a6, a5 -; RV32IA-WMO-NEXT: sll a7, a7, a3 -; RV32IA-WMO-NEXT: sra a7, a7, a3 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a1, a7, .LBB51_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1 ; RV32IA-WMO-NEXT: xor a6, a5, a1 -; RV32IA-WMO-NEXT: and a6, a6, a4 +; RV32IA-WMO-NEXT: and a6, a6, a3 ; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w a6, a6, (a2) @@ -6791,23 +6791,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: li a3, 255 ; RV32IA-TSO-NEXT: slli a1, a1, 24 +; RV32IA-TSO-NEXT: andi a4, a0, 24 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: srai a1, a1, 24 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: xori a3, a3, 24 +; RV32IA-TSO-NEXT: xori a4, a4, 24 ; RV32IA-TSO-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a5, (a2) -; RV32IA-TSO-NEXT: and a7, a5, a4 +; RV32IA-TSO-NEXT: and a7, a5, a3 ; RV32IA-TSO-NEXT: mv a6, a5 -; RV32IA-TSO-NEXT: sll a7, a7, a3 -; RV32IA-TSO-NEXT: sra a7, a7, a3 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a1, a7, .LBB51_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1 ; RV32IA-TSO-NEXT: xor a6, a5, a1 -; RV32IA-TSO-NEXT: and a6, a6, a4 +; RV32IA-TSO-NEXT: and a6, a6, a3 ; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) @@ -6862,23 +6862,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-NOZACAS-NEXT: li a4, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-NOZACAS-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2) -; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, 
a7, a3 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB51_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: sc.w a6, a6, (a2) @@ -6891,23 +6891,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: li a4, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-TSO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-NOZACAS-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB51_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) @@ -6920,23 +6920,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: li a3, 255 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-ZACAS-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB51_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w a6, a6, (a2) @@ -6949,23 +6949,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS: # 
%bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: li a3, 255 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-TSO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-ZACAS-NEXT: .LBB51_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB51_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB51_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB51_3: # in Loop: Header=BB51_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) @@ -7034,23 +7034,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: li a3, 255 ; RV32IA-WMO-NEXT: slli a1, a1, 24 +; RV32IA-WMO-NEXT: andi a4, a0, 24 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: srai a1, a1, 24 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: xori a3, a3, 24 +; RV32IA-WMO-NEXT: xori a4, a4, 24 ; RV32IA-WMO-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w a5, (a2) -; RV32IA-WMO-NEXT: and a7, a5, a4 +; RV32IA-WMO-NEXT: and a7, a5, a3 ; RV32IA-WMO-NEXT: mv a6, a5 -; RV32IA-WMO-NEXT: sll a7, a7, a3 -; RV32IA-WMO-NEXT: sra a7, a7, a3 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a1, a7, .LBB52_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1 ; RV32IA-WMO-NEXT: xor a6, a5, a1 -; RV32IA-WMO-NEXT: and a6, a6, a4 +; RV32IA-WMO-NEXT: and a6, a6, a3 ; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2) @@ -7063,23 +7063,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: li a3, 255 ; RV32IA-TSO-NEXT: slli a1, a1, 24 +; RV32IA-TSO-NEXT: andi a4, a0, 24 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: srai a1, a1, 24 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: xori a3, a3, 24 +; RV32IA-TSO-NEXT: xori a4, a4, 24 ; RV32IA-TSO-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a5, (a2) -; RV32IA-TSO-NEXT: and a7, a5, a4 +; RV32IA-TSO-NEXT: and a7, a5, a3 ; RV32IA-TSO-NEXT: mv a6, a5 -; RV32IA-TSO-NEXT: sll a7, a7, a3 -; RV32IA-TSO-NEXT: sra a7, a7, a3 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a1, a7, .LBB52_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1 ; 
RV32IA-TSO-NEXT: xor a6, a5, a1 -; RV32IA-TSO-NEXT: and a6, a6, a4 +; RV32IA-TSO-NEXT: and a6, a6, a3 ; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) @@ -7134,23 +7134,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-NOZACAS-NEXT: li a4, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-NOZACAS-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w a5, (a2) -; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB52_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -7163,23 +7163,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: li a4, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-TSO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-NOZACAS-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB52_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) @@ -7192,23 +7192,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: li 
a3, 255 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-ZACAS-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w a5, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB52_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -7221,23 +7221,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS: # %bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: li a3, 255 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-TSO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-ZACAS-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB52_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) @@ -7306,23 +7306,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: li a4, 255 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: li a3, 255 ; RV32IA-WMO-NEXT: slli a1, a1, 24 +; RV32IA-WMO-NEXT: andi a4, a0, 24 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: srai a1, a1, 24 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: xori a3, a3, 24 +; RV32IA-WMO-NEXT: xori a4, a4, 24 ; RV32IA-WMO-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a5, (a2) -; RV32IA-WMO-NEXT: and a7, a5, a4 +; RV32IA-WMO-NEXT: and a7, a5, a3 ; RV32IA-WMO-NEXT: mv a6, a5 -; RV32IA-WMO-NEXT: sll a7, a7, a3 -; RV32IA-WMO-NEXT: sra a7, a7, a3 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a1, a7, .LBB53_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1 ; RV32IA-WMO-NEXT: xor a6, a5, a1 -; RV32IA-WMO-NEXT: 
and a6, a6, a4 +; RV32IA-WMO-NEXT: and a6, a6, a3 ; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2) @@ -7335,23 +7335,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: li a4, 255 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: li a3, 255 ; RV32IA-TSO-NEXT: slli a1, a1, 24 +; RV32IA-TSO-NEXT: andi a4, a0, 24 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: srai a1, a1, 24 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: xori a3, a3, 24 +; RV32IA-TSO-NEXT: xori a4, a4, 24 ; RV32IA-TSO-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a5, (a2) -; RV32IA-TSO-NEXT: and a7, a5, a4 +; RV32IA-TSO-NEXT: and a7, a5, a3 ; RV32IA-TSO-NEXT: mv a6, a5 -; RV32IA-TSO-NEXT: sll a7, a7, a3 -; RV32IA-TSO-NEXT: sra a7, a7, a3 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a1, a7, .LBB53_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1 ; RV32IA-TSO-NEXT: xor a6, a5, a1 -; RV32IA-TSO-NEXT: and a6, a6, a4 +; RV32IA-TSO-NEXT: and a6, a6, a3 ; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) @@ -7406,23 +7406,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-NOZACAS-NEXT: li a4, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-NOZACAS-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2) -; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB53_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -7435,23 +7435,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: li a4, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: xori a3, a3, 56 +; 
RV64IA-TSO-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-NOZACAS-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB53_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) @@ -7464,23 +7464,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: li a4, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: li a3, 255 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-WMO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-WMO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-WMO-ZACAS-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB53_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -7493,23 +7493,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS: # %bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: li a4, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: li a3, 255 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-TSO-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-TSO-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-TSO-ZACAS-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB53_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 ; 
RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) @@ -7578,23 +7578,23 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: li a3, 255 ; RV32IA-NEXT: slli a1, a1, 24 +; RV32IA-NEXT: andi a4, a0, 24 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: srai a1, a1, 24 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: xori a3, a3, 24 +; RV32IA-NEXT: xori a4, a4, 24 ; RV32IA-NEXT: .LBB54_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a5, (a2) -; RV32IA-NEXT: and a7, a5, a4 +; RV32IA-NEXT: and a7, a5, a3 ; RV32IA-NEXT: mv a6, a5 -; RV32IA-NEXT: sll a7, a7, a3 -; RV32IA-NEXT: sra a7, a7, a3 +; RV32IA-NEXT: sll a7, a7, a4 +; RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a1, a7, .LBB54_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB54_1 Depth=1 ; RV32IA-NEXT: xor a6, a5, a1 -; RV32IA-NEXT: and a6, a6, a4 +; RV32IA-NEXT: and a6, a6, a3 ; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB54_3: # in Loop: Header=BB54_1 Depth=1 ; RV32IA-NEXT: sc.w.rl a6, a6, (a2) @@ -7649,23 +7649,23 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS: # %bb.0: ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-NOZACAS-NEXT: li a4, 255 -; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-NOZACAS-NEXT: li a3, 255 ; RV64IA-NOZACAS-NEXT: slli a1, a1, 56 +; RV64IA-NOZACAS-NEXT: andi a4, a0, 24 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: srai a1, a1, 56 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-NOZACAS-NEXT: xori a3, a3, 56 +; RV64IA-NOZACAS-NEXT: xori a4, a4, 56 ; RV64IA-NOZACAS-NEXT: .LBB54_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w.aqrl a5, (a2) -; RV64IA-NOZACAS-NEXT: and a7, a5, a4 +; RV64IA-NOZACAS-NEXT: and a7, a5, a3 ; RV64IA-NOZACAS-NEXT: mv a6, a5 -; RV64IA-NOZACAS-NEXT: sll a7, a7, a3 -; RV64IA-NOZACAS-NEXT: sra a7, a7, a3 +; RV64IA-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-NOZACAS-NEXT: bge a1, a7, .LBB54_3 ; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB54_1 Depth=1 ; RV64IA-NOZACAS-NEXT: xor a6, a5, a1 -; RV64IA-NOZACAS-NEXT: and a6, a6, a4 +; RV64IA-NOZACAS-NEXT: and a6, a6, a3 ; RV64IA-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-NOZACAS-NEXT: .LBB54_3: # in Loop: Header=BB54_1 Depth=1 ; RV64IA-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -7678,23 +7678,23 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS: # %bb.0: ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-ZACAS-NEXT: li a4, 255 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: li a3, 255 ; RV64IA-ZACAS-NEXT: slli a1, a1, 56 +; RV64IA-ZACAS-NEXT: andi a4, a0, 24 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: srai a1, a1, 56 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-ZACAS-NEXT: xori a3, a3, 56 +; RV64IA-ZACAS-NEXT: xori a4, a4, 56 ; RV64IA-ZACAS-NEXT: .LBB54_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a5, (a2) -; RV64IA-ZACAS-NEXT: and a7, a5, a4 +; RV64IA-ZACAS-NEXT: and a7, a5, a3 ; RV64IA-ZACAS-NEXT: mv a6, a5 -; RV64IA-ZACAS-NEXT: sll a7, a7, a3 -; RV64IA-ZACAS-NEXT: sra a7, a7, a3 +; RV64IA-ZACAS-NEXT: sll a7, a7, a4 +; 
RV64IA-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-ZACAS-NEXT: bge a1, a7, .LBB54_3 ; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB54_1 Depth=1 ; RV64IA-ZACAS-NEXT: xor a6, a5, a1 -; RV64IA-ZACAS-NEXT: and a6, a6, a4 +; RV64IA-ZACAS-NEXT: and a6, a6, a3 ; RV64IA-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-ZACAS-NEXT: .LBB54_3: # in Loop: Header=BB54_1 Depth=1 ; RV64IA-ZACAS-NEXT: sc.w.rl a6, a6, (a2) @@ -7762,8 +7762,8 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB55_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -7826,8 +7826,8 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB55_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w a4, (a2) @@ -7850,8 +7850,8 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB55_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w a4, (a2) @@ -7928,8 +7928,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB56_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -7952,8 +7952,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB56_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -8016,8 +8016,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB56_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -8040,8 +8040,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB56_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -8064,8 +8064,8 @@ 
define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB56_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a4, (a2) @@ -8088,8 +8088,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB56_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -8166,8 +8166,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB57_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w a4, (a2) @@ -8190,8 +8190,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB57_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -8254,8 +8254,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB57_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w a4, (a2) @@ -8278,8 +8278,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB57_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -8302,8 +8302,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB57_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w a4, (a2) @@ -8326,8 +8326,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; 
RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB57_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -8404,8 +8404,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB58_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -8428,8 +8428,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB58_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -8492,8 +8492,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB58_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -8516,8 +8516,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB58_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -8540,8 +8540,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB58_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a4, (a2) @@ -8564,8 +8564,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB58_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -8642,8 +8642,8 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB59_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a4, (a2) @@ -8706,8 +8706,8 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; 
RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB59_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -8730,8 +8730,8 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB59_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -8808,8 +8808,8 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB60_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -8872,8 +8872,8 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB60_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w a4, (a2) @@ -8896,8 +8896,8 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB60_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w a4, (a2) @@ -8974,8 +8974,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB61_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -8998,8 +8998,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB61_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -9062,8 +9062,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB61_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -9086,8 +9086,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli 
a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB61_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -9110,8 +9110,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB61_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a4, (a2) @@ -9134,8 +9134,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB61_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -9212,8 +9212,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB62_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w a4, (a2) @@ -9236,8 +9236,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB62_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -9300,8 +9300,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB62_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w a4, (a2) @@ -9324,8 +9324,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB62_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -9348,8 +9348,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB62_1: # =>This Inner 
Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w a4, (a2) @@ -9372,8 +9372,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB62_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -9450,8 +9450,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 ; RV32IA-WMO-NEXT: li a3, 255 -; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 ; RV32IA-WMO-NEXT: .LBB63_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-WMO-NEXT: lr.w.aq a4, (a2) @@ -9474,8 +9474,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 ; RV32IA-TSO-NEXT: li a3, 255 -; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 ; RV32IA-TSO-NEXT: .LBB63_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-TSO-NEXT: lr.w a4, (a2) @@ -9538,8 +9538,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-NOZACAS-NEXT: li a3, 255 -; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: .LBB63_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a4, (a2) @@ -9562,8 +9562,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-NOZACAS-NEXT: li a3, 255 -; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: .LBB63_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: lr.w a4, (a2) @@ -9586,8 +9586,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-WMO-ZACAS-NEXT: li a3, 255 -; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: .LBB63_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a4, (a2) @@ -9610,8 +9610,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-TSO-ZACAS-NEXT: li a3, 255 -; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: .LBB63_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZACAS-NEXT: lr.w a4, (a2) @@ -9688,8 +9688,8 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; 
RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB64_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w.aqrl a4, (a2) @@ -9752,8 +9752,8 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 ; RV64IA-NOZACAS-NEXT: li a3, 255 -; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: andi a1, a1, 255 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-NOZACAS-NEXT: .LBB64_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NOZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -9776,8 +9776,8 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 ; RV64IA-ZACAS-NEXT: li a3, 255 -; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: andi a1, a1, 255 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 ; RV64IA-ZACAS-NEXT: .LBB64_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-ZACAS-NEXT: lr.w.aqrl a4, (a2) @@ -12862,10 +12862,10 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: sll a4, a3, a0 -; RV32IA-NEXT: not a4, a4 ; RV32IA-NEXT: and a1, a1, a3 +; RV32IA-NEXT: not a3, a4 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: or a1, a1, a4 +; RV32IA-NEXT: or a1, a1, a3 ; RV32IA-NEXT: amoand.w a1, a1, (a2) ; RV32IA-NEXT: srl a0, a1, a0 ; RV32IA-NEXT: ret @@ -12887,10 +12887,10 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64IA-NOZACAS-NEXT: lui a3, 16 ; RV64IA-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-NOZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-NOZACAS-NEXT: not a4, a4 ; RV64IA-NOZACAS-NEXT: and a1, a1, a3 +; RV64IA-NOZACAS-NEXT: not a3, a4 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-NOZACAS-NEXT: or a1, a1, a4 +; RV64IA-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-NOZACAS-NEXT: amoand.w a1, a1, (a2) ; RV64IA-NOZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-NOZACAS-NEXT: ret @@ -12902,10 +12902,10 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64IA-ZACAS-NEXT: lui a3, 16 ; RV64IA-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-ZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-ZACAS-NEXT: not a4, a4 ; RV64IA-ZACAS-NEXT: and a1, a1, a3 +; RV64IA-ZACAS-NEXT: not a3, a4 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-ZACAS-NEXT: or a1, a1, a4 +; RV64IA-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-ZACAS-NEXT: amoand.w a1, a1, (a2) ; RV64IA-ZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-ZACAS-NEXT: ret @@ -12941,10 +12941,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind { ; RV32IA-WMO-NEXT: lui a3, 16 ; RV32IA-WMO-NEXT: addi a3, a3, -1 ; RV32IA-WMO-NEXT: sll a4, a3, a0 -; RV32IA-WMO-NEXT: not a4, a4 ; RV32IA-WMO-NEXT: and a1, a1, a3 +; RV32IA-WMO-NEXT: not a3, a4 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: or a1, a1, a4 +; RV32IA-WMO-NEXT: or a1, a1, a3 ; RV32IA-WMO-NEXT: amoand.w.aq a1, a1, (a2) ; RV32IA-WMO-NEXT: srl a0, a1, a0 ; RV32IA-WMO-NEXT: ret @@ -12956,10 +12956,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind { ; RV32IA-TSO-NEXT: lui a3, 16 ; RV32IA-TSO-NEXT: addi a3, a3, -1 ; RV32IA-TSO-NEXT: sll a4, a3, a0 -; RV32IA-TSO-NEXT: not a4, a4 ; RV32IA-TSO-NEXT: and a1, a1, a3 +; RV32IA-TSO-NEXT: not a3, a4 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: or a1, a1, a4 +; RV32IA-TSO-NEXT: or a1, a1, a3 ; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) ; RV32IA-TSO-NEXT: srl a0, a1, a0 ; 
RV32IA-TSO-NEXT: ret @@ -12981,10 +12981,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-WMO-NOZACAS-NEXT: not a4, a4 ; RV64IA-WMO-NOZACAS-NEXT: and a1, a1, a3 +; RV64IA-WMO-NOZACAS-NEXT: not a3, a4 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, a4 +; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-NOZACAS-NEXT: amoand.w.aq a1, a1, (a2) ; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: ret @@ -12996,10 +12996,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-TSO-NOZACAS-NEXT: not a4, a4 ; RV64IA-TSO-NOZACAS-NEXT: and a1, a1, a3 +; RV64IA-TSO-NOZACAS-NEXT: not a3, a4 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a4 +; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-NOZACAS-NEXT: amoand.w a1, a1, (a2) ; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: ret @@ -13011,10 +13011,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-ZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-WMO-ZACAS-NEXT: not a4, a4 ; RV64IA-WMO-ZACAS-NEXT: and a1, a1, a3 +; RV64IA-WMO-ZACAS-NEXT: not a3, a4 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a4 +; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-ZACAS-NEXT: amoand.w.aq a1, a1, (a2) ; RV64IA-WMO-ZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: ret @@ -13026,10 +13026,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-ZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-TSO-ZACAS-NEXT: not a4, a4 ; RV64IA-TSO-ZACAS-NEXT: and a1, a1, a3 +; RV64IA-TSO-ZACAS-NEXT: not a3, a4 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a4 +; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-ZACAS-NEXT: amoand.w a1, a1, (a2) ; RV64IA-TSO-ZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: ret @@ -13065,10 +13065,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind { ; RV32IA-WMO-NEXT: lui a3, 16 ; RV32IA-WMO-NEXT: addi a3, a3, -1 ; RV32IA-WMO-NEXT: sll a4, a3, a0 -; RV32IA-WMO-NEXT: not a4, a4 ; RV32IA-WMO-NEXT: and a1, a1, a3 +; RV32IA-WMO-NEXT: not a3, a4 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: or a1, a1, a4 +; RV32IA-WMO-NEXT: or a1, a1, a3 ; RV32IA-WMO-NEXT: amoand.w.rl a1, a1, (a2) ; RV32IA-WMO-NEXT: srl a0, a1, a0 ; RV32IA-WMO-NEXT: ret @@ -13080,10 +13080,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind { ; RV32IA-TSO-NEXT: lui a3, 16 ; RV32IA-TSO-NEXT: addi a3, a3, -1 ; RV32IA-TSO-NEXT: sll a4, a3, a0 -; RV32IA-TSO-NEXT: not a4, a4 ; RV32IA-TSO-NEXT: and a1, a1, a3 +; RV32IA-TSO-NEXT: not a3, a4 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: or a1, a1, a4 +; RV32IA-TSO-NEXT: or a1, a1, a3 ; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) ; RV32IA-TSO-NEXT: srl a0, a1, a0 ; RV32IA-TSO-NEXT: ret @@ -13105,10 +13105,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-WMO-NOZACAS-NEXT: not a4, a4 ; 
RV64IA-WMO-NOZACAS-NEXT: and a1, a1, a3 +; RV64IA-WMO-NOZACAS-NEXT: not a3, a4 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, a4 +; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-NOZACAS-NEXT: amoand.w.rl a1, a1, (a2) ; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: ret @@ -13120,10 +13120,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-TSO-NOZACAS-NEXT: not a4, a4 ; RV64IA-TSO-NOZACAS-NEXT: and a1, a1, a3 +; RV64IA-TSO-NOZACAS-NEXT: not a3, a4 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a4 +; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-NOZACAS-NEXT: amoand.w a1, a1, (a2) ; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: ret @@ -13135,10 +13135,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-ZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-WMO-ZACAS-NEXT: not a4, a4 ; RV64IA-WMO-ZACAS-NEXT: and a1, a1, a3 +; RV64IA-WMO-ZACAS-NEXT: not a3, a4 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a4 +; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-ZACAS-NEXT: amoand.w.rl a1, a1, (a2) ; RV64IA-WMO-ZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: ret @@ -13150,10 +13150,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-ZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-TSO-ZACAS-NEXT: not a4, a4 ; RV64IA-TSO-ZACAS-NEXT: and a1, a1, a3 +; RV64IA-TSO-ZACAS-NEXT: not a3, a4 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a4 +; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-ZACAS-NEXT: amoand.w a1, a1, (a2) ; RV64IA-TSO-ZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: ret @@ -13189,10 +13189,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV32IA-WMO-NEXT: lui a3, 16 ; RV32IA-WMO-NEXT: addi a3, a3, -1 ; RV32IA-WMO-NEXT: sll a4, a3, a0 -; RV32IA-WMO-NEXT: not a4, a4 ; RV32IA-WMO-NEXT: and a1, a1, a3 +; RV32IA-WMO-NEXT: not a3, a4 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: or a1, a1, a4 +; RV32IA-WMO-NEXT: or a1, a1, a3 ; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) ; RV32IA-WMO-NEXT: srl a0, a1, a0 ; RV32IA-WMO-NEXT: ret @@ -13204,10 +13204,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV32IA-TSO-NEXT: lui a3, 16 ; RV32IA-TSO-NEXT: addi a3, a3, -1 ; RV32IA-TSO-NEXT: sll a4, a3, a0 -; RV32IA-TSO-NEXT: not a4, a4 ; RV32IA-TSO-NEXT: and a1, a1, a3 +; RV32IA-TSO-NEXT: not a3, a4 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: or a1, a1, a4 +; RV32IA-TSO-NEXT: or a1, a1, a3 ; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) ; RV32IA-TSO-NEXT: srl a0, a1, a0 ; RV32IA-TSO-NEXT: ret @@ -13229,10 +13229,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-WMO-NOZACAS-NEXT: not a4, a4 ; RV64IA-WMO-NOZACAS-NEXT: and a1, a1, a3 +; RV64IA-WMO-NOZACAS-NEXT: not a3, a4 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, a4 +; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-NOZACAS-NEXT: amoand.w.aqrl a1, a1, (a2) ; 
RV64IA-WMO-NOZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: ret @@ -13244,10 +13244,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-TSO-NOZACAS-NEXT: not a4, a4 ; RV64IA-TSO-NOZACAS-NEXT: and a1, a1, a3 +; RV64IA-TSO-NOZACAS-NEXT: not a3, a4 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a4 +; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-NOZACAS-NEXT: amoand.w a1, a1, (a2) ; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: ret @@ -13259,10 +13259,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-ZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-WMO-ZACAS-NEXT: not a4, a4 ; RV64IA-WMO-ZACAS-NEXT: and a1, a1, a3 +; RV64IA-WMO-ZACAS-NEXT: not a3, a4 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a4 +; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-ZACAS-NEXT: amoand.w.aqrl a1, a1, (a2) ; RV64IA-WMO-ZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: ret @@ -13274,10 +13274,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-ZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-TSO-ZACAS-NEXT: not a4, a4 ; RV64IA-TSO-ZACAS-NEXT: and a1, a1, a3 +; RV64IA-TSO-ZACAS-NEXT: not a3, a4 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a4 +; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-ZACAS-NEXT: amoand.w a1, a1, (a2) ; RV64IA-TSO-ZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: ret @@ -13313,10 +13313,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV32IA-WMO-NEXT: lui a3, 16 ; RV32IA-WMO-NEXT: addi a3, a3, -1 ; RV32IA-WMO-NEXT: sll a4, a3, a0 -; RV32IA-WMO-NEXT: not a4, a4 ; RV32IA-WMO-NEXT: and a1, a1, a3 +; RV32IA-WMO-NEXT: not a3, a4 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: or a1, a1, a4 +; RV32IA-WMO-NEXT: or a1, a1, a3 ; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) ; RV32IA-WMO-NEXT: srl a0, a1, a0 ; RV32IA-WMO-NEXT: ret @@ -13328,10 +13328,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV32IA-TSO-NEXT: lui a3, 16 ; RV32IA-TSO-NEXT: addi a3, a3, -1 ; RV32IA-TSO-NEXT: sll a4, a3, a0 -; RV32IA-TSO-NEXT: not a4, a4 ; RV32IA-TSO-NEXT: and a1, a1, a3 +; RV32IA-TSO-NEXT: not a3, a4 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: or a1, a1, a4 +; RV32IA-TSO-NEXT: or a1, a1, a3 ; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) ; RV32IA-TSO-NEXT: srl a0, a1, a0 ; RV32IA-TSO-NEXT: ret @@ -13353,10 +13353,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-WMO-NOZACAS-NEXT: not a4, a4 ; RV64IA-WMO-NOZACAS-NEXT: and a1, a1, a3 +; RV64IA-WMO-NOZACAS-NEXT: not a3, a4 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, a4 +; RV64IA-WMO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-NOZACAS-NEXT: amoand.w.aqrl a1, a1, (a2) ; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-WMO-NOZACAS-NEXT: ret @@ -13368,10 +13368,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-NOZACAS-NEXT: sllw 
a4, a3, a0 -; RV64IA-TSO-NOZACAS-NEXT: not a4, a4 ; RV64IA-TSO-NOZACAS-NEXT: and a1, a1, a3 +; RV64IA-TSO-NOZACAS-NEXT: not a3, a4 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a4 +; RV64IA-TSO-NOZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-NOZACAS-NEXT: amoand.w a1, a1, (a2) ; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-TSO-NOZACAS-NEXT: ret @@ -13383,10 +13383,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-ZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-ZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-WMO-ZACAS-NEXT: not a4, a4 ; RV64IA-WMO-ZACAS-NEXT: and a1, a1, a3 +; RV64IA-WMO-ZACAS-NEXT: not a3, a4 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a4 +; RV64IA-WMO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-WMO-ZACAS-NEXT: amoand.w.aqrl a1, a1, (a2) ; RV64IA-WMO-ZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-WMO-ZACAS-NEXT: ret @@ -13398,10 +13398,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64IA-TSO-ZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-ZACAS-NEXT: sllw a4, a3, a0 -; RV64IA-TSO-ZACAS-NEXT: not a4, a4 ; RV64IA-TSO-ZACAS-NEXT: and a1, a1, a3 +; RV64IA-TSO-ZACAS-NEXT: not a3, a4 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a4 +; RV64IA-TSO-ZACAS-NEXT: or a1, a1, a3 ; RV64IA-TSO-ZACAS-NEXT: amoand.w a1, a1, (a2) ; RV64IA-TSO-ZACAS-NEXT: srlw a0, a1, a0 ; RV64IA-TSO-ZACAS-NEXT: ret @@ -14411,8 +14411,8 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-ZABHA-ZACAS-NEXT: .LBB99_1: # %atomicrmw.start ; RV64IA-WMO-ZABHA-ZACAS-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64IA-WMO-ZABHA-ZACAS-NEXT: and a3, a0, a1 -; RV64IA-WMO-ZABHA-ZACAS-NEXT: not a3, a3 ; RV64IA-WMO-ZABHA-ZACAS-NEXT: fence rw, rw +; RV64IA-WMO-ZABHA-ZACAS-NEXT: not a3, a3 ; RV64IA-WMO-ZABHA-ZACAS-NEXT: slli a4, a0, 48 ; RV64IA-WMO-ZABHA-ZACAS-NEXT: amocas.h.aqrl a0, a3, (a2) ; RV64IA-WMO-ZABHA-ZACAS-NEXT: srai a4, a4, 48 @@ -14427,8 +14427,8 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64IA-TSO-ZABHA-ZACAS-NEXT: .LBB99_1: # %atomicrmw.start ; RV64IA-TSO-ZABHA-ZACAS-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64IA-TSO-ZABHA-ZACAS-NEXT: and a3, a0, a1 -; RV64IA-TSO-ZABHA-ZACAS-NEXT: not a3, a3 ; RV64IA-TSO-ZABHA-ZACAS-NEXT: fence rw, rw +; RV64IA-TSO-ZABHA-ZACAS-NEXT: not a3, a3 ; RV64IA-TSO-ZABHA-ZACAS-NEXT: slli a4, a0, 48 ; RV64IA-TSO-ZABHA-ZACAS-NEXT: amocas.h a0, a3, (a2) ; RV64IA-TSO-ZABHA-ZACAS-NEXT: srai a4, a4, 48 @@ -15420,31 +15420,31 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: lui a4, 16 -; RV32IA-NEXT: addi a4, a4, -1 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: slli a1, a1, 16 +; RV32IA-NEXT: li a4, 16 +; RV32IA-NEXT: andi a5, a0, 24 +; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: srai a1, a1, 16 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: li a5, 16 -; RV32IA-NEXT: sub a5, a5, a3 +; RV32IA-NEXT: sub a4, a4, a5 ; RV32IA-NEXT: .LBB110_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: lr.w a3, (a2) -; RV32IA-NEXT: and a7, a3, a4 -; RV32IA-NEXT: mv a6, a3 -; RV32IA-NEXT: sll a7, a7, a5 -; RV32IA-NEXT: sra a7, a7, a5 +; RV32IA-NEXT: lr.w a5, (a2) +; RV32IA-NEXT: and a7, a5, a3 +; RV32IA-NEXT: mv a6, a5 +; RV32IA-NEXT: sll a7, a7, a4 +; 
RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a7, a1, .LBB110_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB110_1 Depth=1 -; RV32IA-NEXT: xor a6, a3, a1 -; RV32IA-NEXT: and a6, a6, a4 -; RV32IA-NEXT: xor a6, a3, a6 +; RV32IA-NEXT: xor a6, a5, a1 +; RV32IA-NEXT: and a6, a6, a3 +; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB110_3: # in Loop: Header=BB110_1 Depth=1 ; RV32IA-NEXT: sc.w a6, a6, (a2) ; RV32IA-NEXT: bnez a6, .LBB110_1 ; RV32IA-NEXT: # %bb.4: -; RV32IA-NEXT: srl a0, a3, a0 +; RV32IA-NEXT: srl a0, a5, a0 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_max_i16_monotonic: @@ -15493,62 +15493,62 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64IA-NOZACAS: # %bb.0: ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-NOZACAS-NEXT: lui a4, 16 -; RV64IA-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-NOZACAS-NEXT: lui a3, 16 ; RV64IA-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-NOZACAS-NEXT: li a4, 48 +; RV64IA-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-NOZACAS-NEXT: li a5, 48 -; RV64IA-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-NOZACAS-NEXT: .LBB110_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-NOZACAS-NEXT: lr.w a3, (a2) -; RV64IA-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-NOZACAS-NEXT: mv a6, a3 -; RV64IA-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-NOZACAS-NEXT: lr.w a5, (a2) +; RV64IA-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-NOZACAS-NEXT: mv a6, a5 +; RV64IA-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-NOZACAS-NEXT: bge a7, a1, .LBB110_3 ; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB110_1 Depth=1 -; RV64IA-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-NOZACAS-NEXT: .LBB110_3: # in Loop: Header=BB110_1 Depth=1 ; RV64IA-NOZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-NOZACAS-NEXT: bnez a6, .LBB110_1 ; RV64IA-NOZACAS-NEXT: # %bb.4: -; RV64IA-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-NOZACAS-NEXT: ret ; ; RV64IA-ZACAS-LABEL: atomicrmw_max_i16_monotonic: ; RV64IA-ZACAS: # %bb.0: ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-ZACAS-NEXT: lui a4, 16 -; RV64IA-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: lui a3, 16 ; RV64IA-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-ZACAS-NEXT: li a4, 48 +; RV64IA-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-ZACAS-NEXT: li a5, 48 -; RV64IA-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-ZACAS-NEXT: .LBB110_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-ZACAS-NEXT: lr.w a3, (a2) -; RV64IA-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-ZACAS-NEXT: mv a6, a3 -; RV64IA-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-ZACAS-NEXT: lr.w a5, (a2) +; RV64IA-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-ZACAS-NEXT: mv a6, a5 +; RV64IA-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-ZACAS-NEXT: sra a7, 
a7, a4 ; RV64IA-ZACAS-NEXT: bge a7, a1, .LBB110_3 ; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB110_1 Depth=1 -; RV64IA-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-ZACAS-NEXT: .LBB110_3: # in Loop: Header=BB110_1 Depth=1 ; RV64IA-ZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-ZACAS-NEXT: bnez a6, .LBB110_1 ; RV64IA-ZACAS-NEXT: # %bb.4: -; RV64IA-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-ZACAS-NEXT: ret ; ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_monotonic: @@ -15611,62 +15611,62 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: lui a4, 16 -; RV32IA-WMO-NEXT: addi a4, a4, -1 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: lui a3, 16 ; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: li a4, 16 +; RV32IA-WMO-NEXT: andi a5, a0, 24 +; RV32IA-WMO-NEXT: addi a3, a3, -1 ; RV32IA-WMO-NEXT: srai a1, a1, 16 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: li a5, 16 -; RV32IA-WMO-NEXT: sub a5, a5, a3 +; RV32IA-WMO-NEXT: sub a4, a4, a5 ; RV32IA-WMO-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-WMO-NEXT: lr.w.aq a3, (a2) -; RV32IA-WMO-NEXT: and a7, a3, a4 -; RV32IA-WMO-NEXT: mv a6, a3 -; RV32IA-WMO-NEXT: sll a7, a7, a5 -; RV32IA-WMO-NEXT: sra a7, a7, a5 +; RV32IA-WMO-NEXT: lr.w.aq a5, (a2) +; RV32IA-WMO-NEXT: and a7, a5, a3 +; RV32IA-WMO-NEXT: mv a6, a5 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a7, a1, .LBB111_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1 -; RV32IA-WMO-NEXT: xor a6, a3, a1 -; RV32IA-WMO-NEXT: and a6, a6, a4 -; RV32IA-WMO-NEXT: xor a6, a3, a6 +; RV32IA-WMO-NEXT: xor a6, a5, a1 +; RV32IA-WMO-NEXT: and a6, a6, a3 +; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w a6, a6, (a2) ; RV32IA-WMO-NEXT: bnez a6, .LBB111_1 ; RV32IA-WMO-NEXT: # %bb.4: -; RV32IA-WMO-NEXT: srl a0, a3, a0 +; RV32IA-WMO-NEXT: srl a0, a5, a0 ; RV32IA-WMO-NEXT: ret ; ; RV32IA-TSO-LABEL: atomicrmw_max_i16_acquire: ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: lui a4, 16 -; RV32IA-TSO-NEXT: addi a4, a4, -1 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: lui a3, 16 ; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: li a4, 16 +; RV32IA-TSO-NEXT: andi a5, a0, 24 +; RV32IA-TSO-NEXT: addi a3, a3, -1 ; RV32IA-TSO-NEXT: srai a1, a1, 16 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: li a5, 16 -; RV32IA-TSO-NEXT: sub a5, a5, a3 +; RV32IA-TSO-NEXT: sub a4, a4, a5 ; RV32IA-TSO-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-TSO-NEXT: lr.w a3, (a2) -; RV32IA-TSO-NEXT: and a7, a3, a4 -; RV32IA-TSO-NEXT: mv a6, a3 -; RV32IA-TSO-NEXT: sll a7, a7, a5 -; RV32IA-TSO-NEXT: sra a7, a7, a5 +; RV32IA-TSO-NEXT: lr.w a5, (a2) +; RV32IA-TSO-NEXT: and a7, a5, a3 +; RV32IA-TSO-NEXT: mv a6, a5 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a7, a1, .LBB111_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1 -; RV32IA-TSO-NEXT: xor a6, a3, a1 -; 
RV32IA-TSO-NEXT: and a6, a6, a4 -; RV32IA-TSO-NEXT: xor a6, a3, a6 +; RV32IA-TSO-NEXT: xor a6, a5, a1 +; RV32IA-TSO-NEXT: and a6, a6, a3 +; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) ; RV32IA-TSO-NEXT: bnez a6, .LBB111_1 ; RV32IA-TSO-NEXT: # %bb.4: -; RV32IA-TSO-NEXT: srl a0, a3, a0 +; RV32IA-TSO-NEXT: srl a0, a5, a0 ; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_max_i16_acquire: @@ -15715,124 +15715,124 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: li a4, 48 +; RV64IA-WMO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: li a5, 48 -; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-NOZACAS-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a3, (a2) -; RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2) +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB111_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB111_1 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.4: -; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-NOZACAS-NEXT: ret ; ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_acquire: ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: li a4, 48 +; RV64IA-TSO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: li a5, 48 -; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-NOZACAS-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, 
a5 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge a7, a1, .LBB111_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB111_1 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.4: -; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-NOZACAS-NEXT: ret ; ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_acquire: ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: li a4, 48 +; RV64IA-WMO-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: li a5, 48 -; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-ZACAS-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a3, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-ZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2) +; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB111_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB111_1 ; RV64IA-WMO-ZACAS-NEXT: # %bb.4: -; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-ZACAS-NEXT: ret ; ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_acquire: ; RV64IA-TSO-ZACAS: # %bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: li a4, 48 +; RV64IA-TSO-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: li a5, 48 -; 
RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-ZACAS-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-ZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB111_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB111_1 Depth=1 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB111_3: # in Loop: Header=BB111_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB111_1 ; RV64IA-TSO-ZACAS-NEXT: # %bb.4: -; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-ZACAS-NEXT: ret ; ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_acquire: @@ -15895,62 +15895,62 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: lui a4, 16 -; RV32IA-WMO-NEXT: addi a4, a4, -1 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: lui a3, 16 ; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: li a4, 16 +; RV32IA-WMO-NEXT: andi a5, a0, 24 +; RV32IA-WMO-NEXT: addi a3, a3, -1 ; RV32IA-WMO-NEXT: srai a1, a1, 16 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: li a5, 16 -; RV32IA-WMO-NEXT: sub a5, a5, a3 +; RV32IA-WMO-NEXT: sub a4, a4, a5 ; RV32IA-WMO-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-WMO-NEXT: lr.w a3, (a2) -; RV32IA-WMO-NEXT: and a7, a3, a4 -; RV32IA-WMO-NEXT: mv a6, a3 -; RV32IA-WMO-NEXT: sll a7, a7, a5 -; RV32IA-WMO-NEXT: sra a7, a7, a5 +; RV32IA-WMO-NEXT: lr.w a5, (a2) +; RV32IA-WMO-NEXT: and a7, a5, a3 +; RV32IA-WMO-NEXT: mv a6, a5 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a7, a1, .LBB112_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1 -; RV32IA-WMO-NEXT: xor a6, a3, a1 -; RV32IA-WMO-NEXT: and a6, a6, a4 -; RV32IA-WMO-NEXT: xor a6, a3, a6 +; RV32IA-WMO-NEXT: xor a6, a5, a1 +; RV32IA-WMO-NEXT: and a6, a6, a3 +; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2) ; RV32IA-WMO-NEXT: bnez a6, .LBB112_1 ; RV32IA-WMO-NEXT: # %bb.4: -; RV32IA-WMO-NEXT: srl a0, a3, a0 +; RV32IA-WMO-NEXT: srl a0, a5, a0 ; RV32IA-WMO-NEXT: ret ; ; RV32IA-TSO-LABEL: atomicrmw_max_i16_release: ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: lui a4, 16 -; RV32IA-TSO-NEXT: addi a4, a4, -1 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: lui a3, 16 ; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: li a4, 16 +; RV32IA-TSO-NEXT: andi a5, a0, 24 +; RV32IA-TSO-NEXT: addi a3, a3, -1 ; RV32IA-TSO-NEXT: srai a1, a1, 16 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: li a5, 16 -; RV32IA-TSO-NEXT: sub a5, a5, 
a3 +; RV32IA-TSO-NEXT: sub a4, a4, a5 ; RV32IA-TSO-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-TSO-NEXT: lr.w a3, (a2) -; RV32IA-TSO-NEXT: and a7, a3, a4 -; RV32IA-TSO-NEXT: mv a6, a3 -; RV32IA-TSO-NEXT: sll a7, a7, a5 -; RV32IA-TSO-NEXT: sra a7, a7, a5 +; RV32IA-TSO-NEXT: lr.w a5, (a2) +; RV32IA-TSO-NEXT: and a7, a5, a3 +; RV32IA-TSO-NEXT: mv a6, a5 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a7, a1, .LBB112_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1 -; RV32IA-TSO-NEXT: xor a6, a3, a1 -; RV32IA-TSO-NEXT: and a6, a6, a4 -; RV32IA-TSO-NEXT: xor a6, a3, a6 +; RV32IA-TSO-NEXT: xor a6, a5, a1 +; RV32IA-TSO-NEXT: and a6, a6, a3 +; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) ; RV32IA-TSO-NEXT: bnez a6, .LBB112_1 ; RV32IA-TSO-NEXT: # %bb.4: -; RV32IA-TSO-NEXT: srl a0, a3, a0 +; RV32IA-TSO-NEXT: srl a0, a5, a0 ; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_max_i16_release: @@ -15999,124 +15999,124 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: li a4, 48 +; RV64IA-WMO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: li a5, 48 -; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-NOZACAS-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: lr.w a3, (a2) -; RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-NOZACAS-NEXT: lr.w a5, (a2) +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB112_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB112_1 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.4: -; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-NOZACAS-NEXT: ret ; ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_release: ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: li a4, 48 
+; RV64IA-TSO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: li a5, 48 -; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-NOZACAS-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge a7, a1, .LBB112_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB112_1 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.4: -; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-NOZACAS-NEXT: ret ; ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_release: ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: li a4, 48 +; RV64IA-WMO-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: li a5, 48 -; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-ZACAS-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-ZACAS-NEXT: lr.w a3, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-ZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-ZACAS-NEXT: lr.w a5, (a2) +; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB112_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB112_1 ; RV64IA-WMO-ZACAS-NEXT: # %bb.4: -; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-ZACAS-NEXT: ret ; ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_release: ; RV64IA-TSO-ZACAS: # %bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, 
a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: li a4, 48 +; RV64IA-TSO-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: li a5, 48 -; RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-ZACAS-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-ZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB112_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB112_1 Depth=1 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB112_3: # in Loop: Header=BB112_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB112_1 ; RV64IA-TSO-ZACAS-NEXT: # %bb.4: -; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-ZACAS-NEXT: ret ; ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_release: @@ -16179,62 +16179,62 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: lui a4, 16 -; RV32IA-WMO-NEXT: addi a4, a4, -1 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: lui a3, 16 ; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: li a4, 16 +; RV32IA-WMO-NEXT: andi a5, a0, 24 +; RV32IA-WMO-NEXT: addi a3, a3, -1 ; RV32IA-WMO-NEXT: srai a1, a1, 16 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: li a5, 16 -; RV32IA-WMO-NEXT: sub a5, a5, a3 +; RV32IA-WMO-NEXT: sub a4, a4, a5 ; RV32IA-WMO-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-WMO-NEXT: lr.w.aq a3, (a2) -; RV32IA-WMO-NEXT: and a7, a3, a4 -; RV32IA-WMO-NEXT: mv a6, a3 -; RV32IA-WMO-NEXT: sll a7, a7, a5 -; RV32IA-WMO-NEXT: sra a7, a7, a5 +; RV32IA-WMO-NEXT: lr.w.aq a5, (a2) +; RV32IA-WMO-NEXT: and a7, a5, a3 +; RV32IA-WMO-NEXT: mv a6, a5 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a7, a1, .LBB113_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1 -; RV32IA-WMO-NEXT: xor a6, a3, a1 -; RV32IA-WMO-NEXT: and a6, a6, a4 -; RV32IA-WMO-NEXT: xor a6, a3, a6 +; RV32IA-WMO-NEXT: xor a6, a5, a1 +; RV32IA-WMO-NEXT: and a6, a6, a3 +; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2) ; RV32IA-WMO-NEXT: bnez a6, .LBB113_1 ; RV32IA-WMO-NEXT: # %bb.4: -; RV32IA-WMO-NEXT: srl a0, a3, a0 +; RV32IA-WMO-NEXT: srl a0, a5, a0 ; RV32IA-WMO-NEXT: ret ; ; RV32IA-TSO-LABEL: 
atomicrmw_max_i16_acq_rel: ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: lui a4, 16 -; RV32IA-TSO-NEXT: addi a4, a4, -1 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: lui a3, 16 ; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: li a4, 16 +; RV32IA-TSO-NEXT: andi a5, a0, 24 +; RV32IA-TSO-NEXT: addi a3, a3, -1 ; RV32IA-TSO-NEXT: srai a1, a1, 16 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: li a5, 16 -; RV32IA-TSO-NEXT: sub a5, a5, a3 +; RV32IA-TSO-NEXT: sub a4, a4, a5 ; RV32IA-TSO-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-TSO-NEXT: lr.w a3, (a2) -; RV32IA-TSO-NEXT: and a7, a3, a4 -; RV32IA-TSO-NEXT: mv a6, a3 -; RV32IA-TSO-NEXT: sll a7, a7, a5 -; RV32IA-TSO-NEXT: sra a7, a7, a5 +; RV32IA-TSO-NEXT: lr.w a5, (a2) +; RV32IA-TSO-NEXT: and a7, a5, a3 +; RV32IA-TSO-NEXT: mv a6, a5 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a7, a1, .LBB113_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1 -; RV32IA-TSO-NEXT: xor a6, a3, a1 -; RV32IA-TSO-NEXT: and a6, a6, a4 -; RV32IA-TSO-NEXT: xor a6, a3, a6 +; RV32IA-TSO-NEXT: xor a6, a5, a1 +; RV32IA-TSO-NEXT: and a6, a6, a3 +; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) ; RV32IA-TSO-NEXT: bnez a6, .LBB113_1 ; RV32IA-TSO-NEXT: # %bb.4: -; RV32IA-TSO-NEXT: srl a0, a3, a0 +; RV32IA-TSO-NEXT: srl a0, a5, a0 ; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_max_i16_acq_rel: @@ -16283,124 +16283,124 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: li a4, 48 +; RV64IA-WMO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: li a5, 48 -; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-NOZACAS-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a3, (a2) -; RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2) +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a7, a1, .LBB113_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB113_1 ; 
RV64IA-WMO-NOZACAS-NEXT: # %bb.4: -; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-NOZACAS-NEXT: ret ; ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_acq_rel: ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: li a4, 48 +; RV64IA-TSO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: li a5, 48 -; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-NOZACAS-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge a7, a1, .LBB113_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB113_1 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.4: -; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-NOZACAS-NEXT: ret ; ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_acq_rel: ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: li a4, 48 +; RV64IA-WMO-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: li a5, 48 -; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-ZACAS-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a3, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-ZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2) +; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a7, a1, .LBB113_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1 -; 
RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB113_1 ; RV64IA-WMO-ZACAS-NEXT: # %bb.4: -; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-ZACAS-NEXT: ret ; ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_acq_rel: ; RV64IA-TSO-ZACAS: # %bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: li a4, 48 +; RV64IA-TSO-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: li a5, 48 -; RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-ZACAS-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-ZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a7, a1, .LBB113_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB113_1 Depth=1 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB113_3: # in Loop: Header=BB113_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB113_1 ; RV64IA-TSO-ZACAS-NEXT: # %bb.4: -; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-ZACAS-NEXT: ret ; ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_acq_rel: @@ -16463,31 +16463,31 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: lui a4, 16 -; RV32IA-NEXT: addi a4, a4, -1 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: slli a1, a1, 16 +; RV32IA-NEXT: li a4, 16 +; RV32IA-NEXT: andi a5, a0, 24 +; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: srai a1, a1, 16 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: li a5, 16 -; RV32IA-NEXT: sub a5, a5, a3 +; RV32IA-NEXT: sub a4, a4, a5 ; RV32IA-NEXT: .LBB114_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: lr.w.aqrl a3, (a2) -; RV32IA-NEXT: and a7, a3, a4 -; RV32IA-NEXT: mv a6, a3 -; RV32IA-NEXT: sll a7, a7, a5 -; RV32IA-NEXT: sra a7, a7, a5 +; RV32IA-NEXT: lr.w.aqrl a5, (a2) +; RV32IA-NEXT: and a7, a5, a3 +; RV32IA-NEXT: mv a6, a5 +; RV32IA-NEXT: sll a7, a7, a4 +; RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a7, a1, .LBB114_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB114_1 Depth=1 
-; RV32IA-NEXT: xor a6, a3, a1 -; RV32IA-NEXT: and a6, a6, a4 -; RV32IA-NEXT: xor a6, a3, a6 +; RV32IA-NEXT: xor a6, a5, a1 +; RV32IA-NEXT: and a6, a6, a3 +; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB114_3: # in Loop: Header=BB114_1 Depth=1 ; RV32IA-NEXT: sc.w.rl a6, a6, (a2) ; RV32IA-NEXT: bnez a6, .LBB114_1 ; RV32IA-NEXT: # %bb.4: -; RV32IA-NEXT: srl a0, a3, a0 +; RV32IA-NEXT: srl a0, a5, a0 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_max_i16_seq_cst: @@ -16536,62 +16536,62 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64IA-NOZACAS: # %bb.0: ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-NOZACAS-NEXT: lui a4, 16 -; RV64IA-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-NOZACAS-NEXT: lui a3, 16 ; RV64IA-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-NOZACAS-NEXT: li a4, 48 +; RV64IA-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-NOZACAS-NEXT: li a5, 48 -; RV64IA-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-NOZACAS-NEXT: .LBB114_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-NOZACAS-NEXT: lr.w.aqrl a3, (a2) -; RV64IA-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-NOZACAS-NEXT: mv a6, a3 -; RV64IA-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-NOZACAS-NEXT: lr.w.aqrl a5, (a2) +; RV64IA-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-NOZACAS-NEXT: mv a6, a5 +; RV64IA-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-NOZACAS-NEXT: bge a7, a1, .LBB114_3 ; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB114_1 Depth=1 -; RV64IA-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-NOZACAS-NEXT: .LBB114_3: # in Loop: Header=BB114_1 Depth=1 ; RV64IA-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-NOZACAS-NEXT: bnez a6, .LBB114_1 ; RV64IA-NOZACAS-NEXT: # %bb.4: -; RV64IA-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-NOZACAS-NEXT: ret ; ; RV64IA-ZACAS-LABEL: atomicrmw_max_i16_seq_cst: ; RV64IA-ZACAS: # %bb.0: ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-ZACAS-NEXT: lui a4, 16 -; RV64IA-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: lui a3, 16 ; RV64IA-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-ZACAS-NEXT: li a4, 48 +; RV64IA-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-ZACAS-NEXT: li a5, 48 -; RV64IA-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-ZACAS-NEXT: .LBB114_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-ZACAS-NEXT: lr.w.aqrl a3, (a2) -; RV64IA-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-ZACAS-NEXT: mv a6, a3 -; RV64IA-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-ZACAS-NEXT: lr.w.aqrl a5, (a2) +; RV64IA-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-ZACAS-NEXT: mv a6, a5 +; RV64IA-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-ZACAS-NEXT: bge a7, a1, .LBB114_3 ; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: 
Header=BB114_1 Depth=1 -; RV64IA-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-ZACAS-NEXT: .LBB114_3: # in Loop: Header=BB114_1 Depth=1 ; RV64IA-ZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-ZACAS-NEXT: bnez a6, .LBB114_1 ; RV64IA-ZACAS-NEXT: # %bb.4: -; RV64IA-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-ZACAS-NEXT: ret ; ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_seq_cst: @@ -16654,31 +16654,31 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: lui a4, 16 -; RV32IA-NEXT: addi a4, a4, -1 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: slli a1, a1, 16 +; RV32IA-NEXT: li a4, 16 +; RV32IA-NEXT: andi a5, a0, 24 +; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: srai a1, a1, 16 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: li a5, 16 -; RV32IA-NEXT: sub a5, a5, a3 +; RV32IA-NEXT: sub a4, a4, a5 ; RV32IA-NEXT: .LBB115_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: lr.w a3, (a2) -; RV32IA-NEXT: and a7, a3, a4 -; RV32IA-NEXT: mv a6, a3 -; RV32IA-NEXT: sll a7, a7, a5 -; RV32IA-NEXT: sra a7, a7, a5 +; RV32IA-NEXT: lr.w a5, (a2) +; RV32IA-NEXT: and a7, a5, a3 +; RV32IA-NEXT: mv a6, a5 +; RV32IA-NEXT: sll a7, a7, a4 +; RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a1, a7, .LBB115_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB115_1 Depth=1 -; RV32IA-NEXT: xor a6, a3, a1 -; RV32IA-NEXT: and a6, a6, a4 -; RV32IA-NEXT: xor a6, a3, a6 +; RV32IA-NEXT: xor a6, a5, a1 +; RV32IA-NEXT: and a6, a6, a3 +; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB115_3: # in Loop: Header=BB115_1 Depth=1 ; RV32IA-NEXT: sc.w a6, a6, (a2) ; RV32IA-NEXT: bnez a6, .LBB115_1 ; RV32IA-NEXT: # %bb.4: -; RV32IA-NEXT: srl a0, a3, a0 +; RV32IA-NEXT: srl a0, a5, a0 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_min_i16_monotonic: @@ -16727,62 +16727,62 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64IA-NOZACAS: # %bb.0: ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-NOZACAS-NEXT: lui a4, 16 -; RV64IA-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-NOZACAS-NEXT: lui a3, 16 ; RV64IA-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-NOZACAS-NEXT: li a4, 48 +; RV64IA-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-NOZACAS-NEXT: li a5, 48 -; RV64IA-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-NOZACAS-NEXT: .LBB115_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-NOZACAS-NEXT: lr.w a3, (a2) -; RV64IA-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-NOZACAS-NEXT: mv a6, a3 -; RV64IA-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-NOZACAS-NEXT: lr.w a5, (a2) +; RV64IA-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-NOZACAS-NEXT: mv a6, a5 +; RV64IA-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-NOZACAS-NEXT: bge a1, a7, .LBB115_3 ; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB115_1 Depth=1 -; RV64IA-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-NOZACAS-NEXT: 
xor a6, a3, a6 +; RV64IA-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-NOZACAS-NEXT: .LBB115_3: # in Loop: Header=BB115_1 Depth=1 ; RV64IA-NOZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-NOZACAS-NEXT: bnez a6, .LBB115_1 ; RV64IA-NOZACAS-NEXT: # %bb.4: -; RV64IA-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-NOZACAS-NEXT: ret ; ; RV64IA-ZACAS-LABEL: atomicrmw_min_i16_monotonic: ; RV64IA-ZACAS: # %bb.0: ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-ZACAS-NEXT: lui a4, 16 -; RV64IA-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: lui a3, 16 ; RV64IA-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-ZACAS-NEXT: li a4, 48 +; RV64IA-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-ZACAS-NEXT: li a5, 48 -; RV64IA-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-ZACAS-NEXT: .LBB115_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-ZACAS-NEXT: lr.w a3, (a2) -; RV64IA-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-ZACAS-NEXT: mv a6, a3 -; RV64IA-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-ZACAS-NEXT: lr.w a5, (a2) +; RV64IA-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-ZACAS-NEXT: mv a6, a5 +; RV64IA-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-ZACAS-NEXT: bge a1, a7, .LBB115_3 ; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB115_1 Depth=1 -; RV64IA-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-ZACAS-NEXT: .LBB115_3: # in Loop: Header=BB115_1 Depth=1 ; RV64IA-ZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-ZACAS-NEXT: bnez a6, .LBB115_1 ; RV64IA-ZACAS-NEXT: # %bb.4: -; RV64IA-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-ZACAS-NEXT: ret ; ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_monotonic: @@ -16845,62 +16845,62 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: lui a4, 16 -; RV32IA-WMO-NEXT: addi a4, a4, -1 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: lui a3, 16 ; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: li a4, 16 +; RV32IA-WMO-NEXT: andi a5, a0, 24 +; RV32IA-WMO-NEXT: addi a3, a3, -1 ; RV32IA-WMO-NEXT: srai a1, a1, 16 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: li a5, 16 -; RV32IA-WMO-NEXT: sub a5, a5, a3 +; RV32IA-WMO-NEXT: sub a4, a4, a5 ; RV32IA-WMO-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-WMO-NEXT: lr.w.aq a3, (a2) -; RV32IA-WMO-NEXT: and a7, a3, a4 -; RV32IA-WMO-NEXT: mv a6, a3 -; RV32IA-WMO-NEXT: sll a7, a7, a5 -; RV32IA-WMO-NEXT: sra a7, a7, a5 +; RV32IA-WMO-NEXT: lr.w.aq a5, (a2) +; RV32IA-WMO-NEXT: and a7, a5, a3 +; RV32IA-WMO-NEXT: mv a6, a5 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a1, a7, .LBB116_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1 -; RV32IA-WMO-NEXT: xor a6, a3, a1 -; RV32IA-WMO-NEXT: and a6, a6, a4 -; RV32IA-WMO-NEXT: xor a6, a3, a6 +; RV32IA-WMO-NEXT: xor a6, a5, a1 +; 
RV32IA-WMO-NEXT: and a6, a6, a3 +; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w a6, a6, (a2) ; RV32IA-WMO-NEXT: bnez a6, .LBB116_1 ; RV32IA-WMO-NEXT: # %bb.4: -; RV32IA-WMO-NEXT: srl a0, a3, a0 +; RV32IA-WMO-NEXT: srl a0, a5, a0 ; RV32IA-WMO-NEXT: ret ; ; RV32IA-TSO-LABEL: atomicrmw_min_i16_acquire: ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: lui a4, 16 -; RV32IA-TSO-NEXT: addi a4, a4, -1 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: lui a3, 16 ; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: li a4, 16 +; RV32IA-TSO-NEXT: andi a5, a0, 24 +; RV32IA-TSO-NEXT: addi a3, a3, -1 ; RV32IA-TSO-NEXT: srai a1, a1, 16 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: li a5, 16 -; RV32IA-TSO-NEXT: sub a5, a5, a3 +; RV32IA-TSO-NEXT: sub a4, a4, a5 ; RV32IA-TSO-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-TSO-NEXT: lr.w a3, (a2) -; RV32IA-TSO-NEXT: and a7, a3, a4 -; RV32IA-TSO-NEXT: mv a6, a3 -; RV32IA-TSO-NEXT: sll a7, a7, a5 -; RV32IA-TSO-NEXT: sra a7, a7, a5 +; RV32IA-TSO-NEXT: lr.w a5, (a2) +; RV32IA-TSO-NEXT: and a7, a5, a3 +; RV32IA-TSO-NEXT: mv a6, a5 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a1, a7, .LBB116_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1 -; RV32IA-TSO-NEXT: xor a6, a3, a1 -; RV32IA-TSO-NEXT: and a6, a6, a4 -; RV32IA-TSO-NEXT: xor a6, a3, a6 +; RV32IA-TSO-NEXT: xor a6, a5, a1 +; RV32IA-TSO-NEXT: and a6, a6, a3 +; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) ; RV32IA-TSO-NEXT: bnez a6, .LBB116_1 ; RV32IA-TSO-NEXT: # %bb.4: -; RV32IA-TSO-NEXT: srl a0, a3, a0 +; RV32IA-TSO-NEXT: srl a0, a5, a0 ; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_min_i16_acquire: @@ -16949,124 +16949,124 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: li a4, 48 +; RV64IA-WMO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: li a5, 48 -; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-NOZACAS-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a3, (a2) -; RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2) +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB116_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 -; 
RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB116_1 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.4: -; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-NOZACAS-NEXT: ret ; ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_acquire: ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: li a4, 48 +; RV64IA-TSO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: li a5, 48 -; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-NOZACAS-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB116_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB116_1 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.4: -; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-NOZACAS-NEXT: ret ; ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_acquire: ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: li a4, 48 +; RV64IA-WMO-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: li a5, 48 -; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-ZACAS-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a3, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-ZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-ZACAS-NEXT: 
lr.w.aq a5, (a2) +; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB116_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB116_1 ; RV64IA-WMO-ZACAS-NEXT: # %bb.4: -; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-ZACAS-NEXT: ret ; ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_acquire: ; RV64IA-TSO-ZACAS: # %bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: li a4, 48 +; RV64IA-TSO-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: li a5, 48 -; RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-ZACAS-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-ZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB116_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB116_1 Depth=1 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB116_3: # in Loop: Header=BB116_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB116_1 ; RV64IA-TSO-ZACAS-NEXT: # %bb.4: -; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-ZACAS-NEXT: ret ; ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_acquire: @@ -17129,62 +17129,62 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: lui a4, 16 -; RV32IA-WMO-NEXT: addi a4, a4, -1 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: lui a3, 16 ; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: li a4, 16 +; RV32IA-WMO-NEXT: andi a5, a0, 24 +; RV32IA-WMO-NEXT: addi a3, a3, -1 ; RV32IA-WMO-NEXT: srai a1, a1, 16 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: li a5, 16 -; RV32IA-WMO-NEXT: sub a5, a5, a3 +; RV32IA-WMO-NEXT: sub a4, a4, a5 ; RV32IA-WMO-NEXT: .LBB117_1: # =>This Inner Loop Header: 
Depth=1 -; RV32IA-WMO-NEXT: lr.w a3, (a2) -; RV32IA-WMO-NEXT: and a7, a3, a4 -; RV32IA-WMO-NEXT: mv a6, a3 -; RV32IA-WMO-NEXT: sll a7, a7, a5 -; RV32IA-WMO-NEXT: sra a7, a7, a5 +; RV32IA-WMO-NEXT: lr.w a5, (a2) +; RV32IA-WMO-NEXT: and a7, a5, a3 +; RV32IA-WMO-NEXT: mv a6, a5 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a1, a7, .LBB117_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1 -; RV32IA-WMO-NEXT: xor a6, a3, a1 -; RV32IA-WMO-NEXT: and a6, a6, a4 -; RV32IA-WMO-NEXT: xor a6, a3, a6 +; RV32IA-WMO-NEXT: xor a6, a5, a1 +; RV32IA-WMO-NEXT: and a6, a6, a3 +; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2) ; RV32IA-WMO-NEXT: bnez a6, .LBB117_1 ; RV32IA-WMO-NEXT: # %bb.4: -; RV32IA-WMO-NEXT: srl a0, a3, a0 +; RV32IA-WMO-NEXT: srl a0, a5, a0 ; RV32IA-WMO-NEXT: ret ; ; RV32IA-TSO-LABEL: atomicrmw_min_i16_release: ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: lui a4, 16 -; RV32IA-TSO-NEXT: addi a4, a4, -1 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: lui a3, 16 ; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: li a4, 16 +; RV32IA-TSO-NEXT: andi a5, a0, 24 +; RV32IA-TSO-NEXT: addi a3, a3, -1 ; RV32IA-TSO-NEXT: srai a1, a1, 16 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: li a5, 16 -; RV32IA-TSO-NEXT: sub a5, a5, a3 +; RV32IA-TSO-NEXT: sub a4, a4, a5 ; RV32IA-TSO-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-TSO-NEXT: lr.w a3, (a2) -; RV32IA-TSO-NEXT: and a7, a3, a4 -; RV32IA-TSO-NEXT: mv a6, a3 -; RV32IA-TSO-NEXT: sll a7, a7, a5 -; RV32IA-TSO-NEXT: sra a7, a7, a5 +; RV32IA-TSO-NEXT: lr.w a5, (a2) +; RV32IA-TSO-NEXT: and a7, a5, a3 +; RV32IA-TSO-NEXT: mv a6, a5 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a1, a7, .LBB117_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1 -; RV32IA-TSO-NEXT: xor a6, a3, a1 -; RV32IA-TSO-NEXT: and a6, a6, a4 -; RV32IA-TSO-NEXT: xor a6, a3, a6 +; RV32IA-TSO-NEXT: xor a6, a5, a1 +; RV32IA-TSO-NEXT: and a6, a6, a3 +; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) ; RV32IA-TSO-NEXT: bnez a6, .LBB117_1 ; RV32IA-TSO-NEXT: # %bb.4: -; RV32IA-TSO-NEXT: srl a0, a3, a0 +; RV32IA-TSO-NEXT: srl a0, a5, a0 ; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_min_i16_release: @@ -17233,124 +17233,124 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: li a4, 48 +; RV64IA-WMO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: li a5, 48 -; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-NOZACAS-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: lr.w a3, (a2) -; 
RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-NOZACAS-NEXT: lr.w a5, (a2) +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB117_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB117_1 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.4: -; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-NOZACAS-NEXT: ret ; ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_release: ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: li a4, 48 +; RV64IA-TSO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: li a5, 48 -; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-NOZACAS-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB117_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB117_1 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.4: -; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-NOZACAS-NEXT: ret ; ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_release: ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: li a4, 48 +; RV64IA-WMO-ZACAS-NEXT: andi a5, a0, 24 +; 
RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: li a5, 48 -; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-ZACAS-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-ZACAS-NEXT: lr.w a3, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-ZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-ZACAS-NEXT: lr.w a5, (a2) +; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB117_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB117_1 ; RV64IA-WMO-ZACAS-NEXT: # %bb.4: -; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-ZACAS-NEXT: ret ; ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_release: ; RV64IA-TSO-ZACAS: # %bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: li a4, 48 +; RV64IA-TSO-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: li a5, 48 -; RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-ZACAS-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-ZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB117_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB117_1 Depth=1 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB117_3: # in Loop: Header=BB117_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB117_1 ; RV64IA-TSO-ZACAS-NEXT: # %bb.4: -; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-ZACAS-NEXT: ret ; ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_release: @@ -17413,62 +17413,62 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: andi a2, a0, -4 ; RV32IA-WMO-NEXT: slli a0, 
a0, 3 -; RV32IA-WMO-NEXT: andi a3, a0, 24 -; RV32IA-WMO-NEXT: lui a4, 16 -; RV32IA-WMO-NEXT: addi a4, a4, -1 -; RV32IA-WMO-NEXT: sll a4, a4, a0 +; RV32IA-WMO-NEXT: lui a3, 16 ; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: li a4, 16 +; RV32IA-WMO-NEXT: andi a5, a0, 24 +; RV32IA-WMO-NEXT: addi a3, a3, -1 ; RV32IA-WMO-NEXT: srai a1, a1, 16 +; RV32IA-WMO-NEXT: sll a3, a3, a0 ; RV32IA-WMO-NEXT: sll a1, a1, a0 -; RV32IA-WMO-NEXT: li a5, 16 -; RV32IA-WMO-NEXT: sub a5, a5, a3 +; RV32IA-WMO-NEXT: sub a4, a4, a5 ; RV32IA-WMO-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-WMO-NEXT: lr.w.aq a3, (a2) -; RV32IA-WMO-NEXT: and a7, a3, a4 -; RV32IA-WMO-NEXT: mv a6, a3 -; RV32IA-WMO-NEXT: sll a7, a7, a5 -; RV32IA-WMO-NEXT: sra a7, a7, a5 +; RV32IA-WMO-NEXT: lr.w.aq a5, (a2) +; RV32IA-WMO-NEXT: and a7, a5, a3 +; RV32IA-WMO-NEXT: mv a6, a5 +; RV32IA-WMO-NEXT: sll a7, a7, a4 +; RV32IA-WMO-NEXT: sra a7, a7, a4 ; RV32IA-WMO-NEXT: bge a1, a7, .LBB118_3 ; RV32IA-WMO-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1 -; RV32IA-WMO-NEXT: xor a6, a3, a1 -; RV32IA-WMO-NEXT: and a6, a6, a4 -; RV32IA-WMO-NEXT: xor a6, a3, a6 +; RV32IA-WMO-NEXT: xor a6, a5, a1 +; RV32IA-WMO-NEXT: and a6, a6, a3 +; RV32IA-WMO-NEXT: xor a6, a5, a6 ; RV32IA-WMO-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1 ; RV32IA-WMO-NEXT: sc.w.rl a6, a6, (a2) ; RV32IA-WMO-NEXT: bnez a6, .LBB118_1 ; RV32IA-WMO-NEXT: # %bb.4: -; RV32IA-WMO-NEXT: srl a0, a3, a0 +; RV32IA-WMO-NEXT: srl a0, a5, a0 ; RV32IA-WMO-NEXT: ret ; ; RV32IA-TSO-LABEL: atomicrmw_min_i16_acq_rel: ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: andi a2, a0, -4 ; RV32IA-TSO-NEXT: slli a0, a0, 3 -; RV32IA-TSO-NEXT: andi a3, a0, 24 -; RV32IA-TSO-NEXT: lui a4, 16 -; RV32IA-TSO-NEXT: addi a4, a4, -1 -; RV32IA-TSO-NEXT: sll a4, a4, a0 +; RV32IA-TSO-NEXT: lui a3, 16 ; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: li a4, 16 +; RV32IA-TSO-NEXT: andi a5, a0, 24 +; RV32IA-TSO-NEXT: addi a3, a3, -1 ; RV32IA-TSO-NEXT: srai a1, a1, 16 +; RV32IA-TSO-NEXT: sll a3, a3, a0 ; RV32IA-TSO-NEXT: sll a1, a1, a0 -; RV32IA-TSO-NEXT: li a5, 16 -; RV32IA-TSO-NEXT: sub a5, a5, a3 +; RV32IA-TSO-NEXT: sub a4, a4, a5 ; RV32IA-TSO-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-TSO-NEXT: lr.w a3, (a2) -; RV32IA-TSO-NEXT: and a7, a3, a4 -; RV32IA-TSO-NEXT: mv a6, a3 -; RV32IA-TSO-NEXT: sll a7, a7, a5 -; RV32IA-TSO-NEXT: sra a7, a7, a5 +; RV32IA-TSO-NEXT: lr.w a5, (a2) +; RV32IA-TSO-NEXT: and a7, a5, a3 +; RV32IA-TSO-NEXT: mv a6, a5 +; RV32IA-TSO-NEXT: sll a7, a7, a4 +; RV32IA-TSO-NEXT: sra a7, a7, a4 ; RV32IA-TSO-NEXT: bge a1, a7, .LBB118_3 ; RV32IA-TSO-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1 -; RV32IA-TSO-NEXT: xor a6, a3, a1 -; RV32IA-TSO-NEXT: and a6, a6, a4 -; RV32IA-TSO-NEXT: xor a6, a3, a6 +; RV32IA-TSO-NEXT: xor a6, a5, a1 +; RV32IA-TSO-NEXT: and a6, a6, a3 +; RV32IA-TSO-NEXT: xor a6, a5, a6 ; RV32IA-TSO-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1 ; RV32IA-TSO-NEXT: sc.w a6, a6, (a2) ; RV32IA-TSO-NEXT: bnez a6, .LBB118_1 ; RV32IA-TSO-NEXT: # %bb.4: -; RV32IA-TSO-NEXT: srl a0, a3, a0 +; RV32IA-TSO-NEXT: srl a0, a5, a0 ; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_min_i16_acq_rel: @@ -17517,124 +17517,124 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64IA-WMO-NOZACAS: # %bb.0: ; RV64IA-WMO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-NOZACAS-NEXT: sllw a4, a4, a0 
+; RV64IA-WMO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: li a4, 48 +; RV64IA-WMO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-WMO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-NOZACAS-NEXT: li a5, 48 -; RV64IA-WMO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-NOZACAS-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a3, (a2) -; RV64IA-WMO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-NOZACAS-NEXT: lr.w.aq a5, (a2) +; RV64IA-WMO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-NOZACAS-NEXT: bge a1, a7, .LBB118_3 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1 -; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-WMO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-NOZACAS-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1 ; RV64IA-WMO-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-WMO-NOZACAS-NEXT: bnez a6, .LBB118_1 ; RV64IA-WMO-NOZACAS-NEXT: # %bb.4: -; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-NOZACAS-NEXT: ret ; ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_acq_rel: ; RV64IA-TSO-NOZACAS: # %bb.0: ; RV64IA-TSO-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-NOZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-NOZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: li a4, 48 +; RV64IA-TSO-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-NOZACAS-NEXT: li a5, 48 -; RV64IA-TSO-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-NOZACAS-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-NOZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-NOZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-NOZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-NOZACAS-NEXT: bge a1, a7, .LBB118_3 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-TSO-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-NOZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-NOZACAS-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1 ; RV64IA-TSO-NOZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-NOZACAS-NEXT: bnez a6, .LBB118_1 ; RV64IA-TSO-NOZACAS-NEXT: # %bb.4: -; 
RV64IA-TSO-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-NOZACAS-NEXT: ret ; ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_acq_rel: ; RV64IA-WMO-ZACAS: # %bb.0: ; RV64IA-WMO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-WMO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-WMO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-WMO-ZACAS-NEXT: lui a4, 16 -; RV64IA-WMO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-WMO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-WMO-ZACAS-NEXT: lui a3, 16 ; RV64IA-WMO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: li a4, 48 +; RV64IA-WMO-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-WMO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-WMO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-WMO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-WMO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-WMO-ZACAS-NEXT: li a5, 48 -; RV64IA-WMO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-WMO-ZACAS-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a3, (a2) -; RV64IA-WMO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-WMO-ZACAS-NEXT: mv a6, a3 -; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-WMO-ZACAS-NEXT: lr.w.aq a5, (a2) +; RV64IA-WMO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-WMO-ZACAS-NEXT: mv a6, a5 +; RV64IA-WMO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-WMO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-WMO-ZACAS-NEXT: bge a1, a7, .LBB118_3 ; RV64IA-WMO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-WMO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-WMO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-WMO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-WMO-ZACAS-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1 ; RV64IA-WMO-ZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-WMO-ZACAS-NEXT: bnez a6, .LBB118_1 ; RV64IA-WMO-ZACAS-NEXT: # %bb.4: -; RV64IA-WMO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-WMO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-WMO-ZACAS-NEXT: ret ; ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_acq_rel: ; RV64IA-TSO-ZACAS: # %bb.0: ; RV64IA-TSO-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-TSO-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-TSO-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-TSO-ZACAS-NEXT: lui a4, 16 -; RV64IA-TSO-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-TSO-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-TSO-ZACAS-NEXT: lui a3, 16 ; RV64IA-TSO-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: li a4, 48 +; RV64IA-TSO-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-TSO-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-TSO-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-TSO-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-TSO-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-TSO-ZACAS-NEXT: li a5, 48 -; RV64IA-TSO-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-TSO-ZACAS-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-TSO-ZACAS-NEXT: lr.w a3, (a2) -; RV64IA-TSO-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-TSO-ZACAS-NEXT: mv a6, a3 -; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-TSO-ZACAS-NEXT: lr.w a5, (a2) +; RV64IA-TSO-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-TSO-ZACAS-NEXT: mv a6, a5 +; RV64IA-TSO-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-TSO-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-TSO-ZACAS-NEXT: bge a1, a7, .LBB118_3 ; RV64IA-TSO-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB118_1 Depth=1 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-TSO-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-TSO-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a1 +; 
RV64IA-TSO-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-TSO-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-TSO-ZACAS-NEXT: .LBB118_3: # in Loop: Header=BB118_1 Depth=1 ; RV64IA-TSO-ZACAS-NEXT: sc.w a6, a6, (a2) ; RV64IA-TSO-ZACAS-NEXT: bnez a6, .LBB118_1 ; RV64IA-TSO-ZACAS-NEXT: # %bb.4: -; RV64IA-TSO-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-TSO-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-TSO-ZACAS-NEXT: ret ; ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_acq_rel: @@ -17697,31 +17697,31 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: lui a4, 16 -; RV32IA-NEXT: addi a4, a4, -1 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: slli a1, a1, 16 +; RV32IA-NEXT: li a4, 16 +; RV32IA-NEXT: andi a5, a0, 24 +; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: srai a1, a1, 16 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: li a5, 16 -; RV32IA-NEXT: sub a5, a5, a3 +; RV32IA-NEXT: sub a4, a4, a5 ; RV32IA-NEXT: .LBB119_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: lr.w.aqrl a3, (a2) -; RV32IA-NEXT: and a7, a3, a4 -; RV32IA-NEXT: mv a6, a3 -; RV32IA-NEXT: sll a7, a7, a5 -; RV32IA-NEXT: sra a7, a7, a5 +; RV32IA-NEXT: lr.w.aqrl a5, (a2) +; RV32IA-NEXT: and a7, a5, a3 +; RV32IA-NEXT: mv a6, a5 +; RV32IA-NEXT: sll a7, a7, a4 +; RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a1, a7, .LBB119_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB119_1 Depth=1 -; RV32IA-NEXT: xor a6, a3, a1 -; RV32IA-NEXT: and a6, a6, a4 -; RV32IA-NEXT: xor a6, a3, a6 +; RV32IA-NEXT: xor a6, a5, a1 +; RV32IA-NEXT: and a6, a6, a3 +; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB119_3: # in Loop: Header=BB119_1 Depth=1 ; RV32IA-NEXT: sc.w.rl a6, a6, (a2) ; RV32IA-NEXT: bnez a6, .LBB119_1 ; RV32IA-NEXT: # %bb.4: -; RV32IA-NEXT: srl a0, a3, a0 +; RV32IA-NEXT: srl a0, a5, a0 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_min_i16_seq_cst: @@ -17770,62 +17770,62 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64IA-NOZACAS: # %bb.0: ; RV64IA-NOZACAS-NEXT: andi a2, a0, -4 ; RV64IA-NOZACAS-NEXT: slli a0, a0, 3 -; RV64IA-NOZACAS-NEXT: andi a3, a0, 24 -; RV64IA-NOZACAS-NEXT: lui a4, 16 -; RV64IA-NOZACAS-NEXT: addi a4, a4, -1 -; RV64IA-NOZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-NOZACAS-NEXT: lui a3, 16 ; RV64IA-NOZACAS-NEXT: slli a1, a1, 48 +; RV64IA-NOZACAS-NEXT: li a4, 48 +; RV64IA-NOZACAS-NEXT: andi a5, a0, 24 +; RV64IA-NOZACAS-NEXT: addi a3, a3, -1 ; RV64IA-NOZACAS-NEXT: srai a1, a1, 48 +; RV64IA-NOZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-NOZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-NOZACAS-NEXT: li a5, 48 -; RV64IA-NOZACAS-NEXT: sub a5, a5, a3 +; RV64IA-NOZACAS-NEXT: sub a4, a4, a5 ; RV64IA-NOZACAS-NEXT: .LBB119_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-NOZACAS-NEXT: lr.w.aqrl a3, (a2) -; RV64IA-NOZACAS-NEXT: and a7, a3, a4 -; RV64IA-NOZACAS-NEXT: mv a6, a3 -; RV64IA-NOZACAS-NEXT: sll a7, a7, a5 -; RV64IA-NOZACAS-NEXT: sra a7, a7, a5 +; RV64IA-NOZACAS-NEXT: lr.w.aqrl a5, (a2) +; RV64IA-NOZACAS-NEXT: and a7, a5, a3 +; RV64IA-NOZACAS-NEXT: mv a6, a5 +; RV64IA-NOZACAS-NEXT: sll a7, a7, a4 +; RV64IA-NOZACAS-NEXT: sra a7, a7, a4 ; RV64IA-NOZACAS-NEXT: bge a1, a7, .LBB119_3 ; RV64IA-NOZACAS-NEXT: # %bb.2: # in Loop: Header=BB119_1 Depth=1 -; RV64IA-NOZACAS-NEXT: xor a6, a3, a1 -; RV64IA-NOZACAS-NEXT: and a6, a6, a4 -; RV64IA-NOZACAS-NEXT: xor a6, a3, a6 +; RV64IA-NOZACAS-NEXT: xor a6, a5, a1 +; RV64IA-NOZACAS-NEXT: and a6, a6, a3 +; RV64IA-NOZACAS-NEXT: xor a6, 
a5, a6 ; RV64IA-NOZACAS-NEXT: .LBB119_3: # in Loop: Header=BB119_1 Depth=1 ; RV64IA-NOZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-NOZACAS-NEXT: bnez a6, .LBB119_1 ; RV64IA-NOZACAS-NEXT: # %bb.4: -; RV64IA-NOZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-NOZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-NOZACAS-NEXT: ret ; ; RV64IA-ZACAS-LABEL: atomicrmw_min_i16_seq_cst: ; RV64IA-ZACAS: # %bb.0: ; RV64IA-ZACAS-NEXT: andi a2, a0, -4 ; RV64IA-ZACAS-NEXT: slli a0, a0, 3 -; RV64IA-ZACAS-NEXT: andi a3, a0, 24 -; RV64IA-ZACAS-NEXT: lui a4, 16 -; RV64IA-ZACAS-NEXT: addi a4, a4, -1 -; RV64IA-ZACAS-NEXT: sllw a4, a4, a0 +; RV64IA-ZACAS-NEXT: lui a3, 16 ; RV64IA-ZACAS-NEXT: slli a1, a1, 48 +; RV64IA-ZACAS-NEXT: li a4, 48 +; RV64IA-ZACAS-NEXT: andi a5, a0, 24 +; RV64IA-ZACAS-NEXT: addi a3, a3, -1 ; RV64IA-ZACAS-NEXT: srai a1, a1, 48 +; RV64IA-ZACAS-NEXT: sllw a3, a3, a0 ; RV64IA-ZACAS-NEXT: sllw a1, a1, a0 -; RV64IA-ZACAS-NEXT: li a5, 48 -; RV64IA-ZACAS-NEXT: sub a5, a5, a3 +; RV64IA-ZACAS-NEXT: sub a4, a4, a5 ; RV64IA-ZACAS-NEXT: .LBB119_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-ZACAS-NEXT: lr.w.aqrl a3, (a2) -; RV64IA-ZACAS-NEXT: and a7, a3, a4 -; RV64IA-ZACAS-NEXT: mv a6, a3 -; RV64IA-ZACAS-NEXT: sll a7, a7, a5 -; RV64IA-ZACAS-NEXT: sra a7, a7, a5 +; RV64IA-ZACAS-NEXT: lr.w.aqrl a5, (a2) +; RV64IA-ZACAS-NEXT: and a7, a5, a3 +; RV64IA-ZACAS-NEXT: mv a6, a5 +; RV64IA-ZACAS-NEXT: sll a7, a7, a4 +; RV64IA-ZACAS-NEXT: sra a7, a7, a4 ; RV64IA-ZACAS-NEXT: bge a1, a7, .LBB119_3 ; RV64IA-ZACAS-NEXT: # %bb.2: # in Loop: Header=BB119_1 Depth=1 -; RV64IA-ZACAS-NEXT: xor a6, a3, a1 -; RV64IA-ZACAS-NEXT: and a6, a6, a4 -; RV64IA-ZACAS-NEXT: xor a6, a3, a6 +; RV64IA-ZACAS-NEXT: xor a6, a5, a1 +; RV64IA-ZACAS-NEXT: and a6, a6, a3 +; RV64IA-ZACAS-NEXT: xor a6, a5, a6 ; RV64IA-ZACAS-NEXT: .LBB119_3: # in Loop: Header=BB119_1 Depth=1 ; RV64IA-ZACAS-NEXT: sc.w.rl a6, a6, (a2) ; RV64IA-ZACAS-NEXT: bnez a6, .LBB119_1 ; RV64IA-ZACAS-NEXT: # %bb.4: -; RV64IA-ZACAS-NEXT: srlw a0, a3, a0 +; RV64IA-ZACAS-NEXT: srlw a0, a5, a0 ; RV64IA-ZACAS-NEXT: ret ; ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_seq_cst: @@ -26072,46 +26072,46 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB220_2 ; RV32I-NEXT: .LBB220_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB220_7 ; RV32I-NEXT: .LBB220_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB220_4 +; RV32I-NEXT: beq a5, s0, .LBB220_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB220_5 ; RV32I-NEXT: .LBB220_4: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 
+; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB220_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB220_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB220_1 ; RV32I-NEXT: .LBB220_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26126,46 +26126,46 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB220_2 ; RV32IA-NEXT: .LBB220_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB220_7 ; RV32IA-NEXT: .LBB220_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB220_4 +; RV32IA-NEXT: beq a5, s0, .LBB220_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB220_5 ; RV32IA-NEXT: .LBB220_4: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB220_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB220_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB220_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB220_1 ; RV32IA-NEXT: .LBB220_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26225,46 +26225,46 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB221_2 ; RV32I-NEXT: .LBB221_1: # 
%atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB221_7 ; RV32I-NEXT: .LBB221_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB221_4 +; RV32I-NEXT: beq a5, s0, .LBB221_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB221_5 ; RV32I-NEXT: .LBB221_4: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB221_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB221_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB221_1 ; RV32I-NEXT: .LBB221_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26279,46 +26279,46 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB221_2 ; RV32IA-NEXT: .LBB221_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB221_7 ; RV32IA-NEXT: .LBB221_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB221_4 +; RV32IA-NEXT: beq a5, s0, .LBB221_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB221_5 ; RV32IA-NEXT: .LBB221_4: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB221_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB221_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 ; RV32IA-NEXT: mv a2, 
s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB221_1 ; RV32IA-NEXT: .LBB221_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26383,46 +26383,46 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB222_2 ; RV32I-NEXT: .LBB222_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB222_7 ; RV32I-NEXT: .LBB222_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB222_4 +; RV32I-NEXT: beq a5, s0, .LBB222_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB222_5 ; RV32I-NEXT: .LBB222_4: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB222_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB222_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB222_1 ; RV32I-NEXT: .LBB222_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26437,46 +26437,46 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB222_2 ; RV32IA-NEXT: .LBB222_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) 
+; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB222_7 ; RV32IA-NEXT: .LBB222_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB222_4 +; RV32IA-NEXT: beq a5, s0, .LBB222_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB222_5 ; RV32IA-NEXT: .LBB222_4: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB222_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB222_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB222_1 ; RV32IA-NEXT: .LBB222_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26541,46 +26541,46 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB223_2 ; RV32I-NEXT: .LBB223_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB223_7 ; RV32I-NEXT: .LBB223_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB223_4 +; RV32I-NEXT: beq a5, s0, .LBB223_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB223_5 ; RV32I-NEXT: .LBB223_4: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB223_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB223_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB223_1 ; RV32I-NEXT: .LBB223_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26595,46 +26595,46 @@ define i64 
@atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB223_2 ; RV32IA-NEXT: .LBB223_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB223_7 ; RV32IA-NEXT: .LBB223_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB223_4 +; RV32IA-NEXT: beq a5, s0, .LBB223_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB223_5 ; RV32IA-NEXT: .LBB223_4: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB223_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB223_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB223_1 ; RV32IA-NEXT: .LBB223_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26699,46 +26699,46 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB224_2 ; RV32I-NEXT: .LBB224_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB224_7 ; RV32I-NEXT: .LBB224_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB224_4 +; RV32I-NEXT: beq a5, s0, .LBB224_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; 
RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB224_5 ; RV32I-NEXT: .LBB224_4: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB224_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB224_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB224_1 ; RV32I-NEXT: .LBB224_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26753,46 +26753,46 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB224_2 ; RV32IA-NEXT: .LBB224_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB224_7 ; RV32IA-NEXT: .LBB224_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB224_4 +; RV32IA-NEXT: beq a5, s0, .LBB224_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB224_5 ; RV32IA-NEXT: .LBB224_4: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB224_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB224_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB224_1 ; RV32IA-NEXT: .LBB224_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26857,46 +26857,46 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; 
RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB225_2 ; RV32I-NEXT: .LBB225_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB225_7 ; RV32I-NEXT: .LBB225_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB225_4 +; RV32I-NEXT: beq a5, s0, .LBB225_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB225_5 ; RV32I-NEXT: .LBB225_4: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB225_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a0, .LBB225_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB225_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB225_1 ; RV32I-NEXT: .LBB225_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26911,46 +26911,46 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB225_2 ; RV32IA-NEXT: .LBB225_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB225_7 ; RV32IA-NEXT: .LBB225_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB225_4 +; RV32IA-NEXT: beq a5, s0, .LBB225_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB225_5 ; RV32IA-NEXT: .LBB225_4: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB225_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: 
beqz a0, .LBB225_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB225_1 ; RV32IA-NEXT: .LBB225_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27010,46 +27010,46 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB226_2 ; RV32I-NEXT: .LBB226_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB226_7 ; RV32I-NEXT: .LBB226_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB226_4 +; RV32I-NEXT: beq a5, s0, .LBB226_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB226_5 ; RV32I-NEXT: .LBB226_4: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB226_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a0, .LBB226_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB226_1 ; RV32I-NEXT: .LBB226_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27064,46 +27064,46 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB226_2 ; RV32IA-NEXT: .LBB226_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s0 
+; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB226_7 ; RV32IA-NEXT: .LBB226_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB226_4 +; RV32IA-NEXT: beq a5, s0, .LBB226_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB226_5 ; RV32IA-NEXT: .LBB226_4: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB226_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: beqz a0, .LBB226_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB226_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB226_1 ; RV32IA-NEXT: .LBB226_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27168,46 +27168,46 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB227_2 ; RV32I-NEXT: .LBB227_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB227_7 ; RV32I-NEXT: .LBB227_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB227_4 +; RV32I-NEXT: beq a5, s0, .LBB227_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB227_5 ; RV32I-NEXT: .LBB227_4: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB227_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a0, .LBB227_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB227_1 ; RV32I-NEXT: .LBB227_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw 
s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27222,46 +27222,46 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB227_2 ; RV32IA-NEXT: .LBB227_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB227_7 ; RV32IA-NEXT: .LBB227_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB227_4 +; RV32IA-NEXT: beq a5, s0, .LBB227_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB227_5 ; RV32IA-NEXT: .LBB227_4: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB227_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: beqz a0, .LBB227_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB227_1 ; RV32IA-NEXT: .LBB227_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27326,46 +27326,46 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB228_2 ; RV32I-NEXT: .LBB228_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB228_7 ; RV32I-NEXT: .LBB228_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB228_4 +; RV32I-NEXT: beq a5, s0, .LBB228_4 ; 
RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB228_5 ; RV32I-NEXT: .LBB228_4: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB228_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a0, .LBB228_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB228_1 ; RV32I-NEXT: .LBB228_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27380,46 +27380,46 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB228_2 ; RV32IA-NEXT: .LBB228_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB228_7 ; RV32IA-NEXT: .LBB228_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB228_4 +; RV32IA-NEXT: beq a5, s0, .LBB228_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB228_5 ; RV32IA-NEXT: .LBB228_4: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB228_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: beqz a0, .LBB228_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB228_1 ; RV32IA-NEXT: .LBB228_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27484,46 +27484,46 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw 
a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB229_2 ; RV32I-NEXT: .LBB229_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB229_7 ; RV32I-NEXT: .LBB229_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB229_4 +; RV32I-NEXT: beq a5, s0, .LBB229_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB229_5 ; RV32I-NEXT: .LBB229_4: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB229_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a0, .LBB229_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB229_1 ; RV32I-NEXT: .LBB229_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27538,46 +27538,46 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB229_2 ; RV32IA-NEXT: .LBB229_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB229_7 ; RV32IA-NEXT: .LBB229_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB229_4 +; RV32IA-NEXT: beq a5, s0, .LBB229_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB229_5 ; RV32IA-NEXT: .LBB229_4: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB229_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 -; 
RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: beqz a0, .LBB229_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB229_1 ; RV32IA-NEXT: .LBB229_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27642,46 +27642,46 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB230_2 ; RV32I-NEXT: .LBB230_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB230_7 ; RV32I-NEXT: .LBB230_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB230_4 +; RV32I-NEXT: beq a5, s0, .LBB230_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB230_5 ; RV32I-NEXT: .LBB230_4: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB230_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB230_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB230_1 ; RV32I-NEXT: .LBB230_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27696,46 +27696,46 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB230_2 ; RV32IA-NEXT: .LBB230_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; 
RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB230_7 ; RV32IA-NEXT: .LBB230_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB230_4 +; RV32IA-NEXT: beq a5, s0, .LBB230_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB230_5 ; RV32IA-NEXT: .LBB230_4: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB230_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB230_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB230_1 ; RV32IA-NEXT: .LBB230_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27795,46 +27795,46 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB231_2 ; RV32I-NEXT: .LBB231_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB231_7 ; RV32I-NEXT: .LBB231_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB231_4 +; RV32I-NEXT: beq a5, s0, .LBB231_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB231_5 ; RV32I-NEXT: .LBB231_4: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB231_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB231_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB231_1 ; RV32I-NEXT: .LBB231_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; 
RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27849,46 +27849,46 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB231_2 ; RV32IA-NEXT: .LBB231_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB231_7 ; RV32IA-NEXT: .LBB231_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB231_4 +; RV32IA-NEXT: beq a5, s0, .LBB231_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB231_5 ; RV32IA-NEXT: .LBB231_4: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB231_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB231_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB231_1 ; RV32IA-NEXT: .LBB231_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27953,46 +27953,46 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB232_2 ; RV32I-NEXT: .LBB232_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB232_7 ; RV32I-NEXT: .LBB232_2: # 
%atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB232_4 +; RV32I-NEXT: beq a5, s0, .LBB232_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB232_5 ; RV32I-NEXT: .LBB232_4: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB232_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB232_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB232_1 ; RV32I-NEXT: .LBB232_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28007,46 +28007,46 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB232_2 ; RV32IA-NEXT: .LBB232_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB232_7 ; RV32IA-NEXT: .LBB232_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB232_4 +; RV32IA-NEXT: beq a5, s0, .LBB232_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB232_5 ; RV32IA-NEXT: .LBB232_4: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB232_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB232_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB232_1 ; RV32IA-NEXT: .LBB232_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28111,46 +28111,46 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded 
Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB233_2 ; RV32I-NEXT: .LBB233_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB233_7 ; RV32I-NEXT: .LBB233_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB233_4 +; RV32I-NEXT: beq a5, s0, .LBB233_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB233_5 ; RV32I-NEXT: .LBB233_4: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB233_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB233_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB233_1 ; RV32I-NEXT: .LBB233_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28165,46 +28165,46 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB233_2 ; RV32IA-NEXT: .LBB233_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB233_7 ; RV32IA-NEXT: .LBB233_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB233_4 +; RV32IA-NEXT: beq a5, s0, .LBB233_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB233_5 ; RV32IA-NEXT: .LBB233_4: # in Loop: Header=BB233_2 Depth=1 -; 
RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB233_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB233_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB233_1 ; RV32IA-NEXT: .LBB233_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28269,46 +28269,46 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB234_2 ; RV32I-NEXT: .LBB234_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB234_7 ; RV32I-NEXT: .LBB234_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB234_4 +; RV32I-NEXT: beq a5, s0, .LBB234_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB234_5 ; RV32I-NEXT: .LBB234_4: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB234_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB234_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB234_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB234_1 ; RV32I-NEXT: .LBB234_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28323,46 +28323,46 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB234_2 ; RV32IA-NEXT: .LBB234_1: # 
%atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB234_7 ; RV32IA-NEXT: .LBB234_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB234_4 +; RV32IA-NEXT: beq a5, s0, .LBB234_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB234_5 ; RV32IA-NEXT: .LBB234_4: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB234_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB234_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB234_1 ; RV32IA-NEXT: .LBB234_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28427,46 +28427,46 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB235_2 ; RV32I-NEXT: .LBB235_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB235_7 ; RV32I-NEXT: .LBB235_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB235_4 +; RV32I-NEXT: beq a5, s0, .LBB235_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB235_5 ; RV32I-NEXT: .LBB235_4: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB235_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a0, .LBB235_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 ; RV32I-NEXT: mv 
a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB235_1 ; RV32I-NEXT: .LBB235_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28481,46 +28481,46 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB235_2 ; RV32IA-NEXT: .LBB235_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB235_7 ; RV32IA-NEXT: .LBB235_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB235_4 +; RV32IA-NEXT: beq a5, s0, .LBB235_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB235_5 ; RV32IA-NEXT: .LBB235_4: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB235_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: beqz a0, .LBB235_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB235_1 ; RV32IA-NEXT: .LBB235_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28580,46 +28580,46 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB236_2 ; RV32I-NEXT: .LBB236_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 
8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB236_7 ; RV32I-NEXT: .LBB236_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB236_4 +; RV32I-NEXT: beq a5, s0, .LBB236_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB236_5 ; RV32I-NEXT: .LBB236_4: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB236_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a0, .LBB236_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB236_1 ; RV32I-NEXT: .LBB236_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28634,46 +28634,46 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB236_2 ; RV32IA-NEXT: .LBB236_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB236_7 ; RV32IA-NEXT: .LBB236_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB236_4 +; RV32IA-NEXT: beq a5, s0, .LBB236_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB236_5 ; RV32IA-NEXT: .LBB236_4: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB236_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: beqz a0, .LBB236_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB236_1 ; RV32IA-NEXT: .LBB236_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 
4-byte Folded Reload @@ -28738,46 +28738,46 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB237_2 ; RV32I-NEXT: .LBB237_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB237_7 ; RV32I-NEXT: .LBB237_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB237_4 +; RV32I-NEXT: beq a5, s0, .LBB237_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB237_5 ; RV32I-NEXT: .LBB237_4: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB237_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a0, .LBB237_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB237_1 ; RV32I-NEXT: .LBB237_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28792,46 +28792,46 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB237_2 ; RV32IA-NEXT: .LBB237_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB237_7 ; RV32IA-NEXT: .LBB237_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB237_4 +; RV32IA-NEXT: beq a5, s0, .LBB237_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 -; 
RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB237_5 ; RV32IA-NEXT: .LBB237_4: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB237_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: beqz a0, .LBB237_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB237_1 ; RV32IA-NEXT: .LBB237_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28896,46 +28896,46 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB238_2 ; RV32I-NEXT: .LBB238_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB238_7 ; RV32I-NEXT: .LBB238_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB238_4 +; RV32I-NEXT: beq a5, s0, .LBB238_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB238_5 ; RV32I-NEXT: .LBB238_4: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB238_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a0, .LBB238_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB238_1 ; RV32I-NEXT: .LBB238_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28950,46 +28950,46 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, 
a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB238_2 ; RV32IA-NEXT: .LBB238_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB238_7 ; RV32IA-NEXT: .LBB238_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB238_4 +; RV32IA-NEXT: beq a5, s0, .LBB238_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB238_5 ; RV32IA-NEXT: .LBB238_4: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB238_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: beqz a0, .LBB238_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB238_1 ; RV32IA-NEXT: .LBB238_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -29054,46 +29054,46 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB239_2 ; RV32I-NEXT: .LBB239_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB239_7 ; RV32I-NEXT: .LBB239_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB239_4 +; RV32I-NEXT: beq a5, s0, .LBB239_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB239_5 ; RV32I-NEXT: .LBB239_4: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB239_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; 
RV32I-NEXT: beqz a0, .LBB239_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB239_1 ; RV32I-NEXT: .LBB239_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -29108,46 +29108,46 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB239_2 ; RV32IA-NEXT: .LBB239_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB239_7 ; RV32IA-NEXT: .LBB239_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB239_4 +; RV32IA-NEXT: beq a5, s0, .LBB239_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB239_5 ; RV32IA-NEXT: .LBB239_4: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB239_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: beqz a0, .LBB239_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB239_1 ; RV32IA-NEXT: .LBB239_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll index 06594e35be870..aea7473ceece4 100644 --- a/llvm/test/CodeGen/RISCV/atomic-signext.ll +++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll @@ -142,8 +142,8 @@ define signext i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -176,8 +176,8 @@ define signext i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: 
slli a0, a0, 3 ; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: andi a1, a1, 255 +; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a4, (a2) @@ -214,8 +214,8 @@ define signext i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -248,8 +248,8 @@ define signext i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 ; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: andi a1, a1, 255 +; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a4, (a2) @@ -286,8 +286,8 @@ define signext i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -320,8 +320,8 @@ define signext i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 ; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: andi a1, a1, 255 +; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a4, (a2) @@ -358,9 +358,9 @@ define signext i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 +; RV32IA-NEXT: andi a1, a1, 255 ; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: not a3, a3 -; RV32IA-NEXT: andi a1, a1, 255 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: or a1, a1, a3 ; RV32IA-NEXT: amoand.w a1, a1, (a2) @@ -386,9 +386,9 @@ define signext i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 ; RV64IA-NEXT: li a3, 255 +; RV64IA-NEXT: andi a1, a1, 255 ; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: not a3, a3 -; RV64IA-NEXT: andi a1, a1, 255 ; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: or a1, a1, a3 ; RV64IA-NEXT: amoand.w a1, a1, (a2) @@ -418,8 +418,8 @@ define signext i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -453,8 +453,8 @@ define signext i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 ; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: andi a1, a1, 255 +; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a4, (a2) @@ -626,23 +626,23 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, 
a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: li a3, 255 ; RV32IA-NEXT: slli a1, a1, 24 +; RV32IA-NEXT: andi a4, a0, 24 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: srai a1, a1, 24 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: xori a3, a3, 24 +; RV32IA-NEXT: xori a4, a4, 24 ; RV32IA-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a5, (a2) -; RV32IA-NEXT: and a7, a5, a4 +; RV32IA-NEXT: and a7, a5, a3 ; RV32IA-NEXT: mv a6, a5 -; RV32IA-NEXT: sll a7, a7, a3 -; RV32IA-NEXT: sra a7, a7, a3 +; RV32IA-NEXT: sll a7, a7, a4 +; RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a7, a1, .LBB10_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB10_1 Depth=1 ; RV32IA-NEXT: xor a6, a5, a1 -; RV32IA-NEXT: and a6, a6, a4 +; RV32IA-NEXT: and a6, a6, a3 ; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB10_3: # in Loop: Header=BB10_1 Depth=1 ; RV32IA-NEXT: sc.w a6, a6, (a2) @@ -700,23 +700,23 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a3, a0, 24 -; RV64IA-NEXT: li a4, 255 -; RV64IA-NEXT: sllw a4, a4, a0 +; RV64IA-NEXT: li a3, 255 ; RV64IA-NEXT: slli a1, a1, 56 +; RV64IA-NEXT: andi a4, a0, 24 +; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: srai a1, a1, 56 ; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: xori a3, a3, 56 +; RV64IA-NEXT: xori a4, a4, 56 ; RV64IA-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a5, (a2) -; RV64IA-NEXT: and a7, a5, a4 +; RV64IA-NEXT: and a7, a5, a3 ; RV64IA-NEXT: mv a6, a5 -; RV64IA-NEXT: sll a7, a7, a3 -; RV64IA-NEXT: sra a7, a7, a3 +; RV64IA-NEXT: sll a7, a7, a4 +; RV64IA-NEXT: sra a7, a7, a4 ; RV64IA-NEXT: bge a7, a1, .LBB10_3 ; RV64IA-NEXT: # %bb.2: # in Loop: Header=BB10_1 Depth=1 ; RV64IA-NEXT: xor a6, a5, a1 -; RV64IA-NEXT: and a6, a6, a4 +; RV64IA-NEXT: and a6, a6, a3 ; RV64IA-NEXT: xor a6, a5, a6 ; RV64IA-NEXT: .LBB10_3: # in Loop: Header=BB10_1 Depth=1 ; RV64IA-NEXT: sc.w a6, a6, (a2) @@ -778,23 +778,23 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: li a3, 255 ; RV32IA-NEXT: slli a1, a1, 24 +; RV32IA-NEXT: andi a4, a0, 24 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: srai a1, a1, 24 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: xori a3, a3, 24 +; RV32IA-NEXT: xori a4, a4, 24 ; RV32IA-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a5, (a2) -; RV32IA-NEXT: and a7, a5, a4 +; RV32IA-NEXT: and a7, a5, a3 ; RV32IA-NEXT: mv a6, a5 -; RV32IA-NEXT: sll a7, a7, a3 -; RV32IA-NEXT: sra a7, a7, a3 +; RV32IA-NEXT: sll a7, a7, a4 +; RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a1, a7, .LBB11_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB11_1 Depth=1 ; RV32IA-NEXT: xor a6, a5, a1 -; RV32IA-NEXT: and a6, a6, a4 +; RV32IA-NEXT: and a6, a6, a3 ; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB11_3: # in Loop: Header=BB11_1 Depth=1 ; RV32IA-NEXT: sc.w a6, a6, (a2) @@ -852,23 +852,23 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a3, a0, 24 -; RV64IA-NEXT: li a4, 255 -; RV64IA-NEXT: sllw a4, a4, a0 +; RV64IA-NEXT: li a3, 255 ; RV64IA-NEXT: slli a1, a1, 56 +; 
RV64IA-NEXT: andi a4, a0, 24 +; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: srai a1, a1, 56 ; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: xori a3, a3, 56 +; RV64IA-NEXT: xori a4, a4, 56 ; RV64IA-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a5, (a2) -; RV64IA-NEXT: and a7, a5, a4 +; RV64IA-NEXT: and a7, a5, a3 ; RV64IA-NEXT: mv a6, a5 -; RV64IA-NEXT: sll a7, a7, a3 -; RV64IA-NEXT: sra a7, a7, a3 +; RV64IA-NEXT: sll a7, a7, a4 +; RV64IA-NEXT: sra a7, a7, a4 ; RV64IA-NEXT: bge a1, a7, .LBB11_3 ; RV64IA-NEXT: # %bb.2: # in Loop: Header=BB11_1 Depth=1 ; RV64IA-NEXT: xor a6, a5, a1 -; RV64IA-NEXT: and a6, a6, a4 +; RV64IA-NEXT: and a6, a6, a3 ; RV64IA-NEXT: xor a6, a5, a6 ; RV64IA-NEXT: .LBB11_3: # in Loop: Header=BB11_1 Depth=1 ; RV64IA-NEXT: sc.w a6, a6, (a2) @@ -929,8 +929,8 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -996,8 +996,8 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 ; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: andi a1, a1, 255 +; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a4, (a2) @@ -1067,8 +1067,8 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: andi a1, a1, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a2) @@ -1134,8 +1134,8 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 ; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: andi a1, a1, 255 +; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a4, (a2) @@ -1400,10 +1400,10 @@ define signext i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: sll a4, a3, a0 -; RV32IA-NEXT: not a4, a4 ; RV32IA-NEXT: and a1, a1, a3 +; RV32IA-NEXT: not a3, a4 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: or a1, a1, a4 +; RV32IA-NEXT: or a1, a1, a3 ; RV32IA-NEXT: amoand.w a1, a1, (a2) ; RV32IA-NEXT: srl a0, a1, a0 ; RV32IA-NEXT: slli a0, a0, 16 @@ -1429,10 +1429,10 @@ define signext i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64IA-NEXT: lui a3, 16 ; RV64IA-NEXT: addi a3, a3, -1 ; RV64IA-NEXT: sllw a4, a3, a0 -; RV64IA-NEXT: not a4, a4 ; RV64IA-NEXT: and a1, a1, a3 +; RV64IA-NEXT: not a3, a4 ; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: or a1, a1, a4 +; RV64IA-NEXT: or a1, a1, a3 ; RV64IA-NEXT: amoand.w a1, a1, (a2) ; RV64IA-NEXT: srlw a0, a1, a0 ; RV64IA-NEXT: slli a0, a0, 48 @@ -1674,31 +1674,31 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: lui a4, 16 -; RV32IA-NEXT: 
addi a4, a4, -1 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: slli a1, a1, 16 +; RV32IA-NEXT: li a4, 16 +; RV32IA-NEXT: andi a5, a0, 24 +; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: srai a1, a1, 16 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: li a5, 16 -; RV32IA-NEXT: sub a5, a5, a3 +; RV32IA-NEXT: sub a4, a4, a5 ; RV32IA-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: lr.w a3, (a2) -; RV32IA-NEXT: and a7, a3, a4 -; RV32IA-NEXT: mv a6, a3 -; RV32IA-NEXT: sll a7, a7, a5 -; RV32IA-NEXT: sra a7, a7, a5 +; RV32IA-NEXT: lr.w a5, (a2) +; RV32IA-NEXT: and a7, a5, a3 +; RV32IA-NEXT: mv a6, a5 +; RV32IA-NEXT: sll a7, a7, a4 +; RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a7, a1, .LBB21_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB21_1 Depth=1 -; RV32IA-NEXT: xor a6, a3, a1 -; RV32IA-NEXT: and a6, a6, a4 -; RV32IA-NEXT: xor a6, a3, a6 +; RV32IA-NEXT: xor a6, a5, a1 +; RV32IA-NEXT: and a6, a6, a3 +; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB21_3: # in Loop: Header=BB21_1 Depth=1 ; RV32IA-NEXT: sc.w a6, a6, (a2) ; RV32IA-NEXT: bnez a6, .LBB21_1 ; RV32IA-NEXT: # %bb.4: -; RV32IA-NEXT: srl a0, a3, a0 +; RV32IA-NEXT: srl a0, a5, a0 ; RV32IA-NEXT: slli a0, a0, 16 ; RV32IA-NEXT: srai a0, a0, 16 ; RV32IA-NEXT: ret @@ -1750,31 +1750,31 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a3, a0, 24 -; RV64IA-NEXT: lui a4, 16 -; RV64IA-NEXT: addi a4, a4, -1 -; RV64IA-NEXT: sllw a4, a4, a0 +; RV64IA-NEXT: lui a3, 16 ; RV64IA-NEXT: slli a1, a1, 48 +; RV64IA-NEXT: li a4, 48 +; RV64IA-NEXT: andi a5, a0, 24 +; RV64IA-NEXT: addi a3, a3, -1 ; RV64IA-NEXT: srai a1, a1, 48 +; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: li a5, 48 -; RV64IA-NEXT: sub a5, a5, a3 +; RV64IA-NEXT: sub a4, a4, a5 ; RV64IA-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-NEXT: lr.w a3, (a2) -; RV64IA-NEXT: and a7, a3, a4 -; RV64IA-NEXT: mv a6, a3 -; RV64IA-NEXT: sll a7, a7, a5 -; RV64IA-NEXT: sra a7, a7, a5 +; RV64IA-NEXT: lr.w a5, (a2) +; RV64IA-NEXT: and a7, a5, a3 +; RV64IA-NEXT: mv a6, a5 +; RV64IA-NEXT: sll a7, a7, a4 +; RV64IA-NEXT: sra a7, a7, a4 ; RV64IA-NEXT: bge a7, a1, .LBB21_3 ; RV64IA-NEXT: # %bb.2: # in Loop: Header=BB21_1 Depth=1 -; RV64IA-NEXT: xor a6, a3, a1 -; RV64IA-NEXT: and a6, a6, a4 -; RV64IA-NEXT: xor a6, a3, a6 +; RV64IA-NEXT: xor a6, a5, a1 +; RV64IA-NEXT: and a6, a6, a3 +; RV64IA-NEXT: xor a6, a5, a6 ; RV64IA-NEXT: .LBB21_3: # in Loop: Header=BB21_1 Depth=1 ; RV64IA-NEXT: sc.w a6, a6, (a2) ; RV64IA-NEXT: bnez a6, .LBB21_1 ; RV64IA-NEXT: # %bb.4: -; RV64IA-NEXT: srlw a0, a3, a0 +; RV64IA-NEXT: srlw a0, a5, a0 ; RV64IA-NEXT: slli a0, a0, 48 ; RV64IA-NEXT: srai a0, a0, 48 ; RV64IA-NEXT: ret @@ -1830,31 +1830,31 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a3, a0, 24 -; RV32IA-NEXT: lui a4, 16 -; RV32IA-NEXT: addi a4, a4, -1 -; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: slli a1, a1, 16 +; RV32IA-NEXT: li a4, 16 +; RV32IA-NEXT: andi a5, a0, 24 +; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: srai a1, a1, 16 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: li a5, 16 -; RV32IA-NEXT: sub a5, a5, a3 +; RV32IA-NEXT: sub a4, a4, a5 ; RV32IA-NEXT: .LBB22_1: # =>This Inner Loop Header: 
Depth=1 -; RV32IA-NEXT: lr.w a3, (a2) -; RV32IA-NEXT: and a7, a3, a4 -; RV32IA-NEXT: mv a6, a3 -; RV32IA-NEXT: sll a7, a7, a5 -; RV32IA-NEXT: sra a7, a7, a5 +; RV32IA-NEXT: lr.w a5, (a2) +; RV32IA-NEXT: and a7, a5, a3 +; RV32IA-NEXT: mv a6, a5 +; RV32IA-NEXT: sll a7, a7, a4 +; RV32IA-NEXT: sra a7, a7, a4 ; RV32IA-NEXT: bge a1, a7, .LBB22_3 ; RV32IA-NEXT: # %bb.2: # in Loop: Header=BB22_1 Depth=1 -; RV32IA-NEXT: xor a6, a3, a1 -; RV32IA-NEXT: and a6, a6, a4 -; RV32IA-NEXT: xor a6, a3, a6 +; RV32IA-NEXT: xor a6, a5, a1 +; RV32IA-NEXT: and a6, a6, a3 +; RV32IA-NEXT: xor a6, a5, a6 ; RV32IA-NEXT: .LBB22_3: # in Loop: Header=BB22_1 Depth=1 ; RV32IA-NEXT: sc.w a6, a6, (a2) ; RV32IA-NEXT: bnez a6, .LBB22_1 ; RV32IA-NEXT: # %bb.4: -; RV32IA-NEXT: srl a0, a3, a0 +; RV32IA-NEXT: srl a0, a5, a0 ; RV32IA-NEXT: slli a0, a0, 16 ; RV32IA-NEXT: srai a0, a0, 16 ; RV32IA-NEXT: ret @@ -1906,31 +1906,31 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a3, a0, 24 -; RV64IA-NEXT: lui a4, 16 -; RV64IA-NEXT: addi a4, a4, -1 -; RV64IA-NEXT: sllw a4, a4, a0 +; RV64IA-NEXT: lui a3, 16 ; RV64IA-NEXT: slli a1, a1, 48 +; RV64IA-NEXT: li a4, 48 +; RV64IA-NEXT: andi a5, a0, 24 +; RV64IA-NEXT: addi a3, a3, -1 ; RV64IA-NEXT: srai a1, a1, 48 +; RV64IA-NEXT: sllw a3, a3, a0 ; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: li a5, 48 -; RV64IA-NEXT: sub a5, a5, a3 +; RV64IA-NEXT: sub a4, a4, a5 ; RV64IA-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 -; RV64IA-NEXT: lr.w a3, (a2) -; RV64IA-NEXT: and a7, a3, a4 -; RV64IA-NEXT: mv a6, a3 -; RV64IA-NEXT: sll a7, a7, a5 -; RV64IA-NEXT: sra a7, a7, a5 +; RV64IA-NEXT: lr.w a5, (a2) +; RV64IA-NEXT: and a7, a5, a3 +; RV64IA-NEXT: mv a6, a5 +; RV64IA-NEXT: sll a7, a7, a4 +; RV64IA-NEXT: sra a7, a7, a4 ; RV64IA-NEXT: bge a1, a7, .LBB22_3 ; RV64IA-NEXT: # %bb.2: # in Loop: Header=BB22_1 Depth=1 -; RV64IA-NEXT: xor a6, a3, a1 -; RV64IA-NEXT: and a6, a6, a4 -; RV64IA-NEXT: xor a6, a3, a6 +; RV64IA-NEXT: xor a6, a5, a1 +; RV64IA-NEXT: and a6, a6, a3 +; RV64IA-NEXT: xor a6, a5, a6 ; RV64IA-NEXT: .LBB22_3: # in Loop: Header=BB22_1 Depth=1 ; RV64IA-NEXT: sc.w a6, a6, (a2) ; RV64IA-NEXT: bnez a6, .LBB22_1 ; RV64IA-NEXT: # %bb.4: -; RV64IA-NEXT: srlw a0, a3, a0 +; RV64IA-NEXT: srlw a0, a5, a0 ; RV64IA-NEXT: slli a0, a0, 48 ; RV64IA-NEXT: srai a0, a0, 48 ; RV64IA-NEXT: ret @@ -3182,46 +3182,46 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB43_2 ; RV32I-NEXT: .LBB43_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB43_7 ; RV32I-NEXT: .LBB43_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop 
Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB43_4 +; RV32I-NEXT: beq a5, s0, .LBB43_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB43_5 ; RV32I-NEXT: .LBB43_4: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB43_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB43_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB43_1 ; RV32I-NEXT: .LBB43_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3236,46 +3236,46 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB43_2 ; RV32IA-NEXT: .LBB43_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB43_7 ; RV32IA-NEXT: .LBB43_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB43_4 +; RV32IA-NEXT: beq a5, s0, .LBB43_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB43_5 ; RV32IA-NEXT: .LBB43_4: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB43_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB43_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB43_1 ; RV32IA-NEXT: .LBB43_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3335,46 +3335,46 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: 
sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB44_2 ; RV32I-NEXT: .LBB44_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB44_7 ; RV32I-NEXT: .LBB44_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB44_4 +; RV32I-NEXT: beq a5, s0, .LBB44_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a4 +; RV32I-NEXT: slt a0, s0, a5 ; RV32I-NEXT: j .LBB44_5 ; RV32I-NEXT: .LBB44_4: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB44_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a0, .LBB44_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB44_1 ; RV32I-NEXT: .LBB44_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3389,46 +3389,46 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB44_2 ; RV32IA-NEXT: .LBB44_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB44_7 ; RV32IA-NEXT: .LBB44_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB44_4 +; RV32IA-NEXT: beq a5, s0, .LBB44_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a4 +; RV32IA-NEXT: slt a0, s0, a5 ; RV32IA-NEXT: j .LBB44_5 ; RV32IA-NEXT: .LBB44_4: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB44_5: # 
%atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: beqz a0, .LBB44_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB44_1 ; RV32IA-NEXT: .LBB44_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3488,46 +3488,46 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB45_2 ; RV32I-NEXT: .LBB45_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB45_7 ; RV32I-NEXT: .LBB45_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB45_4 +; RV32I-NEXT: beq a5, s0, .LBB45_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB45_5 ; RV32I-NEXT: .LBB45_4: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB45_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: bnez a0, .LBB45_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB45_1 ; RV32I-NEXT: .LBB45_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3542,46 +3542,46 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB45_2 ; RV32IA-NEXT: .LBB45_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; 
RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB45_7 ; RV32IA-NEXT: .LBB45_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB45_4 +; RV32IA-NEXT: beq a5, s0, .LBB45_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB45_5 ; RV32IA-NEXT: .LBB45_4: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB45_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: bnez a0, .LBB45_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB45_1 ; RV32IA-NEXT: .LBB45_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3641,46 +3641,46 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB46_2 ; RV32I-NEXT: .LBB46_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB46_7 ; RV32I-NEXT: .LBB46_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB46_4 +; RV32I-NEXT: beq a5, s0, .LBB46_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a5 ; RV32I-NEXT: j .LBB46_5 ; RV32I-NEXT: .LBB46_4: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a0, s2, a4 ; RV32I-NEXT: .LBB46_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: mv a2, a5 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv a2, a4 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a0, .LBB46_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB46_1 ; RV32I-NEXT: .LBB46_7: # 
%atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3695,46 +3695,46 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB46_2 ; RV32IA-NEXT: .LBB46_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB46_7 ; RV32IA-NEXT: .LBB46_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB46_4 +; RV32IA-NEXT: beq a5, s0, .LBB46_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a5 ; RV32IA-NEXT: j .LBB46_5 ; RV32IA-NEXT: .LBB46_4: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a0, s2, a4 ; RV32IA-NEXT: .LBB46_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: mv a2, a5 -; RV32IA-NEXT: mv a3, a4 +; RV32IA-NEXT: mv a2, a4 +; RV32IA-NEXT: mv a3, a5 ; RV32IA-NEXT: beqz a0, .LBB46_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB46_1 ; RV32IA-NEXT: .LBB46_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3806,10 +3806,10 @@ define signext i8 @cmpxchg_i8_monotonic_monotonic_val0(ptr %ptr, i8 signext %cmp ; RV32IA-NEXT: andi a3, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 ; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: andi a2, a2, 255 +; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a2, a2, a0 ; RV32IA-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a5, (a3) @@ -3846,10 +3846,10 @@ define signext i8 @cmpxchg_i8_monotonic_monotonic_val0(ptr %ptr, i8 signext %cmp ; RV64IA-NEXT: andi a3, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 ; RV64IA-NEXT: li a4, 255 -; RV64IA-NEXT: sllw a4, a4, a0 ; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: andi a2, a2, 255 +; RV64IA-NEXT: sllw a4, a4, a0 +; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: sllw a2, a2, a0 ; RV64IA-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a5, (a3) @@ -3890,10 +3890,10 
@@ define i1 @cmpxchg_i8_monotonic_monotonic_val1(ptr %ptr, i8 signext %cmp, i8 sig ; RV32IA-NEXT: andi a3, a0, -4 ; RV32IA-NEXT: slli a0, a0, 3 ; RV32IA-NEXT: li a4, 255 -; RV32IA-NEXT: sll a4, a4, a0 ; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: andi a2, a2, 255 +; RV32IA-NEXT: sll a4, a4, a0 +; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a0, a2, a0 ; RV32IA-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a2, (a3) @@ -3929,10 +3929,10 @@ define i1 @cmpxchg_i8_monotonic_monotonic_val1(ptr %ptr, i8 signext %cmp, i8 sig ; RV64IA-NEXT: andi a3, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 ; RV64IA-NEXT: li a4, 255 -; RV64IA-NEXT: sllw a4, a4, a0 ; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: andi a2, a2, 255 +; RV64IA-NEXT: sllw a4, a4, a0 +; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: sllw a0, a2, a0 ; RV64IA-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a2, (a3) @@ -3977,8 +3977,8 @@ define signext i16 @cmpxchg_i16_monotonic_monotonic_val0(ptr %ptr, i16 signext % ; RV32IA-NEXT: addi a4, a4, -1 ; RV32IA-NEXT: sll a5, a4, a0 ; RV32IA-NEXT: and a1, a1, a4 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: and a2, a2, a4 +; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a2, a2, a0 ; RV32IA-NEXT: .LBB49_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a4, (a3) @@ -4018,8 +4018,8 @@ define signext i16 @cmpxchg_i16_monotonic_monotonic_val0(ptr %ptr, i16 signext % ; RV64IA-NEXT: addi a4, a4, -1 ; RV64IA-NEXT: sllw a5, a4, a0 ; RV64IA-NEXT: and a1, a1, a4 -; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: and a2, a2, a4 +; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: sllw a2, a2, a0 ; RV64IA-NEXT: .LBB49_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a4, (a3) @@ -4063,8 +4063,8 @@ define i1 @cmpxchg_i16_monotonic_monotonic_val1(ptr %ptr, i16 signext %cmp, i16 ; RV32IA-NEXT: addi a4, a4, -1 ; RV32IA-NEXT: sll a5, a4, a0 ; RV32IA-NEXT: and a1, a1, a4 -; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: and a2, a2, a4 +; RV32IA-NEXT: sll a1, a1, a0 ; RV32IA-NEXT: sll a0, a2, a0 ; RV32IA-NEXT: .LBB50_1: # =>This Inner Loop Header: Depth=1 ; RV32IA-NEXT: lr.w a2, (a3) @@ -4103,8 +4103,8 @@ define i1 @cmpxchg_i16_monotonic_monotonic_val1(ptr %ptr, i16 signext %cmp, i16 ; RV64IA-NEXT: addi a4, a4, -1 ; RV64IA-NEXT: sllw a5, a4, a0 ; RV64IA-NEXT: and a1, a1, a4 -; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: and a2, a2, a4 +; RV64IA-NEXT: sllw a1, a1, a0 ; RV64IA-NEXT: sllw a0, a2, a0 ; RV64IA-NEXT: .LBB50_1: # =>This Inner Loop Header: Depth=1 ; RV64IA-NEXT: lr.w a2, (a3) diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll index f22115130117a..34b29ea1dc6c2 100644 --- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll +++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll @@ -63,8 +63,8 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a3, a0, 3 -; RV32IA-NEXT: andi a0, a3, 24 ; RV32IA-NEXT: li a4, 255 +; RV32IA-NEXT: andi a0, a3, 24 ; RV32IA-NEXT: lw a5, 0(a2) ; RV32IA-NEXT: sll a3, a4, a3 ; RV32IA-NEXT: not a3, a3 @@ -146,8 +146,8 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) { ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a4, a0, 3 -; RV64IA-NEXT: andi a0, a4, 24 ; RV64IA-NEXT: li a5, 255 +; RV64IA-NEXT: andi a0, a4, 24 ; RV64IA-NEXT: lw a3, 0(a2) ; RV64IA-NEXT: sllw a4, a5, a4 ; RV64IA-NEXT: not a4, a4 @@ 
-239,8 +239,8 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a4, a0, 3 -; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: lui a3, 16 +; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: lw a6, 0(a2) ; RV32IA-NEXT: sll a4, a3, a4 @@ -329,8 +329,8 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) { ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a5, a0, 3 -; RV64IA-NEXT: andi a0, a5, 24 ; RV64IA-NEXT: lui a3, 16 +; RV64IA-NEXT: andi a0, a5, 24 ; RV64IA-NEXT: addiw a3, a3, -1 ; RV64IA-NEXT: lw a4, 0(a2) ; RV64IA-NEXT: sllw a5, a3, a5 @@ -518,44 +518,44 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB3_3 ; RV32I-NEXT: .LBB3_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: sltu a0, a4, s1 +; RV32I-NEXT: sltu a0, a5, s0 ; RV32I-NEXT: .LBB3_2: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB3_3 Depth=1 ; RV32I-NEXT: xori a0, a0, 1 ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: and a1, a0, s2 -; RV32I-NEXT: sltu a2, a5, a1 -; RV32I-NEXT: and a0, a0, s1 -; RV32I-NEXT: sub a3, a4, a0 -; RV32I-NEXT: sub a3, a3, a2 -; RV32I-NEXT: sub a2, a5, a1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: sltu a3, a4, a1 +; RV32I-NEXT: sub a0, a5, a0 +; RV32I-NEXT: sub a2, a4, a1 +; RV32I-NEXT: sub a3, a0, a3 +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB3_5 ; RV32I-NEXT: .LBB3_3: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: bne a4, s1, .LBB3_1 +; RV32I-NEXT: bne a5, s0, .LBB3_1 ; RV32I-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: sltu a0, a5, s2 +; RV32I-NEXT: sltu a0, a4, s2 ; RV32I-NEXT: j .LBB3_2 ; RV32I-NEXT: .LBB3_5: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -580,44 +580,44 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: .cfi_offset s0, -8 ; RV32IA-NEXT: .cfi_offset s1, -12 ; RV32IA-NEXT: .cfi_offset s2, -16 -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB3_3 ; RV32IA-NEXT: .LBB3_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: sltu a0, a4, s1 +; RV32IA-NEXT: sltu a0, a5, s0 ; RV32IA-NEXT: .LBB3_2: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB3_3 Depth=1 ; RV32IA-NEXT: xori a0, a0, 1 ; RV32IA-NEXT: neg a0, a0 ; RV32IA-NEXT: and a1, a0, s2 -; 
RV32IA-NEXT: sltu a2, a5, a1 -; RV32IA-NEXT: and a0, a0, s1 -; RV32IA-NEXT: sub a3, a4, a0 -; RV32IA-NEXT: sub a3, a3, a2 -; RV32IA-NEXT: sub a2, a5, a1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: and a0, a0, s0 +; RV32IA-NEXT: sltu a3, a4, a1 +; RV32IA-NEXT: sub a0, a5, a0 +; RV32IA-NEXT: sub a2, a4, a1 +; RV32IA-NEXT: sub a3, a0, a3 +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB3_5 ; RV32IA-NEXT: .LBB3_3: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: bne a4, s1, .LBB3_1 +; RV32IA-NEXT: bne a5, s0, .LBB3_1 ; RV32IA-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: sltu a0, a5, s2 +; RV32IA-NEXT: sltu a0, a4, s2 ; RV32IA-NEXT: j .LBB3_2 ; RV32IA-NEXT: .LBB3_5: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -741,11 +741,11 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) { ; RV32IA-LABEL: atomicrmw_usub_sat_i8: ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a3, a0, 3 -; RV32IA-NEXT: andi a0, a3, 24 -; RV32IA-NEXT: li a5, 255 +; RV32IA-NEXT: slli a0, a0, 3 +; RV32IA-NEXT: li a3, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: lw a4, 0(a2) -; RV32IA-NEXT: sll a3, a5, a3 +; RV32IA-NEXT: andi a0, a0, 24 ; RV32IA-NEXT: not a3, a3 ; RV32IA-NEXT: andi a1, a1, 255 ; RV32IA-NEXT: .LBB4_1: # %atomicrmw.start @@ -818,11 +818,11 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) { ; RV64IA-LABEL: atomicrmw_usub_sat_i8: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a4, a0, 3 -; RV64IA-NEXT: andi a0, a4, 24 -; RV64IA-NEXT: li a5, 255 +; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: li a3, 255 +; RV64IA-NEXT: sllw a4, a3, a0 ; RV64IA-NEXT: lw a3, 0(a2) -; RV64IA-NEXT: sllw a4, a5, a4 +; RV64IA-NEXT: andi a0, a0, 24 ; RV64IA-NEXT: not a4, a4 ; RV64IA-NEXT: andi a1, a1, 255 ; RV64IA-NEXT: .LBB4_1: # %atomicrmw.start @@ -906,8 +906,8 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a4, a0, 3 -; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: lui a3, 16 +; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: lw a5, 0(a2) ; RV32IA-NEXT: sll a4, a3, a4 @@ -990,8 +990,8 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) { ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a5, a0, 3 -; RV64IA-NEXT: andi a0, a5, 24 ; RV64IA-NEXT: lui a3, 16 +; RV64IA-NEXT: andi a0, a5, 24 ; RV64IA-NEXT: addiw a3, a3, -1 ; RV64IA-NEXT: lw a4, 0(a2) ; RV64IA-NEXT: sllw a5, a3, a5 @@ -1172,43 +1172,43 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j 
.LBB7_3 ; RV32I-NEXT: .LBB7_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_3 Depth=1 -; RV32I-NEXT: sltu a2, a4, a0 +; RV32I-NEXT: sltu a2, a5, a0 ; RV32I-NEXT: .LBB7_2: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_3 Depth=1 ; RV32I-NEXT: addi a3, a2, -1 ; RV32I-NEXT: and a2, a3, a1 ; RV32I-NEXT: and a3, a3, a0 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB7_5 ; RV32I-NEXT: .LBB7_3: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: sltu a0, a5, s2 -; RV32I-NEXT: sub a1, a4, s1 +; RV32I-NEXT: sltu a0, a4, s2 +; RV32I-NEXT: sub a1, a5, s0 ; RV32I-NEXT: sub a0, a1, a0 -; RV32I-NEXT: sub a1, a5, s2 -; RV32I-NEXT: bne a0, a4, .LBB7_1 +; RV32I-NEXT: sub a1, a4, s2 +; RV32I-NEXT: bne a0, a5, .LBB7_1 ; RV32I-NEXT: # %bb.4: # in Loop: Header=BB7_3 Depth=1 -; RV32I-NEXT: sltu a2, a5, a1 +; RV32I-NEXT: sltu a2, a4, a1 ; RV32I-NEXT: j .LBB7_2 ; RV32I-NEXT: .LBB7_5: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1233,43 +1233,43 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: .cfi_offset s0, -8 ; RV32IA-NEXT: .cfi_offset s1, -12 ; RV32IA-NEXT: .cfi_offset s2, -16 -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB7_3 ; RV32IA-NEXT: .LBB7_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_3 Depth=1 -; RV32IA-NEXT: sltu a2, a4, a0 +; RV32IA-NEXT: sltu a2, a5, a0 ; RV32IA-NEXT: .LBB7_2: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_3 Depth=1 ; RV32IA-NEXT: addi a3, a2, -1 ; RV32IA-NEXT: and a2, a3, a1 ; RV32IA-NEXT: and a3, a3, a0 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB7_5 ; RV32IA-NEXT: .LBB7_3: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: sltu a0, a5, s2 -; RV32IA-NEXT: sub a1, a4, s1 +; RV32IA-NEXT: sltu a0, a4, s2 +; RV32IA-NEXT: sub a1, a5, s0 ; RV32IA-NEXT: sub a0, a1, a0 -; RV32IA-NEXT: sub a1, a5, s2 -; RV32IA-NEXT: bne a0, a4, .LBB7_1 +; RV32IA-NEXT: sub a1, a4, s2 +; RV32IA-NEXT: bne a0, a5, .LBB7_1 ; RV32IA-NEXT: # %bb.4: # in Loop: Header=BB7_3 Depth=1 -; RV32IA-NEXT: sltu a2, a5, a1 +; RV32IA-NEXT: sltu a2, a4, a1 ; RV32IA-NEXT: j .LBB7_2 ; RV32IA-NEXT: .LBB7_5: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw 
s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll index 14a1e6b982717..3ff01e4987bd5 100644 --- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll @@ -57,11 +57,11 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; RV32IA-LABEL: atomicrmw_uinc_wrap_i8: ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a3, a0, 3 -; RV32IA-NEXT: andi a0, a3, 24 -; RV32IA-NEXT: li a5, 255 +; RV32IA-NEXT: slli a0, a0, 3 +; RV32IA-NEXT: li a3, 255 +; RV32IA-NEXT: sll a3, a3, a0 ; RV32IA-NEXT: lw a4, 0(a2) -; RV32IA-NEXT: sll a3, a5, a3 +; RV32IA-NEXT: andi a0, a0, 24 ; RV32IA-NEXT: not a3, a3 ; RV32IA-NEXT: andi a1, a1, 255 ; RV32IA-NEXT: .LBB0_1: # %atomicrmw.start @@ -135,11 +135,11 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; RV64IA-LABEL: atomicrmw_uinc_wrap_i8: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a4, a0, 3 -; RV64IA-NEXT: andi a0, a4, 24 -; RV64IA-NEXT: li a5, 255 +; RV64IA-NEXT: slli a0, a0, 3 +; RV64IA-NEXT: li a3, 255 +; RV64IA-NEXT: sllw a4, a3, a0 ; RV64IA-NEXT: lw a3, 0(a2) -; RV64IA-NEXT: sllw a4, a5, a4 +; RV64IA-NEXT: andi a0, a0, 24 ; RV64IA-NEXT: not a4, a4 ; RV64IA-NEXT: andi a1, a1, 255 ; RV64IA-NEXT: .LBB0_1: # %atomicrmw.start @@ -224,8 +224,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a4, a0, 3 -; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: lui a3, 16 +; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: lw a5, 0(a2) ; RV32IA-NEXT: sll a4, a3, a4 @@ -239,8 +239,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; RV32IA-NEXT: and a7, a5, a3 ; RV32IA-NEXT: addi a5, a5, 1 ; RV32IA-NEXT: sltu a7, a7, a1 -; RV32IA-NEXT: neg a7, a7 ; RV32IA-NEXT: and a5, a5, a3 +; RV32IA-NEXT: neg a7, a7 ; RV32IA-NEXT: and a5, a7, a5 ; RV32IA-NEXT: sll a5, a5, a0 ; RV32IA-NEXT: and a7, a6, a4 @@ -309,8 +309,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a5, a0, 3 -; RV64IA-NEXT: andi a0, a5, 24 ; RV64IA-NEXT: lui a3, 16 +; RV64IA-NEXT: andi a0, a5, 24 ; RV64IA-NEXT: addiw a3, a3, -1 ; RV64IA-NEXT: lw a4, 0(a2) ; RV64IA-NEXT: sllw a5, a3, a5 @@ -324,8 +324,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA-NEXT: and t0, a6, a3 ; RV64IA-NEXT: addi a6, a6, 1 ; RV64IA-NEXT: sltu t0, t0, a1 -; RV64IA-NEXT: negw t0, t0 ; RV64IA-NEXT: and a6, a6, a3 +; RV64IA-NEXT: negw t0, t0 ; RV64IA-NEXT: and a6, t0, a6 ; RV64IA-NEXT: sllw a6, a6, a0 ; RV64IA-NEXT: and a4, a4, a5 @@ -493,42 +493,42 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB3_3 ; RV32I-NEXT: .LBB3_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: sltu a0, a4, s1 +; RV32I-NEXT: sltu a0, a5, s0 ; RV32I-NEXT: .LBB3_2: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: addi a1, a5, 1 -; RV32I-NEXT: seqz a2, a1 -; RV32I-NEXT: 
add a3, a4, a2 +; RV32I-NEXT: addi a1, a4, 1 ; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: seqz a3, a1 ; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: add a3, a5, a3 ; RV32I-NEXT: and a3, a0, a3 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; RV32I-NEXT: lw a4, 12(sp) +; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 12(sp) ; RV32I-NEXT: bnez a0, .LBB3_5 ; RV32I-NEXT: .LBB3_3: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: bne a4, s1, .LBB3_1 +; RV32I-NEXT: bne a5, s0, .LBB3_1 ; RV32I-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: sltu a0, a5, s2 +; RV32I-NEXT: sltu a0, a4, s2 ; RV32I-NEXT: j .LBB3_2 ; RV32I-NEXT: .LBB3_5: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -553,42 +553,42 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: .cfi_offset s0, -8 ; RV32IA-NEXT: .cfi_offset s1, -12 ; RV32IA-NEXT: .cfi_offset s2, -16 -; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 +; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 4(a0) ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB3_3 ; RV32IA-NEXT: .LBB3_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: sltu a0, a4, s1 +; RV32IA-NEXT: sltu a0, a5, s0 ; RV32IA-NEXT: .LBB3_2: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: addi a1, a5, 1 -; RV32IA-NEXT: seqz a2, a1 -; RV32IA-NEXT: add a3, a4, a2 +; RV32IA-NEXT: addi a1, a4, 1 ; RV32IA-NEXT: neg a0, a0 +; RV32IA-NEXT: seqz a3, a1 ; RV32IA-NEXT: and a2, a0, a1 +; RV32IA-NEXT: add a3, a5, a3 ; RV32IA-NEXT: and a3, a0, a3 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a4, 8(sp) +; RV32IA-NEXT: sw a5, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) +; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB3_5 ; RV32IA-NEXT: .LBB3_3: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: bne a4, s1, .LBB3_1 +; RV32IA-NEXT: bne a5, s0, .LBB3_1 ; RV32IA-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: sltu a0, a5, s2 +; RV32IA-NEXT: sltu a0, a4, s2 ; RV32IA-NEXT: j .LBB3_2 ; RV32IA-NEXT: .LBB3_5: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 +; RV32IA-NEXT: mv a0, a4 +; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -726,8 +726,8 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a3, a0, 3 -; RV32IA-NEXT: andi a0, a3, 24 ; RV32IA-NEXT: li a4, 255 +; RV32IA-NEXT: andi a0, a3, 24 ; RV32IA-NEXT: lw a6, 0(a2) ; RV32IA-NEXT: sll a3, a4, a3 ; RV32IA-NEXT: 
not a3, a3 @@ -827,8 +827,8 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a4, a0, 3 -; RV64IA-NEXT: andi a0, a4, 24 ; RV64IA-NEXT: li a5, 255 +; RV64IA-NEXT: andi a0, a4, 24 ; RV64IA-NEXT: lw a3, 0(a2) ; RV64IA-NEXT: sllw a4, a5, a4 ; RV64IA-NEXT: not a4, a4 @@ -938,8 +938,8 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 ; RV32IA-NEXT: slli a4, a0, 3 -; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: lui a3, 16 +; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: addi a3, a3, -1 ; RV32IA-NEXT: lw a7, 0(a2) ; RV32IA-NEXT: sll a4, a3, a4 @@ -1046,8 +1046,8 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a5, a0, 3 -; RV64IA-NEXT: andi a0, a5, 24 ; RV64IA-NEXT: lui a3, 16 +; RV64IA-NEXT: andi a0, a5, 24 ; RV64IA-NEXT: addiw a3, a3, -1 ; RV64IA-NEXT: lw a4, 0(a2) ; RV64IA-NEXT: sllw a5, a3, a5 @@ -1281,10 +1281,10 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lw a5, 0(a0) ; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB7_2 ; RV32I-NEXT: .LBB7_1: # %atomicrmw.start @@ -1294,17 +1294,17 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_8 ; RV32I-NEXT: lw a5, 8(sp) ; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB7_7 ; RV32I-NEXT: .LBB7_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s1, .LBB7_4 +; RV32I-NEXT: beq a4, s0, .LBB7_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a4 +; RV32I-NEXT: sltu a0, s0, a4 ; RV32I-NEXT: j .LBB7_5 ; RV32I-NEXT: .LBB7_4: # in Loop: Header=BB7_2 Depth=1 ; RV32I-NEXT: sltu a0, s2, a5 @@ -1314,7 +1314,7 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: seqz a1, a1 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: mv a2, s2 -; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: bnez a0, .LBB7_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_2 Depth=1 @@ -1349,10 +1349,10 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: .cfi_offset s0, -8 ; RV32IA-NEXT: .cfi_offset s1, -12 ; RV32IA-NEXT: .cfi_offset s2, -16 -; RV32IA-NEXT: mv s0, a0 +; RV32IA-NEXT: mv s0, a2 +; RV32IA-NEXT: mv s1, a0 ; RV32IA-NEXT: lw a5, 0(a0) ; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB7_2 ; RV32IA-NEXT: .LBB7_1: # %atomicrmw.start @@ -1362,17 +1362,17 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s0 +; RV32IA-NEXT: mv a0, s1 ; RV32IA-NEXT: call __atomic_compare_exchange_8 ; RV32IA-NEXT: lw a5, 8(sp) ; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB7_7 ; RV32IA-NEXT: .LBB7_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s1, .LBB7_4 +; RV32IA-NEXT: beq a4, s0, .LBB7_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in 
Loop: Header=BB7_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a4 +; RV32IA-NEXT: sltu a0, s0, a4 ; RV32IA-NEXT: j .LBB7_5 ; RV32IA-NEXT: .LBB7_4: # in Loop: Header=BB7_2 Depth=1 ; RV32IA-NEXT: sltu a0, s2, a5 @@ -1382,7 +1382,7 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: seqz a1, a1 ; RV32IA-NEXT: or a0, a1, a0 ; RV32IA-NEXT: mv a2, s2 -; RV32IA-NEXT: mv a3, s1 +; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: bnez a0, .LBB7_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_2 Depth=1 diff --git a/llvm/test/CodeGen/RISCV/avgceils.ll b/llvm/test/CodeGen/RISCV/avgceils.ll index 2ff4ad3b3b462..64410fad6029a 100644 --- a/llvm/test/CodeGen/RISCV/avgceils.ll +++ b/llvm/test/CodeGen/RISCV/avgceils.ll @@ -12,8 +12,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind { ; RV32I-LABEL: test_fixed_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a0, a0, 1 @@ -23,8 +23,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind { ; RV64I-LABEL: test_fixed_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 56 -; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a0, a0, 1 @@ -41,8 +41,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind { ; RV32I-LABEL: test_ext_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a0, a0, 1 @@ -52,8 +52,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind { ; RV64I-LABEL: test_ext_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 56 -; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a0, a0, 1 @@ -72,8 +72,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind { ; RV32I-LABEL: test_fixed_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a0, a0, 1 @@ -83,8 +83,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind { ; RV64I-LABEL: test_fixed_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a0, a0, 1 @@ -101,8 +101,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind { ; RV32I-LABEL: test_ext_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a0, a0, 1 @@ -112,8 +112,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind { ; RV64I-LABEL: test_ext_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a0, a0, 1 @@ -183,13 +183,13 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: or a4, a1, a3 ; RV32I-NEXT: xor a1, a1, a3 -; RV32I-NEXT: 
srai a3, a1, 1 -; RV32I-NEXT: sub a4, a4, a3 -; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: xor a3, a0, a2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srai a2, a1, 1 +; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: srli a3, a3, 1 +; RV32I-NEXT: sub a4, a4, a2 ; RV32I-NEXT: or a3, a3, a1 -; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: sltu a1, a0, a3 ; RV32I-NEXT: sub a1, a4, a1 ; RV32I-NEXT: sub a0, a0, a3 @@ -214,13 +214,13 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: or a4, a1, a3 ; RV32I-NEXT: xor a1, a1, a3 -; RV32I-NEXT: srai a3, a1, 1 -; RV32I-NEXT: sub a4, a4, a3 -; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: xor a3, a0, a2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srai a2, a1, 1 +; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: srli a3, a3, 1 +; RV32I-NEXT: sub a4, a4, a2 ; RV32I-NEXT: or a3, a3, a1 -; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: sltu a1, a0, a3 ; RV32I-NEXT: sub a1, a4, a1 ; RV32I-NEXT: sub a0, a0, a3 diff --git a/llvm/test/CodeGen/RISCV/avgceilu.ll b/llvm/test/CodeGen/RISCV/avgceilu.ll index cc12b585036ab..924a50a836dda 100644 --- a/llvm/test/CodeGen/RISCV/avgceilu.ll +++ b/llvm/test/CodeGen/RISCV/avgceilu.ll @@ -132,8 +132,8 @@ define i32 @test_fixed_i32(i32 %a0, i32 %a1) nounwind { ; RV64I-LABEL: test_fixed_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a0, a0, 1 @@ -158,8 +158,8 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind { ; RV64I-LABEL: test_ext_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a0, a0, 1 @@ -179,13 +179,13 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: or a4, a1, a3 ; RV32I-NEXT: xor a1, a1, a3 -; RV32I-NEXT: srli a3, a1, 1 -; RV32I-NEXT: sub a4, a4, a3 -; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: xor a3, a0, a2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a1, 1 +; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: srli a3, a3, 1 +; RV32I-NEXT: sub a4, a4, a2 ; RV32I-NEXT: or a3, a3, a1 -; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: sltu a1, a0, a3 ; RV32I-NEXT: sub a1, a4, a1 ; RV32I-NEXT: sub a0, a0, a3 @@ -210,13 +210,13 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: or a4, a1, a3 ; RV32I-NEXT: xor a1, a1, a3 -; RV32I-NEXT: srli a3, a1, 1 -; RV32I-NEXT: sub a4, a4, a3 -; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: xor a3, a0, a2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a1, 1 +; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: srli a3, a3, 1 +; RV32I-NEXT: sub a4, a4, a2 ; RV32I-NEXT: or a3, a3, a1 -; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: sltu a1, a0, a3 ; RV32I-NEXT: sub a1, a4, a1 ; RV32I-NEXT: sub a0, a0, a3 diff --git a/llvm/test/CodeGen/RISCV/avgfloors.ll b/llvm/test/CodeGen/RISCV/avgfloors.ll index b36177de021d1..b321f4c2f2939 100644 --- a/llvm/test/CodeGen/RISCV/avgfloors.ll +++ b/llvm/test/CodeGen/RISCV/avgfloors.ll @@ -12,8 +12,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind { ; RV32I-LABEL: test_fixed_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srai a0, a0, 1 @@ -22,8 +22,8 @@ define i8 
@test_fixed_i8(i8 %a0, i8 %a1) nounwind { ; RV64I-LABEL: test_fixed_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 56 -; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srai a0, a0, 1 @@ -39,8 +39,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind { ; RV32I-LABEL: test_ext_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srai a0, a0, 1 @@ -49,8 +49,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind { ; RV64I-LABEL: test_ext_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 56 -; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srai a0, a0, 1 @@ -67,8 +67,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind { ; RV32I-LABEL: test_fixed_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srai a0, a0, 1 @@ -77,8 +77,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind { ; RV64I-LABEL: test_fixed_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srai a0, a0, 1 @@ -94,8 +94,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind { ; RV32I-LABEL: test_ext_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srai a0, a0, 1 @@ -104,8 +104,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind { ; RV64I-LABEL: test_ext_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srai a0, a0, 1 @@ -172,8 +172,8 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind { ; RV32I-NEXT: xor a1, a1, a3 ; RV32I-NEXT: srai a3, a1, 1 ; RV32I-NEXT: add a3, a4, a3 -; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: xor a4, a0, a2 +; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: srli a4, a4, 1 ; RV32I-NEXT: or a1, a4, a1 ; RV32I-NEXT: and a2, a0, a2 @@ -203,8 +203,8 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind { ; RV32I-NEXT: xor a1, a1, a3 ; RV32I-NEXT: srai a3, a1, 1 ; RV32I-NEXT: add a3, a4, a3 -; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: xor a4, a0, a2 +; RV32I-NEXT: slli a1, a1, 31 ; RV32I-NEXT: srli a4, a4, 1 ; RV32I-NEXT: or a1, a4, a1 ; RV32I-NEXT: and a2, a0, a2 diff --git a/llvm/test/CodeGen/RISCV/avgflooru.ll b/llvm/test/CodeGen/RISCV/avgflooru.ll index fa88c3760e455..550cc3136bbc3 100644 --- a/llvm/test/CodeGen/RISCV/avgflooru.ll +++ b/llvm/test/CodeGen/RISCV/avgflooru.ll @@ -122,8 +122,8 @@ define i32 @test_fixed_i32(i32 %a0, i32 %a1) nounwind { ; RV64I-LABEL: test_fixed_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a0, a0, 1 @@ -147,8 +147,8 @@ define i32 @test_ext_i32(i32 
%a0, i32 %a1) nounwind { ; RV64I-LABEL: test_ext_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a0, a0, 1 @@ -164,20 +164,20 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind { define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind { ; RV32I-LABEL: test_fixed_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: add a4, a3, a1 +; RV32I-NEXT: add a1, a3, a1 ; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: sltu a1, a0, a2 -; RV32I-NEXT: add a2, a4, a1 -; RV32I-NEXT: beq a2, a3, .LBB6_2 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: beq a1, a3, .LBB6_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu a1, a2, a3 +; RV32I-NEXT: sltu a2, a1, a3 ; RV32I-NEXT: .LBB6_2: -; RV32I-NEXT: slli a1, a1, 31 -; RV32I-NEXT: srli a3, a2, 1 -; RV32I-NEXT: or a1, a3, a1 ; RV32I-NEXT: slli a2, a2, 31 +; RV32I-NEXT: srli a3, a1, 1 +; RV32I-NEXT: slli a4, a1, 31 ; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a3, a2 +; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_fixed_i64: @@ -197,20 +197,20 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind { define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind { ; RV32I-LABEL: test_ext_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: add a4, a3, a1 +; RV32I-NEXT: add a1, a3, a1 ; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: sltu a1, a0, a2 -; RV32I-NEXT: add a2, a4, a1 -; RV32I-NEXT: beq a2, a3, .LBB7_2 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: beq a1, a3, .LBB7_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu a1, a2, a3 +; RV32I-NEXT: sltu a2, a1, a3 ; RV32I-NEXT: .LBB7_2: -; RV32I-NEXT: slli a1, a1, 31 -; RV32I-NEXT: srli a3, a2, 1 -; RV32I-NEXT: or a1, a3, a1 ; RV32I-NEXT: slli a2, a2, 31 +; RV32I-NEXT: srli a3, a1, 1 +; RV32I-NEXT: slli a4, a1, 31 ; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a3, a2 +; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_ext_i64: diff --git a/llvm/test/CodeGen/RISCV/bf16-promote.ll b/llvm/test/CodeGen/RISCV/bf16-promote.ll index c17450a80de96..08c053fab4f67 100644 --- a/llvm/test/CodeGen/RISCV/bf16-promote.ll +++ b/llvm/test/CodeGen/RISCV/bf16-promote.ll @@ -110,13 +110,13 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64-NEXT: lhu a1, 0(a1) ; RV64-NEXT: mv s0, a0 -; RV64-NEXT: lhu a0, 0(a0) -; RV64-NEXT: slli a1, a1, 16 -; RV64-NEXT: fmv.w.x fa5, a1 +; RV64-NEXT: lhu a0, 0(a1) +; RV64-NEXT: lhu a1, 0(s0) ; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: fmv.w.x fa4, a0 +; RV64-NEXT: slli a1, a1, 16 +; RV64-NEXT: fmv.w.x fa5, a0 +; RV64-NEXT: fmv.w.x fa4, a1 ; RV64-NEXT: fadd.s fa0, fa4, fa5 ; RV64-NEXT: call __truncsfbf2 ; RV64-NEXT: fmv.x.w a0, fa0 @@ -131,13 +131,13 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; RV32-NEXT: lhu a1, 0(a1) ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lhu a0, 0(a0) -; RV32-NEXT: slli a1, a1, 16 -; RV32-NEXT: fmv.w.x fa5, a1 +; RV32-NEXT: lhu a0, 0(a1) +; RV32-NEXT: lhu a1, 0(s0) ; RV32-NEXT: slli a0, a0, 16 -; RV32-NEXT: fmv.w.x fa4, a0 +; RV32-NEXT: slli a1, a1, 16 +; RV32-NEXT: fmv.w.x fa5, a0 +; RV32-NEXT: fmv.w.x fa4, a1 ; RV32-NEXT: fadd.s fa0, fa4, 
fa5 ; RV32-NEXT: call __truncsfbf2 ; RV32-NEXT: fmv.x.w a0, fa0 diff --git a/llvm/test/CodeGen/RISCV/bfloat-arith.ll b/llvm/test/CodeGen/RISCV/bfloat-arith.ll index b688af4234e65..871b43e61df50 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-arith.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-arith.ll @@ -102,11 +102,11 @@ define i32 @fneg_bf16(bfloat %a, bfloat %b) nounwind { ; CHECK-LABEL: fneg_bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK-NEXT: lui a0, 1048568 ; CHECK-NEXT: fadd.s fa5, fa5, fa5 ; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fmv.x.h a0, fa5 -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: fmv.x.h a1, fa5 +; CHECK-NEXT: xor a0, a1, a0 ; CHECK-NEXT: fmv.h.x fa4, a0 ; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 ; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 @@ -124,12 +124,12 @@ define bfloat @fsgnjn_bf16(bfloat %a, bfloat %b) nounwind { ; RV32IZFBFMIN: # %bb.0: ; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 ; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 +; RV32IZFBFMIN-NEXT: lui a0, 1048568 ; RV32IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 ; RV32IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 -; RV32IZFBFMIN-NEXT: fmv.x.h a0, fa5 -; RV32IZFBFMIN-NEXT: not a0, a0 -; RV32IZFBFMIN-NEXT: lui a1, 1048568 -; RV32IZFBFMIN-NEXT: and a0, a0, a1 +; RV32IZFBFMIN-NEXT: fmv.x.h a1, fa5 +; RV32IZFBFMIN-NEXT: not a1, a1 +; RV32IZFBFMIN-NEXT: and a0, a1, a0 ; RV32IZFBFMIN-NEXT: fmv.x.h a1, fa0 ; RV32IZFBFMIN-NEXT: slli a1, a1, 17 ; RV32IZFBFMIN-NEXT: srli a1, a1, 17 @@ -141,12 +141,12 @@ define bfloat @fsgnjn_bf16(bfloat %a, bfloat %b) nounwind { ; RV64IZFBFMIN: # %bb.0: ; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 ; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 +; RV64IZFBFMIN-NEXT: lui a0, 1048568 ; RV64IZFBFMIN-NEXT: fadd.s fa5, fa4, fa5 ; RV64IZFBFMIN-NEXT: fcvt.bf16.s fa5, fa5 -; RV64IZFBFMIN-NEXT: fmv.x.h a0, fa5 -; RV64IZFBFMIN-NEXT: not a0, a0 -; RV64IZFBFMIN-NEXT: lui a1, 1048568 -; RV64IZFBFMIN-NEXT: and a0, a0, a1 +; RV64IZFBFMIN-NEXT: fmv.x.h a1, fa5 +; RV64IZFBFMIN-NEXT: not a1, a1 +; RV64IZFBFMIN-NEXT: and a0, a1, a0 ; RV64IZFBFMIN-NEXT: fmv.x.h a1, fa0 ; RV64IZFBFMIN-NEXT: slli a1, a1, 49 ; RV64IZFBFMIN-NEXT: srli a1, a1, 49 @@ -247,16 +247,16 @@ define bfloat @fmsub_bf16(bfloat %a, bfloat %b, bfloat %c) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: fcvt.s.bf16 fa5, fa2 ; CHECK-NEXT: fmv.w.x fa4, zero +; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: fcvt.s.bf16 fa3, fa1 ; CHECK-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fmv.x.h a0, fa5 -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: fmv.x.h a1, fa5 +; CHECK-NEXT: xor a0, a1, a0 ; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 +; CHECK-NEXT: fmadd.s fa5, fa4, fa3, fa5 ; CHECK-NEXT: fcvt.bf16.s fa0, fa5 ; CHECK-NEXT: ret %c_ = fadd bfloat 0.0, %c ; avoid negation using xor @@ -270,17 +270,17 @@ define bfloat @fnmadd_bf16(bfloat %a, bfloat %b, bfloat %c) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 ; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 ; CHECK-NEXT: fcvt.s.bf16 fa3, fa2 +; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-NEXT: fadd.s fa4, fa3, fa4 +; CHECK-NEXT: fcvt.bf16.s fa5, fa5 ; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fmv.x.h a0, fa5 -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: xor a0, a0, a1 -; CHECK-NEXT: 
fmv.h.x fa5, a0 -; CHECK-NEXT: fmv.x.h a0, fa4 -; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: fmv.x.h a1, fa5 +; CHECK-NEXT: fmv.x.h a2, fa4 +; CHECK-NEXT: xor a1, a1, a0 +; CHECK-NEXT: xor a0, a2, a0 +; CHECK-NEXT: fmv.h.x fa5, a1 ; CHECK-NEXT: fmv.h.x fa4, a0 ; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 ; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 @@ -301,17 +301,17 @@ define bfloat @fnmadd_s_2(bfloat %a, bfloat %b, bfloat %c) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 ; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 ; CHECK-NEXT: fcvt.s.bf16 fa3, fa2 +; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-NEXT: fadd.s fa4, fa3, fa4 +; CHECK-NEXT: fcvt.bf16.s fa5, fa5 ; CHECK-NEXT: fcvt.bf16.s fa4, fa4 -; CHECK-NEXT: fmv.x.h a0, fa5 -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: xor a0, a0, a1 -; CHECK-NEXT: fmv.h.x fa5, a0 -; CHECK-NEXT: fmv.x.h a0, fa4 -; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: fmv.x.h a1, fa5 +; CHECK-NEXT: fmv.x.h a2, fa4 +; CHECK-NEXT: xor a1, a1, a0 +; CHECK-NEXT: xor a0, a2, a0 +; CHECK-NEXT: fmv.h.x fa5, a1 ; CHECK-NEXT: fmv.h.x fa4, a0 ; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 ; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 @@ -369,16 +369,16 @@ define bfloat @fnmsub_bf16(bfloat %a, bfloat %b, bfloat %c) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 ; CHECK-NEXT: fmv.w.x fa4, zero +; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: fcvt.s.bf16 fa3, fa2 ; CHECK-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fmv.x.h a0, fa5 -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: fmv.x.h a1, fa5 +; CHECK-NEXT: xor a0, a1, a0 ; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa2 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa1 -; CHECK-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 +; CHECK-NEXT: fmadd.s fa5, fa5, fa4, fa3 ; CHECK-NEXT: fcvt.bf16.s fa0, fa5 ; CHECK-NEXT: ret %a_ = fadd bfloat 0.0, %a @@ -392,16 +392,16 @@ define bfloat @fnmsub_bf16_2(bfloat %a, bfloat %b, bfloat %c) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 ; CHECK-NEXT: fmv.w.x fa4, zero +; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: fcvt.s.bf16 fa3, fa2 ; CHECK-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fmv.x.h a0, fa5 -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: fmv.x.h a1, fa5 +; CHECK-NEXT: xor a0, a1, a0 ; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa2 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 +; CHECK-NEXT: fmadd.s fa5, fa4, fa5, fa3 ; CHECK-NEXT: fcvt.bf16.s fa0, fa5 ; CHECK-NEXT: ret %b_ = fadd bfloat 0.0, %b @@ -432,11 +432,11 @@ define bfloat @fmsub_bf16_contract(bfloat %a, bfloat %b, bfloat %c) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: fcvt.s.bf16 fa5, fa2 ; CHECK-NEXT: fmv.w.x fa4, zero +; CHECK-NEXT: fcvt.s.bf16 fa3, fa1 +; CHECK-NEXT: fcvt.s.bf16 fa2, fa0 ; CHECK-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-NEXT: fmul.s fa4, fa2, fa3 ; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa4, fa1 -; CHECK-NEXT: fcvt.s.bf16 fa3, fa0 -; CHECK-NEXT: fmul.s fa4, fa3, fa4 ; CHECK-NEXT: fcvt.bf16.s fa4, fa4 ; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 ; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 @@ -454,21 +454,21 @@ define bfloat @fnmadd_bf16_contract(bfloat %a, bfloat %b, bfloat %c) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: 
fcvt.s.bf16 fa5, fa0 ; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 ; CHECK-NEXT: fcvt.s.bf16 fa3, fa1 -; CHECK-NEXT: fadd.s fa3, fa3, fa4 -; CHECK-NEXT: fcvt.bf16.s fa3, fa3 ; CHECK-NEXT: fcvt.s.bf16 fa2, fa2 +; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: fadd.s fa5, fa5, fa4 +; CHECK-NEXT: fadd.s fa3, fa3, fa4 ; CHECK-NEXT: fadd.s fa4, fa2, fa4 +; CHECK-NEXT: fcvt.bf16.s fa5, fa5 +; CHECK-NEXT: fcvt.bf16.s fa3, fa3 ; CHECK-NEXT: fcvt.bf16.s fa4, fa4 ; CHECK-NEXT: fcvt.s.bf16 fa3, fa3 ; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 ; CHECK-NEXT: fmul.s fa5, fa5, fa3 ; CHECK-NEXT: fcvt.bf16.s fa5, fa5 -; CHECK-NEXT: fmv.x.h a0, fa5 -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: fmv.x.h a1, fa5 +; CHECK-NEXT: xor a0, a1, a0 ; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 ; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 @@ -489,10 +489,10 @@ define bfloat @fnmsub_bf16_contract(bfloat %a, bfloat %b, bfloat %c) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 ; CHECK-NEXT: fmv.w.x fa4, zero -; CHECK-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-NEXT: fcvt.bf16.s fa5, fa5 ; CHECK-NEXT: fcvt.s.bf16 fa3, fa1 +; CHECK-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-NEXT: fadd.s fa4, fa3, fa4 +; CHECK-NEXT: fcvt.bf16.s fa5, fa5 ; CHECK-NEXT: fcvt.bf16.s fa4, fa4 ; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 ; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 diff --git a/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll b/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll index 243c7d463661a..51ea8873d8c03 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll @@ -292,9 +292,9 @@ define void @br_fcmp_ord(bfloat %a, bfloat %b) nounwind { ; RV32IZFBFMIN-LABEL: br_fcmp_ord: ; RV32IZFBFMIN: # %bb.0: ; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 ; RV32IZFBFMIN-NEXT: feq.s a0, fa5, fa5 -; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; RV32IZFBFMIN-NEXT: feq.s a1, fa5, fa5 +; RV32IZFBFMIN-NEXT: feq.s a1, fa4, fa4 ; RV32IZFBFMIN-NEXT: and a0, a1, a0 ; RV32IZFBFMIN-NEXT: bnez a0, .LBB8_2 ; RV32IZFBFMIN-NEXT: # %bb.1: # %if.else @@ -307,9 +307,9 @@ define void @br_fcmp_ord(bfloat %a, bfloat %b) nounwind { ; RV64IZFBFMIN-LABEL: br_fcmp_ord: ; RV64IZFBFMIN: # %bb.0: ; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 ; RV64IZFBFMIN-NEXT: feq.s a0, fa5, fa5 -; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; RV64IZFBFMIN-NEXT: feq.s a1, fa5, fa5 +; RV64IZFBFMIN-NEXT: feq.s a1, fa4, fa4 ; RV64IZFBFMIN-NEXT: and a0, a1, a0 ; RV64IZFBFMIN-NEXT: bnez a0, .LBB8_2 ; RV64IZFBFMIN-NEXT: # %bb.1: # %if.else @@ -545,9 +545,9 @@ define void @br_fcmp_uno(bfloat %a, bfloat %b) nounwind { ; RV32IZFBFMIN-LABEL: br_fcmp_uno: ; RV32IZFBFMIN: # %bb.0: ; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 ; RV32IZFBFMIN-NEXT: feq.s a0, fa5, fa5 -; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; RV32IZFBFMIN-NEXT: feq.s a1, fa5, fa5 +; RV32IZFBFMIN-NEXT: feq.s a1, fa4, fa4 ; RV32IZFBFMIN-NEXT: and a0, a1, a0 ; RV32IZFBFMIN-NEXT: beqz a0, .LBB15_2 ; RV32IZFBFMIN-NEXT: # %bb.1: # %if.else @@ -560,9 +560,9 @@ define void @br_fcmp_uno(bfloat %a, bfloat %b) nounwind { ; RV64IZFBFMIN-LABEL: br_fcmp_uno: ; RV64IZFBFMIN: # %bb.0: ; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa1 +; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 ; RV64IZFBFMIN-NEXT: feq.s a0, fa5, fa5 -; RV64IZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; RV64IZFBFMIN-NEXT: feq.s a1, fa5, fa5 +; RV64IZFBFMIN-NEXT: feq.s a1, fa4, fa4 ; 
RV64IZFBFMIN-NEXT: and a0, a1, a0 ; RV64IZFBFMIN-NEXT: beqz a0, .LBB15_2 ; RV64IZFBFMIN-NEXT: # %bb.1: # %if.else diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll index c2c21a30d4e4c..82359769c7c22 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll @@ -51,13 +51,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; CHECK32ZFBFMIN-LABEL: fcvt_si_bf16_sat: ; CHECK32ZFBFMIN: # %bb.0: # %start ; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK32ZFBFMIN-NEXT: feq.s a0, fa5, fa5 -; CHECK32ZFBFMIN-NEXT: neg a0, a0 -; CHECK32ZFBFMIN-NEXT: lui a1, %hi(.LCPI1_0) -; CHECK32ZFBFMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; CHECK32ZFBFMIN-NEXT: lui a1, 815104 -; CHECK32ZFBFMIN-NEXT: fmv.w.x fa3, a1 +; CHECK32ZFBFMIN-NEXT: lui a0, %hi(.LCPI1_0) +; CHECK32ZFBFMIN-NEXT: feq.s a1, fa5, fa5 +; CHECK32ZFBFMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) +; CHECK32ZFBFMIN-NEXT: lui a0, 815104 +; CHECK32ZFBFMIN-NEXT: fmv.w.x fa3, a0 ; CHECK32ZFBFMIN-NEXT: fmax.s fa5, fa5, fa3 +; CHECK32ZFBFMIN-NEXT: neg a0, a1 ; CHECK32ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32ZFBFMIN-NEXT: fcvt.w.s a1, fa5, rtz ; CHECK32ZFBFMIN-NEXT: and a0, a0, a1 @@ -66,14 +66,14 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; RV32ID-LABEL: fcvt_si_bf16_sat: ; RV32ID: # %bb.0: # %start ; RV32ID-NEXT: fmv.x.w a0, fa0 -; RV32ID-NEXT: slli a0, a0, 16 -; RV32ID-NEXT: fmv.w.x fa5, a0 -; RV32ID-NEXT: feq.s a0, fa5, fa5 +; RV32ID-NEXT: lui a1, 815104 +; RV32ID-NEXT: fmv.w.x fa5, a1 ; RV32ID-NEXT: lui a1, %hi(.LCPI1_0) +; RV32ID-NEXT: slli a0, a0, 16 ; RV32ID-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV32ID-NEXT: lui a1, 815104 -; RV32ID-NEXT: fmv.w.x fa3, a1 -; RV32ID-NEXT: fmax.s fa5, fa5, fa3 +; RV32ID-NEXT: fmv.w.x fa3, a0 +; RV32ID-NEXT: feq.s a0, fa3, fa3 +; RV32ID-NEXT: fmax.s fa5, fa3, fa5 ; RV32ID-NEXT: neg a0, a0 ; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.w.s a1, fa5, rtz @@ -83,13 +83,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; CHECK64ZFBFMIN-LABEL: fcvt_si_bf16_sat: ; CHECK64ZFBFMIN: # %bb.0: # %start ; CHECK64ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK64ZFBFMIN-NEXT: feq.s a0, fa5, fa5 -; CHECK64ZFBFMIN-NEXT: neg a0, a0 -; CHECK64ZFBFMIN-NEXT: lui a1, %hi(.LCPI1_0) -; CHECK64ZFBFMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; CHECK64ZFBFMIN-NEXT: lui a1, 815104 -; CHECK64ZFBFMIN-NEXT: fmv.w.x fa3, a1 +; CHECK64ZFBFMIN-NEXT: lui a0, %hi(.LCPI1_0) +; CHECK64ZFBFMIN-NEXT: feq.s a1, fa5, fa5 +; CHECK64ZFBFMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) +; CHECK64ZFBFMIN-NEXT: lui a0, 815104 +; CHECK64ZFBFMIN-NEXT: fmv.w.x fa3, a0 ; CHECK64ZFBFMIN-NEXT: fmax.s fa5, fa5, fa3 +; CHECK64ZFBFMIN-NEXT: neg a0, a1 ; CHECK64ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64ZFBFMIN-NEXT: fcvt.l.s a1, fa5, rtz ; CHECK64ZFBFMIN-NEXT: and a0, a0, a1 @@ -98,14 +98,14 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_si_bf16_sat: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 16 -; RV64ID-NEXT: fmv.w.x fa5, a0 -; RV64ID-NEXT: feq.s a0, fa5, fa5 +; RV64ID-NEXT: lui a1, 815104 +; RV64ID-NEXT: fmv.w.x fa5, a1 ; RV64ID-NEXT: lui a1, %hi(.LCPI1_0) +; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV64ID-NEXT: lui a1, 815104 -; RV64ID-NEXT: fmv.w.x fa3, a1 -; RV64ID-NEXT: fmax.s fa5, fa5, fa3 +; RV64ID-NEXT: fmv.w.x fa3, a0 +; RV64ID-NEXT: feq.s a0, fa3, fa3 +; RV64ID-NEXT: fmax.s fa5, fa3, fa5 ; RV64ID-NEXT: neg a0, a0 ; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: 
fcvt.l.s a1, fa5, rtz @@ -163,10 +163,10 @@ define i16 @fcvt_ui_bf16_sat(bfloat %a) nounwind { ; ; RV32ID-LABEL: fcvt_ui_bf16_sat: ; RV32ID: # %bb.0: # %start +; RV32ID-NEXT: lui a0, %hi(.LCPI3_0) +; RV32ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0) ; RV32ID-NEXT: fmv.x.w a0, fa0 ; RV32ID-NEXT: slli a0, a0, 16 -; RV32ID-NEXT: lui a1, %hi(.LCPI3_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI3_0)(a1) ; RV32ID-NEXT: fmv.w.x fa4, a0 ; RV32ID-NEXT: fmv.w.x fa3, zero ; RV32ID-NEXT: fmax.s fa4, fa4, fa3 @@ -187,10 +187,10 @@ define i16 @fcvt_ui_bf16_sat(bfloat %a) nounwind { ; ; RV64ID-LABEL: fcvt_ui_bf16_sat: ; RV64ID: # %bb.0: # %start +; RV64ID-NEXT: lui a0, %hi(.LCPI3_0) +; RV64ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0) ; RV64ID-NEXT: fmv.x.w a0, fa0 ; RV64ID-NEXT: slli a0, a0, 16 -; RV64ID-NEXT: lui a1, %hi(.LCPI3_0) -; RV64ID-NEXT: flw fa5, %lo(.LCPI3_0)(a1) ; RV64ID-NEXT: fmv.w.x fa4, a0 ; RV64ID-NEXT: fmv.w.x fa3, zero ; RV64ID-NEXT: fmax.s fa4, fa4, fa3 @@ -466,7 +466,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; RV32IZFBFMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IZFBFMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFBFMIN-NEXT: call __fixsfdi -; RV32IZFBFMIN-NEXT: lui a4, 524288 +; RV32IZFBFMIN-NEXT: lui a3, 524288 ; RV32IZFBFMIN-NEXT: lui a2, 524288 ; RV32IZFBFMIN-NEXT: beqz s0, .LBB10_2 ; RV32IZFBFMIN-NEXT: # %bb.1: # %start @@ -474,19 +474,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; RV32IZFBFMIN-NEXT: .LBB10_2: # %start ; RV32IZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0) ; RV32IZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32IZFBFMIN-NEXT: flt.s a3, fa5, fs0 -; RV32IZFBFMIN-NEXT: beqz a3, .LBB10_4 +; RV32IZFBFMIN-NEXT: flt.s a1, fa5, fs0 +; RV32IZFBFMIN-NEXT: beqz a1, .LBB10_4 ; RV32IZFBFMIN-NEXT: # %bb.3: -; RV32IZFBFMIN-NEXT: addi a2, a4, -1 +; RV32IZFBFMIN-NEXT: addi a2, a3, -1 ; RV32IZFBFMIN-NEXT: .LBB10_4: # %start -; RV32IZFBFMIN-NEXT: feq.s a1, fs0, fs0 +; RV32IZFBFMIN-NEXT: feq.s a3, fs0, fs0 ; RV32IZFBFMIN-NEXT: neg a4, a1 -; RV32IZFBFMIN-NEXT: and a1, a4, a2 -; RV32IZFBFMIN-NEXT: neg a2, a3 -; RV32IZFBFMIN-NEXT: neg a3, s0 +; RV32IZFBFMIN-NEXT: neg a1, s0 +; RV32IZFBFMIN-NEXT: neg a3, a3 +; RV32IZFBFMIN-NEXT: and a0, a1, a0 +; RV32IZFBFMIN-NEXT: and a1, a3, a2 +; RV32IZFBFMIN-NEXT: or a0, a4, a0 ; RV32IZFBFMIN-NEXT: and a0, a3, a0 -; RV32IZFBFMIN-NEXT: or a0, a2, a0 -; RV32IZFBFMIN-NEXT: and a0, a4, a0 ; RV32IZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFBFMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -505,7 +505,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; R32IDZFBFMIN-NEXT: fle.s s0, fa5, fs0 ; R32IDZFBFMIN-NEXT: fmv.s fa0, fs0 ; R32IDZFBFMIN-NEXT: call __fixsfdi -; R32IDZFBFMIN-NEXT: lui a4, 524288 +; R32IDZFBFMIN-NEXT: lui a3, 524288 ; R32IDZFBFMIN-NEXT: lui a2, 524288 ; R32IDZFBFMIN-NEXT: beqz s0, .LBB10_2 ; R32IDZFBFMIN-NEXT: # %bb.1: # %start @@ -513,19 +513,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; R32IDZFBFMIN-NEXT: .LBB10_2: # %start ; R32IDZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0) ; R32IDZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; R32IDZFBFMIN-NEXT: flt.s a3, fa5, fs0 -; R32IDZFBFMIN-NEXT: beqz a3, .LBB10_4 +; R32IDZFBFMIN-NEXT: flt.s a1, fa5, fs0 +; R32IDZFBFMIN-NEXT: beqz a1, .LBB10_4 ; R32IDZFBFMIN-NEXT: # %bb.3: -; R32IDZFBFMIN-NEXT: addi a2, a4, -1 +; R32IDZFBFMIN-NEXT: addi a2, a3, -1 ; R32IDZFBFMIN-NEXT: .LBB10_4: # %start -; R32IDZFBFMIN-NEXT: feq.s a1, fs0, fs0 +; R32IDZFBFMIN-NEXT: feq.s a3, fs0, fs0 ; R32IDZFBFMIN-NEXT: neg a4, a1 -; R32IDZFBFMIN-NEXT: and a1, a4, a2 -; 
R32IDZFBFMIN-NEXT: neg a2, a3 -; R32IDZFBFMIN-NEXT: neg a3, s0 +; R32IDZFBFMIN-NEXT: neg a1, s0 +; R32IDZFBFMIN-NEXT: neg a3, a3 +; R32IDZFBFMIN-NEXT: and a0, a1, a0 +; R32IDZFBFMIN-NEXT: and a1, a3, a2 +; R32IDZFBFMIN-NEXT: or a0, a4, a0 ; R32IDZFBFMIN-NEXT: and a0, a3, a0 -; R32IDZFBFMIN-NEXT: or a0, a2, a0 -; R32IDZFBFMIN-NEXT: and a0, a4, a0 ; R32IDZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; R32IDZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; R32IDZFBFMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -546,7 +546,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; RV32ID-NEXT: fle.s s0, fa5, fs0 ; RV32ID-NEXT: fmv.s fa0, fs0 ; RV32ID-NEXT: call __fixsfdi -; RV32ID-NEXT: lui a4, 524288 +; RV32ID-NEXT: lui a3, 524288 ; RV32ID-NEXT: lui a2, 524288 ; RV32ID-NEXT: beqz s0, .LBB10_2 ; RV32ID-NEXT: # %bb.1: # %start @@ -554,19 +554,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; RV32ID-NEXT: .LBB10_2: # %start ; RV32ID-NEXT: lui a1, %hi(.LCPI10_0) ; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32ID-NEXT: flt.s a3, fa5, fs0 -; RV32ID-NEXT: beqz a3, .LBB10_4 +; RV32ID-NEXT: flt.s a1, fa5, fs0 +; RV32ID-NEXT: beqz a1, .LBB10_4 ; RV32ID-NEXT: # %bb.3: -; RV32ID-NEXT: addi a2, a4, -1 +; RV32ID-NEXT: addi a2, a3, -1 ; RV32ID-NEXT: .LBB10_4: # %start -; RV32ID-NEXT: feq.s a1, fs0, fs0 +; RV32ID-NEXT: feq.s a3, fs0, fs0 ; RV32ID-NEXT: neg a4, a1 -; RV32ID-NEXT: and a1, a4, a2 -; RV32ID-NEXT: neg a2, a3 -; RV32ID-NEXT: neg a3, s0 +; RV32ID-NEXT: neg a1, s0 +; RV32ID-NEXT: neg a3, a3 +; RV32ID-NEXT: and a0, a1, a0 +; RV32ID-NEXT: and a1, a3, a2 +; RV32ID-NEXT: or a0, a4, a0 ; RV32ID-NEXT: and a0, a3, a0 -; RV32ID-NEXT: or a0, a2, a0 -; RV32ID-NEXT: and a0, a4, a0 ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32ID-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -650,15 +650,15 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind { ; CHECK32ZFBFMIN-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK32ZFBFMIN-NEXT: flw fa5, %lo(.LCPI12_0)(a0) ; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa0, fa0 -; CHECK32ZFBFMIN-NEXT: flt.s a0, fa5, fa0 -; CHECK32ZFBFMIN-NEXT: neg s0, a0 -; CHECK32ZFBFMIN-NEXT: fmv.w.x fa5, zero -; CHECK32ZFBFMIN-NEXT: fle.s a0, fa5, fa0 +; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, zero +; CHECK32ZFBFMIN-NEXT: fle.s a0, fa4, fa0 +; CHECK32ZFBFMIN-NEXT: flt.s a1, fa5, fa0 +; CHECK32ZFBFMIN-NEXT: neg s0, a1 ; CHECK32ZFBFMIN-NEXT: neg s1, a0 ; CHECK32ZFBFMIN-NEXT: call __fixunssfdi ; CHECK32ZFBFMIN-NEXT: and a0, s1, a0 -; CHECK32ZFBFMIN-NEXT: or a0, s0, a0 ; CHECK32ZFBFMIN-NEXT: and a1, s1, a1 +; CHECK32ZFBFMIN-NEXT: or a0, s0, a0 ; CHECK32ZFBFMIN-NEXT: or a1, s0, a1 ; CHECK32ZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; CHECK32ZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -672,20 +672,20 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind { ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32ID-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32ID-NEXT: lui a0, %hi(.LCPI12_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI12_0)(a0) ; RV32ID-NEXT: fmv.x.w a0, fa0 +; RV32ID-NEXT: lui a1, %hi(.LCPI12_0) +; RV32ID-NEXT: fmv.w.x fa5, zero +; RV32ID-NEXT: flw fa4, %lo(.LCPI12_0)(a1) ; RV32ID-NEXT: slli a0, a0, 16 ; RV32ID-NEXT: fmv.w.x fa0, a0 -; RV32ID-NEXT: flt.s a0, fa5, fa0 -; RV32ID-NEXT: neg s0, a0 -; RV32ID-NEXT: fmv.w.x fa5, zero ; RV32ID-NEXT: fle.s a0, fa5, fa0 +; RV32ID-NEXT: flt.s a1, fa4, fa0 +; RV32ID-NEXT: neg s0, a1 ; RV32ID-NEXT: neg s1, a0 ; RV32ID-NEXT: call 
__fixunssfdi ; RV32ID-NEXT: and a0, s1, a0 -; RV32ID-NEXT: or a0, s0, a0 ; RV32ID-NEXT: and a1, s1, a1 +; RV32ID-NEXT: or a0, s0, a0 ; RV32ID-NEXT: or a1, s0, a1 ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1251,7 +1251,7 @@ define double @fcvt_d_bf16(bfloat %a) nounwind { ; ; R32IDZFBFMIN-LABEL: fcvt_d_bf16: ; R32IDZFBFMIN: # %bb.0: -; R32IDZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; R32IDZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0, dyn ; R32IDZFBFMIN-NEXT: fcvt.d.s fa0, fa5 ; R32IDZFBFMIN-NEXT: ret ; @@ -1275,7 +1275,7 @@ define double @fcvt_d_bf16(bfloat %a) nounwind { ; ; RV64IDZFBFMIN-LABEL: fcvt_d_bf16: ; RV64IDZFBFMIN: # %bb.0: -; RV64IDZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; RV64IDZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0, dyn ; RV64IDZFBFMIN-NEXT: fcvt.d.s fa0, fa5 ; RV64IDZFBFMIN-NEXT: ret ; @@ -1498,12 +1498,12 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind { ; CHECK32ZFBFMIN-LABEL: fcvt_w_s_sat_i8: ; CHECK32ZFBFMIN: # %bb.0: # %start ; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK32ZFBFMIN-NEXT: lui a0, 798720 +; CHECK32ZFBFMIN-NEXT: lui a1, 274400 +; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, a0 ; CHECK32ZFBFMIN-NEXT: feq.s a0, fa5, fa5 ; CHECK32ZFBFMIN-NEXT: neg a0, a0 -; CHECK32ZFBFMIN-NEXT: lui a1, 798720 -; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, a1 ; CHECK32ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 -; CHECK32ZFBFMIN-NEXT: lui a1, 274400 ; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, a1 ; CHECK32ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32ZFBFMIN-NEXT: fcvt.w.s a1, fa5, rtz @@ -1513,15 +1513,15 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind { ; RV32ID-LABEL: fcvt_w_s_sat_i8: ; RV32ID: # %bb.0: # %start ; RV32ID-NEXT: fmv.x.w a0, fa0 -; RV32ID-NEXT: slli a0, a0, 16 -; RV32ID-NEXT: fmv.w.x fa5, a0 -; RV32ID-NEXT: feq.s a0, fa5, fa5 -; RV32ID-NEXT: neg a0, a0 ; RV32ID-NEXT: lui a1, 798720 -; RV32ID-NEXT: fmv.w.x fa4, a1 -; RV32ID-NEXT: fmax.s fa5, fa5, fa4 +; RV32ID-NEXT: fmv.w.x fa5, a1 ; RV32ID-NEXT: lui a1, 274400 +; RV32ID-NEXT: slli a0, a0, 16 +; RV32ID-NEXT: fmv.w.x fa4, a0 +; RV32ID-NEXT: feq.s a0, fa4, fa4 +; RV32ID-NEXT: fmax.s fa5, fa4, fa5 ; RV32ID-NEXT: fmv.w.x fa4, a1 +; RV32ID-NEXT: neg a0, a0 ; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-NEXT: and a0, a0, a1 @@ -1530,12 +1530,12 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind { ; CHECK64ZFBFMIN-LABEL: fcvt_w_s_sat_i8: ; CHECK64ZFBFMIN: # %bb.0: # %start ; CHECK64ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK64ZFBFMIN-NEXT: lui a0, 798720 +; CHECK64ZFBFMIN-NEXT: lui a1, 274400 +; CHECK64ZFBFMIN-NEXT: fmv.w.x fa4, a0 ; CHECK64ZFBFMIN-NEXT: feq.s a0, fa5, fa5 ; CHECK64ZFBFMIN-NEXT: neg a0, a0 -; CHECK64ZFBFMIN-NEXT: lui a1, 798720 -; CHECK64ZFBFMIN-NEXT: fmv.w.x fa4, a1 ; CHECK64ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 -; CHECK64ZFBFMIN-NEXT: lui a1, 274400 ; CHECK64ZFBFMIN-NEXT: fmv.w.x fa4, a1 ; CHECK64ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64ZFBFMIN-NEXT: fcvt.l.s a1, fa5, rtz @@ -1545,15 +1545,15 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_w_s_sat_i8: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 16 -; RV64ID-NEXT: fmv.w.x fa5, a0 -; RV64ID-NEXT: feq.s a0, fa5, fa5 -; RV64ID-NEXT: neg a0, a0 ; RV64ID-NEXT: lui a1, 798720 -; RV64ID-NEXT: fmv.w.x fa4, a1 -; RV64ID-NEXT: fmax.s fa5, fa5, fa4 +; RV64ID-NEXT: fmv.w.x fa5, a1 ; RV64ID-NEXT: lui a1, 274400 +; RV64ID-NEXT: slli a0, a0, 16 +; RV64ID-NEXT: fmv.w.x fa4, a0 +; RV64ID-NEXT: feq.s a0, fa4, fa4 +; RV64ID-NEXT: 
fmax.s fa5, fa4, fa5 ; RV64ID-NEXT: fmv.w.x fa4, a1 +; RV64ID-NEXT: neg a0, a0 ; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-NEXT: and a0, a0, a1 @@ -1601,8 +1601,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind { ; CHECK32ZFBFMIN: # %bb.0: # %start ; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 ; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, zero -; CHECK32ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 ; CHECK32ZFBFMIN-NEXT: lui a0, 276464 +; CHECK32ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 ; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, a0 ; CHECK32ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32ZFBFMIN-NEXT: fcvt.wu.s a0, fa5, rtz @@ -1611,11 +1611,11 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind { ; RV32ID-LABEL: fcvt_wu_s_sat_i8: ; RV32ID: # %bb.0: # %start ; RV32ID-NEXT: fmv.x.w a0, fa0 +; RV32ID-NEXT: fmv.w.x fa5, zero ; RV32ID-NEXT: slli a0, a0, 16 -; RV32ID-NEXT: fmv.w.x fa5, a0 -; RV32ID-NEXT: fmv.w.x fa4, zero -; RV32ID-NEXT: fmax.s fa5, fa5, fa4 +; RV32ID-NEXT: fmv.w.x fa4, a0 ; RV32ID-NEXT: lui a0, 276464 +; RV32ID-NEXT: fmax.s fa5, fa4, fa5 ; RV32ID-NEXT: fmv.w.x fa4, a0 ; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.wu.s a0, fa5, rtz @@ -1625,8 +1625,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind { ; CHECK64ZFBFMIN: # %bb.0: # %start ; CHECK64ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 ; CHECK64ZFBFMIN-NEXT: fmv.w.x fa4, zero -; CHECK64ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 ; CHECK64ZFBFMIN-NEXT: lui a0, 276464 +; CHECK64ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 ; CHECK64ZFBFMIN-NEXT: fmv.w.x fa4, a0 ; CHECK64ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64ZFBFMIN-NEXT: fcvt.lu.s a0, fa5, rtz @@ -1635,11 +1635,11 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_wu_s_sat_i8: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 +; RV64ID-NEXT: fmv.w.x fa5, zero ; RV64ID-NEXT: slli a0, a0, 16 -; RV64ID-NEXT: fmv.w.x fa5, a0 -; RV64ID-NEXT: fmv.w.x fa4, zero -; RV64ID-NEXT: fmax.s fa5, fa5, fa4 +; RV64ID-NEXT: fmv.w.x fa4, a0 ; RV64ID-NEXT: lui a0, 276464 +; RV64ID-NEXT: fmax.s fa5, fa4, fa5 ; RV64ID-NEXT: fmv.w.x fa4, a0 ; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz diff --git a/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll b/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll index 9d5ba73de191d..cc572ce489f62 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll @@ -92,9 +92,9 @@ define i32 @fcmp_ord(bfloat %a, bfloat %b) nounwind { ; CHECK-LABEL: fcmp_ord: ; CHECK: # %bb.0: ; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 ; CHECK-NEXT: feq.s a0, fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: feq.s a1, fa5, fa5 +; CHECK-NEXT: feq.s a1, fa4, fa4 ; CHECK-NEXT: and a0, a1, a0 ; CHECK-NEXT: ret %1 = fcmp ord bfloat %a, %b @@ -186,9 +186,9 @@ define i32 @fcmp_uno(bfloat %a, bfloat %b) nounwind { ; CHECK-LABEL: fcmp_uno: ; CHECK: # %bb.0: ; CHECK-NEXT: fcvt.s.bf16 fa5, fa1 +; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 ; CHECK-NEXT: feq.s a0, fa5, fa5 -; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK-NEXT: feq.s a1, fa5, fa5 +; CHECK-NEXT: feq.s a1, fa4, fa4 ; CHECK-NEXT: and a0, a1, a0 ; CHECK-NEXT: xori a0, a0, 1 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/bfloat-mem.ll b/llvm/test/CodeGen/RISCV/bfloat-mem.ll index a9ef261bb9302..f9cf4e523b77d 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-mem.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-mem.ll @@ -50,10 +50,10 @@ define bfloat @flh_fsh_global(bfloat %a, bfloat %b) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: 
fcvt.s.bf16 fa5, fa1 ; CHECK-NEXT: fcvt.s.bf16 fa4, fa0 +; CHECK-NEXT: lui a0, %hi(G) ; CHECK-NEXT: fadd.s fa5, fa4, fa5 +; CHECK-NEXT: flh fa4, %lo(G)(a0) ; CHECK-NEXT: fcvt.bf16.s fa0, fa5 -; CHECK-NEXT: lui a0, %hi(G) -; CHECK-NEXT: flh fa5, %lo(G)(a0) ; CHECK-NEXT: addi a1, a0, %lo(G) ; CHECK-NEXT: fsh fa0, %lo(G)(a0) ; CHECK-NEXT: flh fa5, 18(a1) diff --git a/llvm/test/CodeGen/RISCV/bfloat.ll b/llvm/test/CodeGen/RISCV/bfloat.ll index 9dc8ce6be1ea6..c83b0ed6b0eee 100644 --- a/llvm/test/CodeGen/RISCV/bfloat.ll +++ b/llvm/test/CodeGen/RISCV/bfloat.ll @@ -342,8 +342,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind { ; RV32ID-ILP32-NEXT: addi sp, sp, -16 ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: slli a1, a1, 16 -; RV32ID-ILP32-NEXT: fmv.w.x fa5, a1 ; RV32ID-ILP32-NEXT: slli a0, a0, 16 +; RV32ID-ILP32-NEXT: fmv.w.x fa5, a1 ; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 ; RV32ID-ILP32-NEXT: fadd.s fa5, fa4, fa5 ; RV32ID-ILP32-NEXT: fmv.x.w a0, fa5 @@ -359,8 +359,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind { ; RV64ID-LP64-NEXT: addi sp, sp, -16 ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: slli a1, a1, 16 -; RV64ID-LP64-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64-NEXT: slli a0, a0, 16 +; RV64ID-LP64-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 ; RV64ID-LP64-NEXT: fadd.s fa5, fa4, fa5 ; RV64ID-LP64-NEXT: fmv.x.w a0, fa5 @@ -378,8 +378,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind { ; RV32ID-ILP32D-NEXT: fmv.x.w a0, fa0 ; RV32ID-ILP32D-NEXT: fmv.x.w a1, fa1 ; RV32ID-ILP32D-NEXT: slli a1, a1, 16 -; RV32ID-ILP32D-NEXT: fmv.w.x fa5, a1 ; RV32ID-ILP32D-NEXT: slli a0, a0, 16 +; RV32ID-ILP32D-NEXT: fmv.w.x fa5, a1 ; RV32ID-ILP32D-NEXT: fmv.w.x fa4, a0 ; RV32ID-ILP32D-NEXT: fadd.s fa0, fa4, fa5 ; RV32ID-ILP32D-NEXT: call __truncsfbf2 @@ -398,8 +398,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind { ; RV64ID-LP64D-NEXT: fmv.x.w a0, fa0 ; RV64ID-LP64D-NEXT: fmv.x.w a1, fa1 ; RV64ID-LP64D-NEXT: slli a1, a1, 16 -; RV64ID-LP64D-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64D-NEXT: slli a0, a0, 16 +; RV64ID-LP64D-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64D-NEXT: fmv.w.x fa4, a0 ; RV64ID-LP64D-NEXT: fadd.s fa0, fa4, fa5 ; RV64ID-LP64D-NEXT: call __truncsfbf2 @@ -450,8 +450,8 @@ define bfloat @bfloat_load(ptr %a) nounwind { ; RV32ID-ILP32-NEXT: lhu a1, 6(a0) ; RV32ID-ILP32-NEXT: lhu a0, 0(a0) ; RV32ID-ILP32-NEXT: slli a1, a1, 16 -; RV32ID-ILP32-NEXT: fmv.w.x fa5, a1 ; RV32ID-ILP32-NEXT: slli a0, a0, 16 +; RV32ID-ILP32-NEXT: fmv.w.x fa5, a1 ; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 ; RV32ID-ILP32-NEXT: fadd.s fa5, fa4, fa5 ; RV32ID-ILP32-NEXT: fmv.x.w a0, fa5 @@ -469,8 +469,8 @@ define bfloat @bfloat_load(ptr %a) nounwind { ; RV64ID-LP64-NEXT: lhu a1, 6(a0) ; RV64ID-LP64-NEXT: lhu a0, 0(a0) ; RV64ID-LP64-NEXT: slli a1, a1, 16 -; RV64ID-LP64-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64-NEXT: slli a0, a0, 16 +; RV64ID-LP64-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 ; RV64ID-LP64-NEXT: fadd.s fa5, fa4, fa5 ; RV64ID-LP64-NEXT: fmv.x.w a0, fa5 @@ -488,8 +488,8 @@ define bfloat @bfloat_load(ptr %a) nounwind { ; RV32ID-ILP32D-NEXT: lhu a1, 6(a0) ; RV32ID-ILP32D-NEXT: lhu a0, 0(a0) ; RV32ID-ILP32D-NEXT: slli a1, a1, 16 -; RV32ID-ILP32D-NEXT: fmv.w.x fa5, a1 ; RV32ID-ILP32D-NEXT: slli a0, a0, 16 +; RV32ID-ILP32D-NEXT: fmv.w.x fa5, a1 ; RV32ID-ILP32D-NEXT: fmv.w.x fa4, a0 ; RV32ID-ILP32D-NEXT: fadd.s fa0, fa4, fa5 ; RV32ID-ILP32D-NEXT: call __truncsfbf2 @@ -508,8 +508,8 @@ define bfloat @bfloat_load(ptr %a) nounwind { 
; RV64ID-LP64D-NEXT: lhu a1, 6(a0) ; RV64ID-LP64D-NEXT: lhu a0, 0(a0) ; RV64ID-LP64D-NEXT: slli a1, a1, 16 -; RV64ID-LP64D-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64D-NEXT: slli a0, a0, 16 +; RV64ID-LP64D-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64D-NEXT: fmv.w.x fa4, a0 ; RV64ID-LP64D-NEXT: fadd.s fa0, fa4, fa5 ; RV64ID-LP64D-NEXT: call __truncsfbf2 @@ -569,8 +569,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind { ; RV32ID-ILP32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: mv s0, a0 ; RV32ID-ILP32-NEXT: slli a2, a2, 16 -; RV32ID-ILP32-NEXT: fmv.w.x fa5, a2 ; RV32ID-ILP32-NEXT: slli a1, a1, 16 +; RV32ID-ILP32-NEXT: fmv.w.x fa5, a2 ; RV32ID-ILP32-NEXT: fmv.w.x fa4, a1 ; RV32ID-ILP32-NEXT: fadd.s fa5, fa4, fa5 ; RV32ID-ILP32-NEXT: fmv.x.w a0, fa5 @@ -589,8 +589,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind { ; RV64ID-LP64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: mv s0, a0 ; RV64ID-LP64-NEXT: slli a2, a2, 16 -; RV64ID-LP64-NEXT: fmv.w.x fa5, a2 ; RV64ID-LP64-NEXT: slli a1, a1, 16 +; RV64ID-LP64-NEXT: fmv.w.x fa5, a2 ; RV64ID-LP64-NEXT: fmv.w.x fa4, a1 ; RV64ID-LP64-NEXT: fadd.s fa5, fa4, fa5 ; RV64ID-LP64-NEXT: fmv.x.w a0, fa5 @@ -611,8 +611,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind { ; RV32ID-ILP32D-NEXT: fmv.x.w a0, fa0 ; RV32ID-ILP32D-NEXT: fmv.x.w a1, fa1 ; RV32ID-ILP32D-NEXT: slli a1, a1, 16 -; RV32ID-ILP32D-NEXT: fmv.w.x fa5, a1 ; RV32ID-ILP32D-NEXT: slli a0, a0, 16 +; RV32ID-ILP32D-NEXT: fmv.w.x fa5, a1 ; RV32ID-ILP32D-NEXT: fmv.w.x fa4, a0 ; RV32ID-ILP32D-NEXT: fadd.s fa0, fa4, fa5 ; RV32ID-ILP32D-NEXT: call __truncsfbf2 @@ -633,8 +633,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind { ; RV64ID-LP64D-NEXT: fmv.x.w a0, fa0 ; RV64ID-LP64D-NEXT: fmv.x.w a1, fa1 ; RV64ID-LP64D-NEXT: slli a1, a1, 16 -; RV64ID-LP64D-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64D-NEXT: slli a0, a0, 16 +; RV64ID-LP64D-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64D-NEXT: fmv.w.x fa4, a0 ; RV64ID-LP64D-NEXT: fadd.s fa0, fa4, fa5 ; RV64ID-LP64D-NEXT: call __truncsfbf2 diff --git a/llvm/test/CodeGen/RISCV/bitextract-mac.ll b/llvm/test/CodeGen/RISCV/bitextract-mac.ll index ce1e0c4711ffb..41a32656e3257 100644 --- a/llvm/test/CodeGen/RISCV/bitextract-mac.ll +++ b/llvm/test/CodeGen/RISCV/bitextract-mac.ll @@ -25,8 +25,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: mul a0, a1, a0 ; RV32I-NEXT: slli a1, a0, 26 -; RV32I-NEXT: srli a1, a1, 28 ; RV32I-NEXT: slli a0, a0, 20 +; RV32I-NEXT: srli a1, a1, 28 ; RV32I-NEXT: srli a0, a0, 25 ; RV32I-NEXT: mul a0, a1, a0 ; RV32I-NEXT: add a0, a0, a2 @@ -36,8 +36,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) { ; RV32ZBB: # %bb.0: # %entry ; RV32ZBB-NEXT: mul a0, a1, a0 ; RV32ZBB-NEXT: slli a1, a0, 26 -; RV32ZBB-NEXT: srli a1, a1, 28 ; RV32ZBB-NEXT: slli a0, a0, 20 +; RV32ZBB-NEXT: srli a1, a1, 28 ; RV32ZBB-NEXT: srli a0, a0, 25 ; RV32ZBB-NEXT: mul a0, a1, a0 ; RV32ZBB-NEXT: add a0, a0, a2 @@ -56,8 +56,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) { ; RV32XTHEADMAC: # %bb.0: # %entry ; RV32XTHEADMAC-NEXT: mul a0, a1, a0 ; RV32XTHEADMAC-NEXT: slli a1, a0, 26 -; RV32XTHEADMAC-NEXT: srli a1, a1, 28 ; RV32XTHEADMAC-NEXT: slli a0, a0, 20 +; RV32XTHEADMAC-NEXT: srli a1, a1, 28 ; RV32XTHEADMAC-NEXT: srli a0, a0, 25 ; RV32XTHEADMAC-NEXT: th.mulah a2, a1, a0 ; RV32XTHEADMAC-NEXT: mv a0, a2 @@ -76,8 +76,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: mul a0, a1, a0 ; RV64I-NEXT: slli a1, a0, 58 -; RV64I-NEXT: srli a1, a1, 60 ; 
RV64I-NEXT: slli a0, a0, 52 +; RV64I-NEXT: srli a1, a1, 60 ; RV64I-NEXT: srli a0, a0, 57 ; RV64I-NEXT: mul a0, a1, a0 ; RV64I-NEXT: addw a0, a0, a2 @@ -87,8 +87,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) { ; RV64ZBB: # %bb.0: # %entry ; RV64ZBB-NEXT: mul a0, a1, a0 ; RV64ZBB-NEXT: slli a1, a0, 58 -; RV64ZBB-NEXT: srli a1, a1, 60 ; RV64ZBB-NEXT: slli a0, a0, 52 +; RV64ZBB-NEXT: srli a1, a1, 60 ; RV64ZBB-NEXT: srli a0, a0, 57 ; RV64ZBB-NEXT: mul a0, a1, a0 ; RV64ZBB-NEXT: addw a0, a0, a2 @@ -98,8 +98,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) { ; RV64XTHEADMAC: # %bb.0: # %entry ; RV64XTHEADMAC-NEXT: mul a0, a1, a0 ; RV64XTHEADMAC-NEXT: slli a1, a0, 58 -; RV64XTHEADMAC-NEXT: srli a1, a1, 60 ; RV64XTHEADMAC-NEXT: slli a0, a0, 52 +; RV64XTHEADMAC-NEXT: srli a1, a1, 60 ; RV64XTHEADMAC-NEXT: srli a0, a0, 57 ; RV64XTHEADMAC-NEXT: th.mulah a2, a1, a0 ; RV64XTHEADMAC-NEXT: mv a0, a2 diff --git a/llvm/test/CodeGen/RISCV/bittest.ll b/llvm/test/CodeGen/RISCV/bittest.ll index d280e5ee46b7c..d69ab0550a034 100644 --- a/llvm/test/CodeGen/RISCV/bittest.ll +++ b/llvm/test/CodeGen/RISCV/bittest.ll @@ -751,10 +751,10 @@ define signext i32 @bit_31_nz_select_i32(i32 signext %a, i32 signext %b, i32 sig define i64 @bit_10_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_10_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: andi a6, a0, 1024 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: andi a3, a0, 1024 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a6, .LBB23_2 +; RV32-NEXT: beqz a3, .LBB23_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -779,11 +779,11 @@ define i64 @bit_10_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_10_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I-LABEL: bit_10_nz_select_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a0, a0, 21 -; RV32I-NEXT: srli a6, a0, 31 ; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: slli a0, a0, 21 +; RV32I-NEXT: srli a3, a0, 31 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a6, .LBB24_2 +; RV32I-NEXT: bnez a3, .LBB24_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 @@ -802,10 +802,10 @@ define i64 @bit_10_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV32ZBS-LABEL: bit_10_nz_select_i64: ; RV32ZBS: # %bb.0: -; RV32ZBS-NEXT: bexti a6, a0, 10 ; RV32ZBS-NEXT: mv a1, a3 +; RV32ZBS-NEXT: bexti a3, a0, 10 ; RV32ZBS-NEXT: mv a0, a2 -; RV32ZBS-NEXT: bnez a6, .LBB24_2 +; RV32ZBS-NEXT: bnez a3, .LBB24_2 ; RV32ZBS-NEXT: # %bb.1: ; RV32ZBS-NEXT: mv a0, a4 ; RV32ZBS-NEXT: mv a1, a5 @@ -814,10 +814,10 @@ define i64 @bit_10_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV32XTHEADBS-LABEL: bit_10_nz_select_i64: ; RV32XTHEADBS: # %bb.0: -; RV32XTHEADBS-NEXT: th.tst a6, a0, 10 ; RV32XTHEADBS-NEXT: mv a1, a3 +; RV32XTHEADBS-NEXT: th.tst a3, a0, 10 ; RV32XTHEADBS-NEXT: mv a0, a2 -; RV32XTHEADBS-NEXT: bnez a6, .LBB24_2 +; RV32XTHEADBS-NEXT: bnez a3, .LBB24_2 ; RV32XTHEADBS-NEXT: # %bb.1: ; RV32XTHEADBS-NEXT: mv a0, a4 ; RV32XTHEADBS-NEXT: mv a1, a5 @@ -832,10 +832,10 @@ define i64 @bit_10_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_11_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_11_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: slli a6, a0, 20 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: slli a3, a0, 20 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bgez a6, .LBB25_2 +; RV32-NEXT: bgez a3, .LBB25_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -860,11 +860,11 @@ define i64 @bit_11_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_11_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I-LABEL: bit_11_nz_select_i64: ; RV32I: # %bb.0: -; 
RV32I-NEXT: slli a0, a0, 20 -; RV32I-NEXT: srli a6, a0, 31 ; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: slli a0, a0, 20 +; RV32I-NEXT: srli a3, a0, 31 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a6, .LBB26_2 +; RV32I-NEXT: bnez a3, .LBB26_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 @@ -883,10 +883,10 @@ define i64 @bit_11_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV32ZBS-LABEL: bit_11_nz_select_i64: ; RV32ZBS: # %bb.0: -; RV32ZBS-NEXT: bexti a6, a0, 11 ; RV32ZBS-NEXT: mv a1, a3 +; RV32ZBS-NEXT: bexti a3, a0, 11 ; RV32ZBS-NEXT: mv a0, a2 -; RV32ZBS-NEXT: bnez a6, .LBB26_2 +; RV32ZBS-NEXT: bnez a3, .LBB26_2 ; RV32ZBS-NEXT: # %bb.1: ; RV32ZBS-NEXT: mv a0, a4 ; RV32ZBS-NEXT: mv a1, a5 @@ -895,10 +895,10 @@ define i64 @bit_11_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV32XTHEADBS-LABEL: bit_11_nz_select_i64: ; RV32XTHEADBS: # %bb.0: -; RV32XTHEADBS-NEXT: th.tst a6, a0, 11 ; RV32XTHEADBS-NEXT: mv a1, a3 +; RV32XTHEADBS-NEXT: th.tst a3, a0, 11 ; RV32XTHEADBS-NEXT: mv a0, a2 -; RV32XTHEADBS-NEXT: bnez a6, .LBB26_2 +; RV32XTHEADBS-NEXT: bnez a3, .LBB26_2 ; RV32XTHEADBS-NEXT: # %bb.1: ; RV32XTHEADBS-NEXT: mv a0, a4 ; RV32XTHEADBS-NEXT: mv a1, a5 @@ -913,10 +913,10 @@ define i64 @bit_11_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_20_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_20_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: slli a6, a0, 11 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: slli a3, a0, 11 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bgez a6, .LBB27_2 +; RV32-NEXT: bgez a3, .LBB27_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -941,11 +941,11 @@ define i64 @bit_20_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_20_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I-LABEL: bit_20_nz_select_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a0, a0, 11 -; RV32I-NEXT: srli a6, a0, 31 ; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: slli a0, a0, 11 +; RV32I-NEXT: srli a3, a0, 31 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a6, .LBB28_2 +; RV32I-NEXT: bnez a3, .LBB28_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 @@ -964,10 +964,10 @@ define i64 @bit_20_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV32ZBS-LABEL: bit_20_nz_select_i64: ; RV32ZBS: # %bb.0: -; RV32ZBS-NEXT: bexti a6, a0, 20 ; RV32ZBS-NEXT: mv a1, a3 +; RV32ZBS-NEXT: bexti a3, a0, 20 ; RV32ZBS-NEXT: mv a0, a2 -; RV32ZBS-NEXT: bnez a6, .LBB28_2 +; RV32ZBS-NEXT: bnez a3, .LBB28_2 ; RV32ZBS-NEXT: # %bb.1: ; RV32ZBS-NEXT: mv a0, a4 ; RV32ZBS-NEXT: mv a1, a5 @@ -976,10 +976,10 @@ define i64 @bit_20_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV32XTHEADBS-LABEL: bit_20_nz_select_i64: ; RV32XTHEADBS: # %bb.0: -; RV32XTHEADBS-NEXT: th.tst a6, a0, 20 ; RV32XTHEADBS-NEXT: mv a1, a3 +; RV32XTHEADBS-NEXT: th.tst a3, a0, 20 ; RV32XTHEADBS-NEXT: mv a0, a2 -; RV32XTHEADBS-NEXT: bnez a6, .LBB28_2 +; RV32XTHEADBS-NEXT: bnez a3, .LBB28_2 ; RV32XTHEADBS-NEXT: # %bb.1: ; RV32XTHEADBS-NEXT: mv a0, a4 ; RV32XTHEADBS-NEXT: mv a1, a5 @@ -1021,10 +1021,10 @@ define i64 @bit_31_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_31_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_31_nz_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: srli a6, a0, 31 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: srli a3, a0, 31 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a6, .LBB30_2 +; RV32-NEXT: bnez a3, .LBB30_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -1049,14 +1049,14 @@ define i64 @bit_31_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_32_z_select_i64(i64 %a, i64 %b, i64 
%c) { ; RV32-LABEL: bit_32_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: andi a6, a1, 1 -; RV32-NEXT: mv a1, a3 +; RV32-NEXT: andi a1, a1, 1 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a6, .LBB31_2 +; RV32-NEXT: beqz a1, .LBB31_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 -; RV32-NEXT: mv a1, a5 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB31_2: +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_32_z_select_i64: @@ -1077,14 +1077,14 @@ define i64 @bit_32_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_32_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_32_nz_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: andi a6, a1, 1 -; RV32-NEXT: mv a1, a3 +; RV32-NEXT: andi a1, a1, 1 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a6, .LBB32_2 +; RV32-NEXT: bnez a1, .LBB32_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 -; RV32-NEXT: mv a1, a5 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB32_2: +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_32_nz_select_i64: @@ -1105,14 +1105,14 @@ define i64 @bit_32_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_55_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_55_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: slli a6, a1, 8 -; RV32-NEXT: mv a1, a3 +; RV32-NEXT: slli a1, a1, 8 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bgez a6, .LBB33_2 +; RV32-NEXT: bgez a1, .LBB33_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 -; RV32-NEXT: mv a1, a5 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB33_2: +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_55_z_select_i64: @@ -1134,14 +1134,14 @@ define i64 @bit_55_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I-LABEL: bit_55_nz_select_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: srli a6, a1, 31 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: srli a1, a1, 31 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a6, .LBB34_2 +; RV32I-NEXT: bnez a1, .LBB34_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB34_2: +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64-LABEL: bit_55_nz_select_i64: @@ -1156,26 +1156,26 @@ define i64 @bit_55_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV32ZBS-LABEL: bit_55_nz_select_i64: ; RV32ZBS: # %bb.0: -; RV32ZBS-NEXT: bexti a6, a1, 23 -; RV32ZBS-NEXT: mv a1, a3 +; RV32ZBS-NEXT: bexti a1, a1, 23 ; RV32ZBS-NEXT: mv a0, a2 -; RV32ZBS-NEXT: bnez a6, .LBB34_2 +; RV32ZBS-NEXT: bnez a1, .LBB34_2 ; RV32ZBS-NEXT: # %bb.1: ; RV32ZBS-NEXT: mv a0, a4 -; RV32ZBS-NEXT: mv a1, a5 +; RV32ZBS-NEXT: mv a3, a5 ; RV32ZBS-NEXT: .LBB34_2: +; RV32ZBS-NEXT: mv a1, a3 ; RV32ZBS-NEXT: ret ; ; RV32XTHEADBS-LABEL: bit_55_nz_select_i64: ; RV32XTHEADBS: # %bb.0: -; RV32XTHEADBS-NEXT: th.tst a6, a1, 23 -; RV32XTHEADBS-NEXT: mv a1, a3 +; RV32XTHEADBS-NEXT: th.tst a1, a1, 23 ; RV32XTHEADBS-NEXT: mv a0, a2 -; RV32XTHEADBS-NEXT: bnez a6, .LBB34_2 +; RV32XTHEADBS-NEXT: bnez a1, .LBB34_2 ; RV32XTHEADBS-NEXT: # %bb.1: ; RV32XTHEADBS-NEXT: mv a0, a4 -; RV32XTHEADBS-NEXT: mv a1, a5 +; RV32XTHEADBS-NEXT: mv a3, a5 ; RV32XTHEADBS-NEXT: .LBB34_2: +; RV32XTHEADBS-NEXT: mv a1, a3 ; RV32XTHEADBS-NEXT: ret %1 = and i64 %a, 36028797018963968 %2 = icmp ne i64 %1, 0 @@ -1212,14 +1212,14 @@ define i64 @bit_63_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_63_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_63_nz_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: srli a6, a1, 31 -; RV32-NEXT: mv a1, a3 +; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a6, .LBB36_2 +; RV32-NEXT: bnez a1, .LBB36_2 ; RV32-NEXT: # %bb.1: ; 
RV32-NEXT: mv a0, a4 -; RV32-NEXT: mv a1, a5 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB36_2: +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_63_nz_select_i64: @@ -2108,10 +2108,10 @@ define signext i32 @bit_32_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 s define i64 @bit_10_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_10_1_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: andi a6, a0, 1023 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: andi a3, a0, 1023 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a6, .LBB71_2 +; RV32-NEXT: beqz a3, .LBB71_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -2136,10 +2136,10 @@ define i64 @bit_10_1_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_10_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_10_1_nz_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: andi a6, a0, 1023 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: andi a3, a0, 1023 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a6, .LBB72_2 +; RV32-NEXT: bnez a3, .LBB72_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -2164,10 +2164,10 @@ define i64 @bit_10_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_11_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_11_1_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: andi a6, a0, 2047 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: andi a3, a0, 2047 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a6, .LBB73_2 +; RV32-NEXT: beqz a3, .LBB73_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -2192,10 +2192,10 @@ define i64 @bit_11_1_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_11_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_11_1_nz_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: andi a6, a0, 2047 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: andi a3, a0, 2047 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a6, .LBB74_2 +; RV32-NEXT: bnez a3, .LBB74_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -2220,10 +2220,10 @@ define i64 @bit_11_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_16_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_16_1_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: slli a6, a0, 16 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: slli a3, a0, 16 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a6, .LBB75_2 +; RV32-NEXT: beqz a3, .LBB75_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -2276,10 +2276,10 @@ define i64 @bit_16_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_20_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_20_1_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: slli a6, a0, 12 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: slli a3, a0, 12 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a6, .LBB77_2 +; RV32-NEXT: beqz a3, .LBB77_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -2304,10 +2304,10 @@ define i64 @bit_20_1_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_20_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_20_1_nz_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: slli a6, a0, 12 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: slli a3, a0, 12 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a6, .LBB78_2 +; RV32-NEXT: bnez a3, .LBB78_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -2332,10 +2332,10 @@ define i64 @bit_20_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_31_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_31_1_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: slli a6, a0, 1 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: slli a3, a0, 1 ; 
RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a6, .LBB79_2 +; RV32-NEXT: beqz a3, .LBB79_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -2360,10 +2360,10 @@ define i64 @bit_31_1_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_31_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_31_1_nz_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: slli a6, a0, 1 ; RV32-NEXT: mv a1, a3 +; RV32-NEXT: slli a3, a0, 1 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a6, .LBB80_2 +; RV32-NEXT: bnez a3, .LBB80_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 ; RV32-NEXT: mv a1, a5 @@ -2444,14 +2444,14 @@ define i64 @bit_55_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32: # %bb.0: ; RV32-NEXT: slli a1, a1, 9 ; RV32-NEXT: srli a1, a1, 9 -; RV32-NEXT: or a6, a0, a1 -; RV32-NEXT: mv a1, a3 +; RV32-NEXT: or a1, a0, a1 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a6, .LBB83_2 +; RV32-NEXT: beqz a1, .LBB83_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 -; RV32-NEXT: mv a1, a5 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB83_2: +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_55_1_z_select_i64: @@ -2474,14 +2474,14 @@ define i64 @bit_55_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32: # %bb.0: ; RV32-NEXT: slli a1, a1, 9 ; RV32-NEXT: srli a1, a1, 9 -; RV32-NEXT: or a6, a0, a1 -; RV32-NEXT: mv a1, a3 +; RV32-NEXT: or a1, a0, a1 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a6, .LBB84_2 +; RV32-NEXT: bnez a1, .LBB84_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 -; RV32-NEXT: mv a1, a5 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB84_2: +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_55_1_nz_select_i64: @@ -2504,14 +2504,14 @@ define i64 @bit_63_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: or a6, a0, a1 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: beqz a6, .LBB85_2 +; RV32I-NEXT: beqz a1, .LBB85_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB85_2: +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64-LABEL: bit_63_1_z_select_i64: @@ -2527,28 +2527,28 @@ define i64 @bit_63_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32ZBS-LABEL: bit_63_1_z_select_i64: ; RV32ZBS: # %bb.0: ; RV32ZBS-NEXT: bclri a1, a1, 31 -; RV32ZBS-NEXT: or a6, a0, a1 -; RV32ZBS-NEXT: mv a1, a3 +; RV32ZBS-NEXT: or a1, a0, a1 ; RV32ZBS-NEXT: mv a0, a2 -; RV32ZBS-NEXT: beqz a6, .LBB85_2 +; RV32ZBS-NEXT: beqz a1, .LBB85_2 ; RV32ZBS-NEXT: # %bb.1: ; RV32ZBS-NEXT: mv a0, a4 -; RV32ZBS-NEXT: mv a1, a5 +; RV32ZBS-NEXT: mv a3, a5 ; RV32ZBS-NEXT: .LBB85_2: +; RV32ZBS-NEXT: mv a1, a3 ; RV32ZBS-NEXT: ret ; ; RV32XTHEADBS-LABEL: bit_63_1_z_select_i64: ; RV32XTHEADBS: # %bb.0: ; RV32XTHEADBS-NEXT: slli a1, a1, 1 ; RV32XTHEADBS-NEXT: srli a1, a1, 1 -; RV32XTHEADBS-NEXT: or a6, a0, a1 -; RV32XTHEADBS-NEXT: mv a1, a3 +; RV32XTHEADBS-NEXT: or a1, a0, a1 ; RV32XTHEADBS-NEXT: mv a0, a2 -; RV32XTHEADBS-NEXT: beqz a6, .LBB85_2 +; RV32XTHEADBS-NEXT: beqz a1, .LBB85_2 ; RV32XTHEADBS-NEXT: # %bb.1: ; RV32XTHEADBS-NEXT: mv a0, a4 -; RV32XTHEADBS-NEXT: mv a1, a5 +; RV32XTHEADBS-NEXT: mv a3, a5 ; RV32XTHEADBS-NEXT: .LBB85_2: +; RV32XTHEADBS-NEXT: mv a1, a3 ; RV32XTHEADBS-NEXT: ret %1 = and i64 %a, 9223372036854775807 %2 = icmp eq i64 %1, 0 @@ -2561,14 +2561,14 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: or a6, a0, a1 -; RV32I-NEXT: mv a1, a3 
+; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a6, .LBB86_2 +; RV32I-NEXT: bnez a1, .LBB86_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB86_2: +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64-LABEL: bit_63_1_nz_select_i64: @@ -2584,28 +2584,28 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32ZBS-LABEL: bit_63_1_nz_select_i64: ; RV32ZBS: # %bb.0: ; RV32ZBS-NEXT: bclri a1, a1, 31 -; RV32ZBS-NEXT: or a6, a0, a1 -; RV32ZBS-NEXT: mv a1, a3 +; RV32ZBS-NEXT: or a1, a0, a1 ; RV32ZBS-NEXT: mv a0, a2 -; RV32ZBS-NEXT: bnez a6, .LBB86_2 +; RV32ZBS-NEXT: bnez a1, .LBB86_2 ; RV32ZBS-NEXT: # %bb.1: ; RV32ZBS-NEXT: mv a0, a4 -; RV32ZBS-NEXT: mv a1, a5 +; RV32ZBS-NEXT: mv a3, a5 ; RV32ZBS-NEXT: .LBB86_2: +; RV32ZBS-NEXT: mv a1, a3 ; RV32ZBS-NEXT: ret ; ; RV32XTHEADBS-LABEL: bit_63_1_nz_select_i64: ; RV32XTHEADBS: # %bb.0: ; RV32XTHEADBS-NEXT: slli a1, a1, 1 ; RV32XTHEADBS-NEXT: srli a1, a1, 1 -; RV32XTHEADBS-NEXT: or a6, a0, a1 -; RV32XTHEADBS-NEXT: mv a1, a3 +; RV32XTHEADBS-NEXT: or a1, a0, a1 ; RV32XTHEADBS-NEXT: mv a0, a2 -; RV32XTHEADBS-NEXT: bnez a6, .LBB86_2 +; RV32XTHEADBS-NEXT: bnez a1, .LBB86_2 ; RV32XTHEADBS-NEXT: # %bb.1: ; RV32XTHEADBS-NEXT: mv a0, a4 -; RV32XTHEADBS-NEXT: mv a1, a5 +; RV32XTHEADBS-NEXT: mv a3, a5 ; RV32XTHEADBS-NEXT: .LBB86_2: +; RV32XTHEADBS-NEXT: mv a1, a3 ; RV32XTHEADBS-NEXT: ret %1 = and i64 %a, 9223372036854775807 %2 = icmp ne i64 %1, 0 @@ -2616,14 +2616,14 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_64_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_64_1_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: or a6, a0, a1 -; RV32-NEXT: mv a1, a3 +; RV32-NEXT: or a1, a0, a1 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a6, .LBB87_2 +; RV32-NEXT: beqz a1, .LBB87_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 -; RV32-NEXT: mv a1, a5 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB87_2: +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_64_1_z_select_i64: @@ -2643,14 +2643,14 @@ define i64 @bit_64_1_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_64_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_64_1_nz_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: or a6, a0, a1 -; RV32-NEXT: mv a1, a3 +; RV32-NEXT: or a1, a0, a1 ; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a6, .LBB88_2 +; RV32-NEXT: bnez a1, .LBB88_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 -; RV32-NEXT: mv a1, a5 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB88_2: +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_64_1_nz_select_i64: diff --git a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll index 9c69fe0a6e486..40a5772142345 100644 --- a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll +++ b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll @@ -57,11 +57,11 @@ define i32 @test_bswap_i32(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: addi a2, a2, -256 ; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: slli a2, a2, 8 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a2 @@ -72,11 +72,11 @@ define i32 @test_bswap_i32(i32 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: srliw a3, a0, 24 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: 
srliw a3, a0, 24 -; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: and a2, a0, a2 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: slli a2, a2, 8 ; RV64I-NEXT: slliw a0, a0, 24 ; RV64I-NEXT: or a0, a0, a2 @@ -102,53 +102,52 @@ define i64 @test_bswap_i64(i64 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a1, 8 ; RV32I-NEXT: lui a3, 16 +; RV32I-NEXT: srli a4, a1, 24 +; RV32I-NEXT: srli a5, a0, 8 ; RV32I-NEXT: addi a3, a3, -256 ; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: srli a4, a1, 24 ; RV32I-NEXT: or a2, a2, a4 -; RV32I-NEXT: and a4, a1, a3 -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: or a2, a1, a2 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: and a1, a1, a3 ; RV32I-NEXT: srli a4, a0, 24 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a5, a1, 24 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a5, a1 ; RV32I-NEXT: and a3, a0, a3 -; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a0, a3 +; RV32I-NEXT: or a0, a1, a2 +; RV32I-NEXT: or a1, a3, a4 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_bswap_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a0, 40 ; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: srli a3, a0, 56 +; RV64I-NEXT: srli a4, a0, 24 +; RV64I-NEXT: lui a5, 4080 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: srli a3, a0, 24 -; RV64I-NEXT: lui a4, 4080 -; RV64I-NEXT: and a3, a3, a4 -; RV64I-NEXT: srli a5, a0, 8 -; RV64I-NEXT: srliw a5, a5, 24 -; RV64I-NEXT: slli a5, a5, 24 -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: and a4, a0, a4 -; RV64I-NEXT: slli a4, a4, 24 -; RV64I-NEXT: srliw a3, a0, 24 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: srli a3, a0, 8 +; RV64I-NEXT: and a4, a4, a5 +; RV64I-NEXT: srliw a3, a3, 24 +; RV64I-NEXT: slli a3, a3, 24 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: srliw a4, a0, 24 +; RV64I-NEXT: and a5, a0, a5 ; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: slli a2, a2, 40 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a5, a5, 24 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a2, a2, 40 +; RV64I-NEXT: or a1, a3, a1 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; @@ -171,18 +170,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; RV32I-LABEL: test_bitreverse_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: andi a1, a0, 15 -; RV32I-NEXT: slli a1, a1, 4 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: slli a1, a1, 4 ; RV32I-NEXT: srli a0, a0, 28 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: andi a1, a0, 51 -; RV32I-NEXT: slli a1, a1, 2 ; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: slli a1, a1, 2 ; RV32I-NEXT: andi a0, a0, 51 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: andi a1, a0, 85 -; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: andi a0, a0, 85 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret @@ -190,18 +189,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; RV64I-LABEL: test_bitreverse_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: andi a1, a0, 15 -; RV64I-NEXT: slli a1, a1, 4 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: slli a1, a1, 4 ; RV64I-NEXT: srli a0, a0, 60 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: 
andi a1, a0, 51 -; RV64I-NEXT: slli a1, a1, 2 ; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: slli a1, a1, 2 ; RV64I-NEXT: andi a0, a0, 51 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: andi a1, a0, 85 -; RV64I-NEXT: slli a1, a1, 1 ; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: slli a1, a1, 1 ; RV64I-NEXT: andi a0, a0, 85 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -209,18 +208,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; RV32ZBB-LABEL: test_bitreverse_i8: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: andi a1, a0, 15 -; RV32ZBB-NEXT: slli a1, a1, 4 ; RV32ZBB-NEXT: slli a0, a0, 24 +; RV32ZBB-NEXT: slli a1, a1, 4 ; RV32ZBB-NEXT: srli a0, a0, 28 ; RV32ZBB-NEXT: or a0, a0, a1 ; RV32ZBB-NEXT: andi a1, a0, 51 -; RV32ZBB-NEXT: slli a1, a1, 2 ; RV32ZBB-NEXT: srli a0, a0, 2 +; RV32ZBB-NEXT: slli a1, a1, 2 ; RV32ZBB-NEXT: andi a0, a0, 51 ; RV32ZBB-NEXT: or a0, a0, a1 ; RV32ZBB-NEXT: andi a1, a0, 85 -; RV32ZBB-NEXT: slli a1, a1, 1 ; RV32ZBB-NEXT: srli a0, a0, 1 +; RV32ZBB-NEXT: slli a1, a1, 1 ; RV32ZBB-NEXT: andi a0, a0, 85 ; RV32ZBB-NEXT: or a0, a0, a1 ; RV32ZBB-NEXT: ret @@ -228,18 +227,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; RV64ZBB-LABEL: test_bitreverse_i8: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: andi a1, a0, 15 -; RV64ZBB-NEXT: slli a1, a1, 4 ; RV64ZBB-NEXT: slli a0, a0, 56 +; RV64ZBB-NEXT: slli a1, a1, 4 ; RV64ZBB-NEXT: srli a0, a0, 60 ; RV64ZBB-NEXT: or a0, a0, a1 ; RV64ZBB-NEXT: andi a1, a0, 51 -; RV64ZBB-NEXT: slli a1, a1, 2 ; RV64ZBB-NEXT: srli a0, a0, 2 +; RV64ZBB-NEXT: slli a1, a1, 2 ; RV64ZBB-NEXT: andi a0, a0, 51 ; RV64ZBB-NEXT: or a0, a0, a1 ; RV64ZBB-NEXT: andi a1, a0, 85 -; RV64ZBB-NEXT: slli a1, a1, 1 ; RV64ZBB-NEXT: srli a0, a0, 1 +; RV64ZBB-NEXT: slli a1, a1, 1 ; RV64ZBB-NEXT: andi a0, a0, 85 ; RV64ZBB-NEXT: or a0, a0, a1 ; RV64ZBB-NEXT: ret @@ -266,27 +265,27 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 8 ; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: lui a2, 1 ; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: addi a2, a2, -241 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: lui a2, 1 -; RV32I-NEXT: addi a2, a2, -241 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 3 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: slli a0, a0, 4 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: lui a2, 3 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 5 +; RV32I-NEXT: addi a2, a2, 1365 ; RV32I-NEXT: slli a0, a0, 2 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 5 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 1 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret @@ -295,27 +294,27 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: lui a2, 1 ; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: addiw a2, a2, -241 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: lui a2, 1 -; RV64I-NEXT: addiw a2, a2, -241 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 3 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: slli a0, a0, 4 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: lui a2, 3 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, 
a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 5 +; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 5 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 1 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret @@ -323,25 +322,25 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; RV32ZBB-LABEL: test_bitreverse_i16: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: rev8 a0, a0 -; RV32ZBB-NEXT: srli a1, a0, 12 -; RV32ZBB-NEXT: lui a2, 15 -; RV32ZBB-NEXT: addi a2, a2, 240 -; RV32ZBB-NEXT: and a1, a1, a2 +; RV32ZBB-NEXT: lui a1, 15 +; RV32ZBB-NEXT: srli a2, a0, 12 +; RV32ZBB-NEXT: addi a1, a1, 240 +; RV32ZBB-NEXT: and a1, a2, a1 +; RV32ZBB-NEXT: lui a2, 3 ; RV32ZBB-NEXT: srli a0, a0, 20 +; RV32ZBB-NEXT: addi a2, a2, 819 ; RV32ZBB-NEXT: andi a0, a0, -241 ; RV32ZBB-NEXT: or a0, a0, a1 ; RV32ZBB-NEXT: srli a1, a0, 2 -; RV32ZBB-NEXT: lui a2, 3 -; RV32ZBB-NEXT: addi a2, a2, 819 -; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a1, a1, a2 +; RV32ZBB-NEXT: lui a2, 5 +; RV32ZBB-NEXT: addi a2, a2, 1365 ; RV32ZBB-NEXT: slli a0, a0, 2 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: srli a1, a0, 1 -; RV32ZBB-NEXT: lui a2, 5 -; RV32ZBB-NEXT: addi a2, a2, 1365 -; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: slli a0, a0, 1 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: ret @@ -349,25 +348,25 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; RV64ZBB-LABEL: test_bitreverse_i16: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: rev8 a0, a0 -; RV64ZBB-NEXT: srli a1, a0, 44 -; RV64ZBB-NEXT: lui a2, 15 -; RV64ZBB-NEXT: addiw a2, a2, 240 -; RV64ZBB-NEXT: and a1, a1, a2 +; RV64ZBB-NEXT: lui a1, 15 +; RV64ZBB-NEXT: srli a2, a0, 44 +; RV64ZBB-NEXT: addiw a1, a1, 240 +; RV64ZBB-NEXT: and a1, a2, a1 +; RV64ZBB-NEXT: lui a2, 3 ; RV64ZBB-NEXT: srli a0, a0, 52 +; RV64ZBB-NEXT: addiw a2, a2, 819 ; RV64ZBB-NEXT: andi a0, a0, -241 ; RV64ZBB-NEXT: or a0, a0, a1 ; RV64ZBB-NEXT: srli a1, a0, 2 -; RV64ZBB-NEXT: lui a2, 3 -; RV64ZBB-NEXT: addiw a2, a2, 819 -; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 +; RV64ZBB-NEXT: lui a2, 5 +; RV64ZBB-NEXT: addiw a2, a2, 1365 ; RV64ZBB-NEXT: slli a0, a0, 2 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 1 -; RV64ZBB-NEXT: lui a2, 5 -; RV64ZBB-NEXT: addiw a2, a2, 1365 -; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: slli a0, a0, 1 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret @@ -394,34 +393,34 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: addi a2, a2, -256 ; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: and a2, a0, a2 -; RV32I-NEXT: slli a2, a2, 8 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: lui a3, 61681 +; RV32I-NEXT: slli a2, a2, 8 ; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a3, a3, -241 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: and a1, a1, a3 +; 
RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: addi a3, a3, 1365 ; RV32I-NEXT: slli a0, a0, 4 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 2 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: and a1, a1, a3 ; RV32I-NEXT: slli a0, a0, 1 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret @@ -430,34 +429,34 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: srliw a3, a0, 24 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: srliw a3, a0, 24 -; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: slli a2, a2, 8 ; RV64I-NEXT: slliw a0, a0, 24 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: slli a2, a2, 8 ; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a3, a3, -241 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addiw a2, a2, -241 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: addiw a3, a3, 1365 ; RV64I-NEXT: slliw a0, a0, 4 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slliw a0, a0, 2 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: slliw a0, a0, 1 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret @@ -465,25 +464,25 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; RV32ZBB-LABEL: test_bitreverse_i32: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: rev8 a0, a0 -; RV32ZBB-NEXT: srli a1, a0, 4 -; RV32ZBB-NEXT: lui a2, 61681 -; RV32ZBB-NEXT: addi a2, a2, -241 -; RV32ZBB-NEXT: and a1, a1, a2 -; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: lui a1, 61681 +; RV32ZBB-NEXT: srli a2, a0, 4 +; RV32ZBB-NEXT: addi a1, a1, -241 +; RV32ZBB-NEXT: and a2, a2, a1 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: lui a1, 209715 +; RV32ZBB-NEXT: addi a1, a1, 819 ; RV32ZBB-NEXT: slli a0, a0, 4 -; RV32ZBB-NEXT: or a0, a1, a0 -; RV32ZBB-NEXT: srli a1, a0, 2 -; RV32ZBB-NEXT: lui a2, 209715 -; RV32ZBB-NEXT: addi a2, a2, 819 -; RV32ZBB-NEXT: and a1, a1, a2 -; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: or a0, a2, a0 +; RV32ZBB-NEXT: srli a2, a0, 2 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: and a1, a2, a1 +; RV32ZBB-NEXT: lui a2, 349525 +; RV32ZBB-NEXT: addi a2, a2, 1365 ; RV32ZBB-NEXT: slli a0, a0, 2 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: srli a1, a0, 1 -; RV32ZBB-NEXT: lui a2, 349525 -; RV32ZBB-NEXT: addi a2, a2, 1365 -; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: slli a0, a0, 1 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: ret @@ -491,28 +490,28 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; 
RV64ZBB-LABEL: test_bitreverse_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: rev8 a0, a0 -; RV64ZBB-NEXT: srli a1, a0, 36 -; RV64ZBB-NEXT: lui a2, 61681 -; RV64ZBB-NEXT: addiw a2, a2, -241 -; RV64ZBB-NEXT: and a1, a1, a2 -; RV64ZBB-NEXT: srli a0, a0, 28 +; RV64ZBB-NEXT: lui a1, 61681 +; RV64ZBB-NEXT: srli a2, a0, 36 +; RV64ZBB-NEXT: addiw a1, a1, -241 +; RV64ZBB-NEXT: and a1, a2, a1 ; RV64ZBB-NEXT: lui a2, 986895 +; RV64ZBB-NEXT: srli a0, a0, 28 ; RV64ZBB-NEXT: addi a2, a2, 240 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: lui a2, 209715 +; RV64ZBB-NEXT: addiw a2, a2, 819 ; RV64ZBB-NEXT: sext.w a0, a0 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 2 -; RV64ZBB-NEXT: lui a2, 209715 -; RV64ZBB-NEXT: addiw a2, a2, 819 -; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 +; RV64ZBB-NEXT: lui a2, 349525 +; RV64ZBB-NEXT: addiw a2, a2, 1365 ; RV64ZBB-NEXT: slliw a0, a0, 2 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 1 -; RV64ZBB-NEXT: lui a2, 349525 -; RV64ZBB-NEXT: addiw a2, a2, 1365 -; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: slliw a0, a0, 1 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret @@ -538,115 +537,114 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a1, 8 ; RV32I-NEXT: lui a3, 16 +; RV32I-NEXT: srli a4, a1, 24 +; RV32I-NEXT: slli a5, a1, 24 +; RV32I-NEXT: lui a6, 61681 +; RV32I-NEXT: srli a7, a0, 8 ; RV32I-NEXT: addi a3, a3, -256 ; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: srli a4, a1, 24 ; RV32I-NEXT: or a2, a2, a4 -; RV32I-NEXT: and a4, a1, a3 -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: srli a4, a0, 24 +; RV32I-NEXT: and a7, a7, a3 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: lui a7, 209715 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: lui a5, 349525 +; RV32I-NEXT: and a3, a0, a3 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: addi a6, a6, -241 +; RV32I-NEXT: addi a7, a7, 819 +; RV32I-NEXT: addi a5, a5, 1365 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: srli a2, a1, 4 -; RV32I-NEXT: lui a4, 61681 -; RV32I-NEXT: addi a4, a4, -241 -; RV32I-NEXT: and a2, a2, a4 -; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a1, a1, a6 +; RV32I-NEXT: srli a3, a0, 4 +; RV32I-NEXT: and a0, a0, a6 +; RV32I-NEXT: and a2, a2, a6 ; RV32I-NEXT: slli a1, a1, 4 +; RV32I-NEXT: and a3, a3, a6 +; RV32I-NEXT: slli a0, a0, 4 ; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a0, a3, a0 ; RV32I-NEXT: srli a2, a1, 2 -; RV32I-NEXT: lui a5, 209715 -; RV32I-NEXT: addi a5, a5, 819 -; RV32I-NEXT: and a2, a2, a5 -; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: and a1, a1, a7 +; RV32I-NEXT: srli a3, a0, 2 +; RV32I-NEXT: and a0, a0, a7 +; RV32I-NEXT: and a2, a2, a7 ; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: and a3, a3, a7 +; RV32I-NEXT: slli a0, a0, 2 ; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a0, a3, a0 ; RV32I-NEXT: srli a2, a1, 1 -; RV32I-NEXT: lui a6, 349525 -; RV32I-NEXT: addi a6, a6, 1365 -; RV32I-NEXT: and a2, a2, a6 -; RV32I-NEXT: and a1, a1, a6 -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: or a2, a2, a1 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: and a1, a1, a3 -; RV32I-NEXT: srli a7, a0, 24 -; RV32I-NEXT: or a1, a1, a7 -; RV32I-NEXT: and a3, a0, a3 -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a3 -; 
RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: slli a0, a0, 4 -; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: srli a3, a0, 1 ; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: slli a0, a0, 2 -; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, a6 -; RV32I-NEXT: and a0, a0, a6 -; RV32I-NEXT: slli a0, a0, 1 -; RV32I-NEXT: or a1, a1, a0 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: and a2, a2, a5 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: and a3, a3, a5 +; RV32I-NEXT: slli a4, a0, 1 +; RV32I-NEXT: or a0, a2, a1 +; RV32I-NEXT: or a1, a3, a4 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_bitreverse_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a0, 40 ; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: srli a3, a0, 56 +; RV64I-NEXT: srli a4, a0, 24 +; RV64I-NEXT: lui a5, 4080 +; RV64I-NEXT: srli a6, a0, 8 +; RV64I-NEXT: srliw a7, a0, 24 +; RV64I-NEXT: lui t0, 61681 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: srli a3, a0, 24 -; RV64I-NEXT: lui a4, 4080 -; RV64I-NEXT: and a3, a3, a4 -; RV64I-NEXT: srli a5, a0, 8 -; RV64I-NEXT: srliw a5, a5, 24 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: and a4, a4, a5 +; RV64I-NEXT: srliw a6, a6, 24 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: lui a6, 349525 +; RV64I-NEXT: and a5, a0, a5 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: addiw t0, t0, -241 +; RV64I-NEXT: addiw a3, a3, 819 +; RV64I-NEXT: addiw a6, a6, 1365 ; RV64I-NEXT: slli a5, a5, 24 -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: and a4, a0, a4 -; RV64I-NEXT: slli a4, a4, 24 -; RV64I-NEXT: srliw a3, a0, 24 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a5, a5, a7 +; RV64I-NEXT: slli a7, t0, 32 +; RV64I-NEXT: add a7, t0, a7 +; RV64I-NEXT: slli t0, a3, 32 +; RV64I-NEXT: add a3, a3, t0 +; RV64I-NEXT: slli t0, a6, 32 +; RV64I-NEXT: add a6, a6, t0 +; RV64I-NEXT: or a1, a4, a1 ; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: slli a2, a2, 40 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: slli a2, a2, 40 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addiw a2, a2, -241 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a0, a0, a7 +; RV64I-NEXT: and a1, a1, a7 ; RV64I-NEXT: slli a0, a0, 4 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a0, a0, a6 +; RV64I-NEXT: and a1, a1, a6 ; RV64I-NEXT: slli a0, a0, 1 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret @@ -654,74 +652,73 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind { ; RV32ZBB-LABEL: test_bitreverse_i64: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: rev8 a1, a1 -; RV32ZBB-NEXT: srli a2, a1, 4 -; 
RV32ZBB-NEXT: lui a3, 61681 -; RV32ZBB-NEXT: addi a3, a3, -241 -; RV32ZBB-NEXT: and a2, a2, a3 -; RV32ZBB-NEXT: and a1, a1, a3 -; RV32ZBB-NEXT: slli a1, a1, 4 -; RV32ZBB-NEXT: or a1, a2, a1 -; RV32ZBB-NEXT: srli a2, a1, 2 -; RV32ZBB-NEXT: lui a4, 209715 -; RV32ZBB-NEXT: addi a4, a4, 819 -; RV32ZBB-NEXT: and a2, a2, a4 -; RV32ZBB-NEXT: and a1, a1, a4 -; RV32ZBB-NEXT: slli a1, a1, 2 -; RV32ZBB-NEXT: or a1, a2, a1 -; RV32ZBB-NEXT: srli a2, a1, 1 -; RV32ZBB-NEXT: lui a5, 349525 -; RV32ZBB-NEXT: addi a5, a5, 1365 -; RV32ZBB-NEXT: and a2, a2, a5 -; RV32ZBB-NEXT: and a1, a1, a5 -; RV32ZBB-NEXT: slli a1, a1, 1 -; RV32ZBB-NEXT: or a2, a2, a1 +; RV32ZBB-NEXT: lui a2, 61681 +; RV32ZBB-NEXT: lui a3, 209715 ; RV32ZBB-NEXT: rev8 a0, a0 -; RV32ZBB-NEXT: srli a1, a0, 4 +; RV32ZBB-NEXT: srli a4, a1, 4 +; RV32ZBB-NEXT: addi a2, a2, -241 +; RV32ZBB-NEXT: srli a5, a0, 4 +; RV32ZBB-NEXT: and a4, a4, a2 +; RV32ZBB-NEXT: and a1, a1, a2 +; RV32ZBB-NEXT: and a5, a5, a2 +; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: lui a2, 349525 +; RV32ZBB-NEXT: addi a3, a3, 819 +; RV32ZBB-NEXT: addi a2, a2, 1365 +; RV32ZBB-NEXT: slli a1, a1, 4 +; RV32ZBB-NEXT: slli a0, a0, 4 +; RV32ZBB-NEXT: or a1, a4, a1 +; RV32ZBB-NEXT: or a0, a5, a0 +; RV32ZBB-NEXT: srli a4, a1, 2 ; RV32ZBB-NEXT: and a1, a1, a3 +; RV32ZBB-NEXT: srli a5, a0, 2 ; RV32ZBB-NEXT: and a0, a0, a3 -; RV32ZBB-NEXT: slli a0, a0, 4 -; RV32ZBB-NEXT: or a0, a1, a0 -; RV32ZBB-NEXT: srli a1, a0, 2 -; RV32ZBB-NEXT: and a1, a1, a4 -; RV32ZBB-NEXT: and a0, a0, a4 +; RV32ZBB-NEXT: and a4, a4, a3 +; RV32ZBB-NEXT: slli a1, a1, 2 +; RV32ZBB-NEXT: and a3, a5, a3 ; RV32ZBB-NEXT: slli a0, a0, 2 -; RV32ZBB-NEXT: or a0, a1, a0 -; RV32ZBB-NEXT: srli a1, a0, 1 -; RV32ZBB-NEXT: and a1, a1, a5 -; RV32ZBB-NEXT: and a0, a0, a5 -; RV32ZBB-NEXT: slli a0, a0, 1 -; RV32ZBB-NEXT: or a1, a1, a0 -; RV32ZBB-NEXT: mv a0, a2 +; RV32ZBB-NEXT: or a1, a4, a1 +; RV32ZBB-NEXT: or a0, a3, a0 +; RV32ZBB-NEXT: srli a3, a1, 1 +; RV32ZBB-NEXT: and a1, a1, a2 +; RV32ZBB-NEXT: srli a4, a0, 1 +; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a3, a3, a2 +; RV32ZBB-NEXT: slli a1, a1, 1 +; RV32ZBB-NEXT: and a2, a4, a2 +; RV32ZBB-NEXT: slli a4, a0, 1 +; RV32ZBB-NEXT: or a0, a3, a1 +; RV32ZBB-NEXT: or a1, a2, a4 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: test_bitreverse_i64: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: rev8 a0, a0 -; RV64ZBB-NEXT: srli a1, a0, 4 -; RV64ZBB-NEXT: lui a2, 61681 -; RV64ZBB-NEXT: addiw a2, a2, -241 -; RV64ZBB-NEXT: slli a3, a2, 32 -; RV64ZBB-NEXT: add a2, a2, a3 -; RV64ZBB-NEXT: and a1, a1, a2 -; RV64ZBB-NEXT: and a0, a0, a2 -; RV64ZBB-NEXT: slli a0, a0, 4 -; RV64ZBB-NEXT: or a0, a1, a0 -; RV64ZBB-NEXT: srli a1, a0, 2 +; RV64ZBB-NEXT: lui a1, 61681 ; RV64ZBB-NEXT: lui a2, 209715 +; RV64ZBB-NEXT: lui a3, 349525 +; RV64ZBB-NEXT: addiw a1, a1, -241 ; RV64ZBB-NEXT: addiw a2, a2, 819 -; RV64ZBB-NEXT: slli a3, a2, 32 -; RV64ZBB-NEXT: add a2, a2, a3 -; RV64ZBB-NEXT: and a1, a1, a2 +; RV64ZBB-NEXT: addiw a3, a3, 1365 +; RV64ZBB-NEXT: slli a4, a1, 32 +; RV64ZBB-NEXT: add a1, a1, a4 +; RV64ZBB-NEXT: slli a4, a2, 32 +; RV64ZBB-NEXT: add a2, a2, a4 +; RV64ZBB-NEXT: slli a4, a3, 32 +; RV64ZBB-NEXT: add a3, a3, a4 +; RV64ZBB-NEXT: srli a4, a0, 4 +; RV64ZBB-NEXT: and a4, a4, a1 +; RV64ZBB-NEXT: and a0, a0, a1 +; RV64ZBB-NEXT: slli a0, a0, 4 +; RV64ZBB-NEXT: or a0, a4, a0 +; RV64ZBB-NEXT: srli a1, a0, 2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: slli a0, a0, 2 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 1 -; RV64ZBB-NEXT: lui a2, 349525 -; RV64ZBB-NEXT: addiw a2, 
a2, 1365 -; RV64ZBB-NEXT: slli a3, a2, 32 -; RV64ZBB-NEXT: add a2, a2, a3 -; RV64ZBB-NEXT: and a1, a1, a2 -; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a0, a0, a3 +; RV64ZBB-NEXT: and a1, a1, a3 ; RV64ZBB-NEXT: slli a0, a0, 1 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret @@ -729,10 +726,9 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind { ; RV32ZBKB-LABEL: test_bitreverse_i64: ; RV32ZBKB: # %bb.0: ; RV32ZBKB-NEXT: rev8 a1, a1 -; RV32ZBKB-NEXT: brev8 a2, a1 -; RV32ZBKB-NEXT: rev8 a0, a0 -; RV32ZBKB-NEXT: brev8 a1, a0 -; RV32ZBKB-NEXT: mv a0, a2 +; RV32ZBKB-NEXT: rev8 a2, a0 +; RV32ZBKB-NEXT: brev8 a0, a1 +; RV32ZBKB-NEXT: brev8 a1, a2 ; RV32ZBKB-NEXT: ret ; ; RV64ZBKB-LABEL: test_bitreverse_i64: @@ -752,20 +748,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind { ; RV32I-NEXT: addi a2, a2, -241 ; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 3 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: slli a0, a0, 4 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: lui a2, 3 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 5 +; RV32I-NEXT: addi a2, a2, 1365 ; RV32I-NEXT: slli a0, a0, 2 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 5 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 1 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret @@ -777,20 +773,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind { ; RV64I-NEXT: addiw a2, a2, -241 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 3 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: slli a0, a0, 4 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: lui a2, 3 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 5 +; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 5 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 1 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret @@ -802,20 +798,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind { ; RV32ZBB-NEXT: addi a2, a2, -241 ; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: lui a2, 3 +; RV32ZBB-NEXT: addi a2, a2, 819 ; RV32ZBB-NEXT: slli a0, a0, 4 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: srli a1, a0, 2 -; RV32ZBB-NEXT: lui a2, 3 -; RV32ZBB-NEXT: addi a2, a2, 819 -; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a1, a1, a2 +; RV32ZBB-NEXT: lui a2, 5 +; RV32ZBB-NEXT: addi a2, a2, 1365 ; RV32ZBB-NEXT: slli a0, a0, 2 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: srli a1, a0, 1 -; RV32ZBB-NEXT: lui a2, 5 -; RV32ZBB-NEXT: addi a2, a2, 1365 -; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: slli a0, a0, 1 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: ret @@ -827,20 +823,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind { ; RV64ZBB-NEXT: addiw a2, a2, -241 ; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: lui a2, 3 +; RV64ZBB-NEXT: addiw a2, a2, 819 ; RV64ZBB-NEXT: slli a0, a0, 4 ; RV64ZBB-NEXT: or a0, a1, a0 ; 
RV64ZBB-NEXT: srli a1, a0, 2 -; RV64ZBB-NEXT: lui a2, 3 -; RV64ZBB-NEXT: addiw a2, a2, 819 -; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 +; RV64ZBB-NEXT: lui a2, 5 +; RV64ZBB-NEXT: addiw a2, a2, 1365 ; RV64ZBB-NEXT: slli a0, a0, 2 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 1 -; RV64ZBB-NEXT: lui a2, 5 -; RV64ZBB-NEXT: addiw a2, a2, 1365 -; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: slli a0, a0, 1 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret @@ -867,20 +863,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind { ; RV32I-NEXT: addi a2, a2, -241 ; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: slli a0, a0, 4 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a2, a2, 1365 ; RV32I-NEXT: slli a0, a0, 2 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 1 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret @@ -892,20 +888,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind { ; RV64I-NEXT: addiw a2, a2, -241 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: slliw a0, a0, 4 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 349525 +; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: slliw a0, a0, 2 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slliw a0, a0, 1 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret @@ -917,20 +913,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind { ; RV32ZBB-NEXT: addi a2, a2, -241 ; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: lui a2, 209715 +; RV32ZBB-NEXT: addi a2, a2, 819 ; RV32ZBB-NEXT: slli a0, a0, 4 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: srli a1, a0, 2 -; RV32ZBB-NEXT: lui a2, 209715 -; RV32ZBB-NEXT: addi a2, a2, 819 -; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a1, a1, a2 +; RV32ZBB-NEXT: lui a2, 349525 +; RV32ZBB-NEXT: addi a2, a2, 1365 ; RV32ZBB-NEXT: slli a0, a0, 2 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: srli a1, a0, 1 -; RV32ZBB-NEXT: lui a2, 349525 -; RV32ZBB-NEXT: addi a2, a2, 1365 -; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: slli a0, a0, 1 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: ret @@ -942,20 +938,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind { ; RV64ZBB-NEXT: addiw a2, a2, -241 ; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: lui a2, 209715 +; RV64ZBB-NEXT: addiw a2, a2, 819 ; RV64ZBB-NEXT: slliw a0, a0, 4 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 2 -; RV64ZBB-NEXT: lui a2, 209715 -; 
RV64ZBB-NEXT: addiw a2, a2, 819 -; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 +; RV64ZBB-NEXT: lui a2, 349525 +; RV64ZBB-NEXT: addiw a2, a2, 1365 ; RV64ZBB-NEXT: slliw a0, a0, 2 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 1 -; RV64ZBB-NEXT: lui a2, 349525 -; RV64ZBB-NEXT: addiw a2, a2, 1365 -; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: slliw a0, a0, 1 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret @@ -979,69 +975,69 @@ define i64 @test_bswap_bitreverse_i64(i64 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 4 ; RV32I-NEXT: lui a3, 61681 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: srli a5, a1, 4 ; RV32I-NEXT: addi a3, a3, -241 ; RV32I-NEXT: and a2, a2, a3 ; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: addi a3, a3, 1365 ; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: slli a1, a1, 4 ; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a5, a1 ; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: srli a5, a1, 2 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: and a4, a5, a4 +; RV32I-NEXT: slli a1, a1, 2 ; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a4, a1 ; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: lui a5, 349525 -; RV32I-NEXT: addi a5, a5, 1365 -; RV32I-NEXT: and a2, a2, a5 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: slli a0, a0, 1 -; RV32I-NEXT: or a0, a2, a0 -; RV32I-NEXT: srli a2, a1, 4 -; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: srli a4, a1, 1 ; RV32I-NEXT: and a1, a1, a3 -; RV32I-NEXT: slli a1, a1, 4 -; RV32I-NEXT: or a1, a2, a1 -; RV32I-NEXT: srli a2, a1, 2 -; RV32I-NEXT: and a2, a2, a4 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: or a1, a2, a1 -; RV32I-NEXT: srli a2, a1, 1 -; RV32I-NEXT: and a2, a2, a5 -; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: and a3, a4, a3 ; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a3, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_bswap_bitreverse_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addiw a2, a2, -241 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a0, a0, 4 -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: srli a4, a0, 4 +; RV64I-NEXT: and a4, a4, a1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: or a0, a4, a0 +; RV64I-NEXT: srli a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 
349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: slli a0, a0, 1 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret @@ -1050,69 +1046,69 @@ define i64 @test_bswap_bitreverse_i64(i64 %a) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: srli a2, a0, 4 ; RV32ZBB-NEXT: lui a3, 61681 +; RV32ZBB-NEXT: lui a4, 209715 +; RV32ZBB-NEXT: srli a5, a1, 4 ; RV32ZBB-NEXT: addi a3, a3, -241 ; RV32ZBB-NEXT: and a2, a2, a3 ; RV32ZBB-NEXT: and a0, a0, a3 +; RV32ZBB-NEXT: and a5, a5, a3 +; RV32ZBB-NEXT: and a1, a1, a3 +; RV32ZBB-NEXT: lui a3, 349525 +; RV32ZBB-NEXT: addi a4, a4, 819 +; RV32ZBB-NEXT: addi a3, a3, 1365 ; RV32ZBB-NEXT: slli a0, a0, 4 +; RV32ZBB-NEXT: slli a1, a1, 4 ; RV32ZBB-NEXT: or a0, a2, a0 +; RV32ZBB-NEXT: or a1, a5, a1 ; RV32ZBB-NEXT: srli a2, a0, 2 -; RV32ZBB-NEXT: lui a4, 209715 -; RV32ZBB-NEXT: addi a4, a4, 819 -; RV32ZBB-NEXT: and a2, a2, a4 ; RV32ZBB-NEXT: and a0, a0, a4 +; RV32ZBB-NEXT: srli a5, a1, 2 +; RV32ZBB-NEXT: and a1, a1, a4 +; RV32ZBB-NEXT: and a2, a2, a4 ; RV32ZBB-NEXT: slli a0, a0, 2 +; RV32ZBB-NEXT: and a4, a5, a4 +; RV32ZBB-NEXT: slli a1, a1, 2 ; RV32ZBB-NEXT: or a0, a2, a0 +; RV32ZBB-NEXT: or a1, a4, a1 ; RV32ZBB-NEXT: srli a2, a0, 1 -; RV32ZBB-NEXT: lui a5, 349525 -; RV32ZBB-NEXT: addi a5, a5, 1365 -; RV32ZBB-NEXT: and a2, a2, a5 -; RV32ZBB-NEXT: and a0, a0, a5 -; RV32ZBB-NEXT: slli a0, a0, 1 -; RV32ZBB-NEXT: or a0, a2, a0 -; RV32ZBB-NEXT: srli a2, a1, 4 -; RV32ZBB-NEXT: and a2, a2, a3 +; RV32ZBB-NEXT: and a0, a0, a3 +; RV32ZBB-NEXT: srli a4, a1, 1 ; RV32ZBB-NEXT: and a1, a1, a3 -; RV32ZBB-NEXT: slli a1, a1, 4 -; RV32ZBB-NEXT: or a1, a2, a1 -; RV32ZBB-NEXT: srli a2, a1, 2 -; RV32ZBB-NEXT: and a2, a2, a4 -; RV32ZBB-NEXT: and a1, a1, a4 -; RV32ZBB-NEXT: slli a1, a1, 2 -; RV32ZBB-NEXT: or a1, a2, a1 -; RV32ZBB-NEXT: srli a2, a1, 1 -; RV32ZBB-NEXT: and a2, a2, a5 -; RV32ZBB-NEXT: and a1, a1, a5 +; RV32ZBB-NEXT: and a2, a2, a3 +; RV32ZBB-NEXT: slli a0, a0, 1 +; RV32ZBB-NEXT: and a3, a4, a3 ; RV32ZBB-NEXT: slli a1, a1, 1 -; RV32ZBB-NEXT: or a1, a2, a1 +; RV32ZBB-NEXT: or a0, a2, a0 +; RV32ZBB-NEXT: or a1, a3, a1 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: test_bswap_bitreverse_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: srli a1, a0, 4 -; RV64ZBB-NEXT: lui a2, 61681 -; RV64ZBB-NEXT: addiw a2, a2, -241 -; RV64ZBB-NEXT: slli a3, a2, 32 -; RV64ZBB-NEXT: add a2, a2, a3 -; RV64ZBB-NEXT: and a1, a1, a2 -; RV64ZBB-NEXT: and a0, a0, a2 -; RV64ZBB-NEXT: slli a0, a0, 4 -; RV64ZBB-NEXT: or a0, a1, a0 -; RV64ZBB-NEXT: srli a1, a0, 2 +; RV64ZBB-NEXT: lui a1, 61681 ; RV64ZBB-NEXT: lui a2, 209715 +; RV64ZBB-NEXT: lui a3, 349525 +; RV64ZBB-NEXT: addiw a1, a1, -241 ; RV64ZBB-NEXT: addiw a2, a2, 819 -; RV64ZBB-NEXT: slli a3, a2, 32 -; RV64ZBB-NEXT: add a2, a2, a3 -; RV64ZBB-NEXT: and a1, a1, a2 +; RV64ZBB-NEXT: addiw a3, a3, 1365 +; RV64ZBB-NEXT: slli a4, a1, 32 +; RV64ZBB-NEXT: add a1, a1, a4 +; RV64ZBB-NEXT: slli a4, a2, 32 +; RV64ZBB-NEXT: add a2, a2, a4 +; RV64ZBB-NEXT: slli a4, a3, 32 +; RV64ZBB-NEXT: add a3, a3, a4 +; RV64ZBB-NEXT: srli a4, a0, 4 +; RV64ZBB-NEXT: and a4, a4, a1 +; RV64ZBB-NEXT: and a0, a0, a1 +; RV64ZBB-NEXT: slli a0, a0, 4 +; RV64ZBB-NEXT: or a0, a4, a0 +; RV64ZBB-NEXT: srli a1, a0, 2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: slli a0, a0, 2 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 1 -; RV64ZBB-NEXT: lui a2, 349525 -; RV64ZBB-NEXT: addiw a2, a2, 
1365 -; RV64ZBB-NEXT: slli a3, a2, 32 -; RV64ZBB-NEXT: add a2, a2, a3 -; RV64ZBB-NEXT: and a1, a1, a2 -; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a0, a0, a3 +; RV64ZBB-NEXT: and a1, a1, a3 ; RV64ZBB-NEXT: slli a0, a0, 1 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret @@ -1140,20 +1136,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind { ; RV32I-NEXT: addi a2, a2, -241 ; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 3 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: slli a0, a0, 4 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: lui a2, 3 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 5 +; RV32I-NEXT: addi a2, a2, 1365 ; RV32I-NEXT: slli a0, a0, 2 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 5 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 1 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret @@ -1165,20 +1161,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind { ; RV64I-NEXT: addiw a2, a2, -241 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 3 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: slli a0, a0, 4 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: lui a2, 3 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 5 +; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 5 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 1 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret @@ -1190,20 +1186,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind { ; RV32ZBB-NEXT: addi a2, a2, -241 ; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: lui a2, 3 +; RV32ZBB-NEXT: addi a2, a2, 819 ; RV32ZBB-NEXT: slli a0, a0, 4 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: srli a1, a0, 2 -; RV32ZBB-NEXT: lui a2, 3 -; RV32ZBB-NEXT: addi a2, a2, 819 -; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a1, a1, a2 +; RV32ZBB-NEXT: lui a2, 5 +; RV32ZBB-NEXT: addi a2, a2, 1365 ; RV32ZBB-NEXT: slli a0, a0, 2 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: srli a1, a0, 1 -; RV32ZBB-NEXT: lui a2, 5 -; RV32ZBB-NEXT: addi a2, a2, 1365 -; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: slli a0, a0, 1 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: ret @@ -1215,20 +1211,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind { ; RV64ZBB-NEXT: addiw a2, a2, -241 ; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: lui a2, 3 +; RV64ZBB-NEXT: addiw a2, a2, 819 ; RV64ZBB-NEXT: slli a0, a0, 4 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 2 -; RV64ZBB-NEXT: lui a2, 3 -; RV64ZBB-NEXT: addiw a2, a2, 819 -; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 +; RV64ZBB-NEXT: lui a2, 5 +; RV64ZBB-NEXT: addiw a2, a2, 1365 ; RV64ZBB-NEXT: slli a0, a0, 2 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 1 -; RV64ZBB-NEXT: lui a2, 5 -; RV64ZBB-NEXT: addiw a2, a2, 1365 -; RV64ZBB-NEXT: and a1, a1, a2 ; 
RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: slli a0, a0, 1 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret @@ -1255,20 +1251,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind { ; RV32I-NEXT: addi a2, a2, -241 ; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: slli a0, a0, 4 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a2, a2, 1365 ; RV32I-NEXT: slli a0, a0, 2 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 1 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret @@ -1280,20 +1276,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind { ; RV64I-NEXT: addiw a2, a2, -241 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: slliw a0, a0, 4 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 349525 +; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: slliw a0, a0, 2 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slliw a0, a0, 1 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret @@ -1305,20 +1301,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind { ; RV32ZBB-NEXT: addi a2, a2, -241 ; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: lui a2, 209715 +; RV32ZBB-NEXT: addi a2, a2, 819 ; RV32ZBB-NEXT: slli a0, a0, 4 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: srli a1, a0, 2 -; RV32ZBB-NEXT: lui a2, 209715 -; RV32ZBB-NEXT: addi a2, a2, 819 -; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a1, a1, a2 +; RV32ZBB-NEXT: lui a2, 349525 +; RV32ZBB-NEXT: addi a2, a2, 1365 ; RV32ZBB-NEXT: slli a0, a0, 2 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: srli a1, a0, 1 -; RV32ZBB-NEXT: lui a2, 349525 -; RV32ZBB-NEXT: addi a2, a2, 1365 -; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: and a1, a1, a2 ; RV32ZBB-NEXT: slli a0, a0, 1 ; RV32ZBB-NEXT: or a0, a1, a0 ; RV32ZBB-NEXT: ret @@ -1330,20 +1326,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind { ; RV64ZBB-NEXT: addiw a2, a2, -241 ; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: lui a2, 209715 +; RV64ZBB-NEXT: addiw a2, a2, 819 ; RV64ZBB-NEXT: slliw a0, a0, 4 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 2 -; RV64ZBB-NEXT: lui a2, 209715 -; RV64ZBB-NEXT: addiw a2, a2, 819 -; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 +; RV64ZBB-NEXT: lui a2, 349525 +; RV64ZBB-NEXT: addiw a2, a2, 1365 ; RV64ZBB-NEXT: slliw a0, a0, 2 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 1 -; RV64ZBB-NEXT: lui a2, 349525 -; RV64ZBB-NEXT: addiw a2, a2, 1365 -; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and 
a1, a1, a2 ; RV64ZBB-NEXT: slliw a0, a0, 1 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret @@ -1367,69 +1363,69 @@ define i64 @test_bitreverse_bswap_i64(i64 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 4 ; RV32I-NEXT: lui a3, 61681 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: srli a5, a1, 4 ; RV32I-NEXT: addi a3, a3, -241 ; RV32I-NEXT: and a2, a2, a3 ; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: addi a3, a3, 1365 ; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: slli a1, a1, 4 ; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a5, a1 ; RV32I-NEXT: srli a2, a0, 2 -; RV32I-NEXT: lui a4, 209715 -; RV32I-NEXT: addi a4, a4, 819 -; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: srli a5, a1, 2 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: and a4, a5, a4 +; RV32I-NEXT: slli a1, a1, 2 ; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a4, a1 ; RV32I-NEXT: srli a2, a0, 1 -; RV32I-NEXT: lui a5, 349525 -; RV32I-NEXT: addi a5, a5, 1365 -; RV32I-NEXT: and a2, a2, a5 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: slli a0, a0, 1 -; RV32I-NEXT: or a0, a2, a0 -; RV32I-NEXT: srli a2, a1, 4 -; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: srli a4, a1, 1 ; RV32I-NEXT: and a1, a1, a3 -; RV32I-NEXT: slli a1, a1, 4 -; RV32I-NEXT: or a1, a2, a1 -; RV32I-NEXT: srli a2, a1, 2 -; RV32I-NEXT: and a2, a2, a4 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: or a1, a2, a1 -; RV32I-NEXT: srli a2, a1, 1 -; RV32I-NEXT: and a2, a2, a5 -; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: and a3, a4, a3 ; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a3, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_bitreverse_bswap_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addiw a2, a2, -241 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: slli a0, a0, 4 -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: srli a4, a0, 4 +; RV64I-NEXT: and a4, a4, a1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: or a0, a4, a0 +; RV64I-NEXT: srli a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: slli a0, a0, 1 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret @@ -1438,69 +1434,69 @@ define i64 @test_bitreverse_bswap_i64(i64 %a) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: srli a2, a0, 4 ; 
RV32ZBB-NEXT: lui a3, 61681 +; RV32ZBB-NEXT: lui a4, 209715 +; RV32ZBB-NEXT: srli a5, a1, 4 ; RV32ZBB-NEXT: addi a3, a3, -241 ; RV32ZBB-NEXT: and a2, a2, a3 ; RV32ZBB-NEXT: and a0, a0, a3 +; RV32ZBB-NEXT: and a5, a5, a3 +; RV32ZBB-NEXT: and a1, a1, a3 +; RV32ZBB-NEXT: lui a3, 349525 +; RV32ZBB-NEXT: addi a4, a4, 819 +; RV32ZBB-NEXT: addi a3, a3, 1365 ; RV32ZBB-NEXT: slli a0, a0, 4 +; RV32ZBB-NEXT: slli a1, a1, 4 ; RV32ZBB-NEXT: or a0, a2, a0 +; RV32ZBB-NEXT: or a1, a5, a1 ; RV32ZBB-NEXT: srli a2, a0, 2 -; RV32ZBB-NEXT: lui a4, 209715 -; RV32ZBB-NEXT: addi a4, a4, 819 -; RV32ZBB-NEXT: and a2, a2, a4 ; RV32ZBB-NEXT: and a0, a0, a4 +; RV32ZBB-NEXT: srli a5, a1, 2 +; RV32ZBB-NEXT: and a1, a1, a4 +; RV32ZBB-NEXT: and a2, a2, a4 ; RV32ZBB-NEXT: slli a0, a0, 2 +; RV32ZBB-NEXT: and a4, a5, a4 +; RV32ZBB-NEXT: slli a1, a1, 2 ; RV32ZBB-NEXT: or a0, a2, a0 +; RV32ZBB-NEXT: or a1, a4, a1 ; RV32ZBB-NEXT: srli a2, a0, 1 -; RV32ZBB-NEXT: lui a5, 349525 -; RV32ZBB-NEXT: addi a5, a5, 1365 -; RV32ZBB-NEXT: and a2, a2, a5 -; RV32ZBB-NEXT: and a0, a0, a5 -; RV32ZBB-NEXT: slli a0, a0, 1 -; RV32ZBB-NEXT: or a0, a2, a0 -; RV32ZBB-NEXT: srli a2, a1, 4 -; RV32ZBB-NEXT: and a2, a2, a3 +; RV32ZBB-NEXT: and a0, a0, a3 +; RV32ZBB-NEXT: srli a4, a1, 1 ; RV32ZBB-NEXT: and a1, a1, a3 -; RV32ZBB-NEXT: slli a1, a1, 4 -; RV32ZBB-NEXT: or a1, a2, a1 -; RV32ZBB-NEXT: srli a2, a1, 2 -; RV32ZBB-NEXT: and a2, a2, a4 -; RV32ZBB-NEXT: and a1, a1, a4 -; RV32ZBB-NEXT: slli a1, a1, 2 -; RV32ZBB-NEXT: or a1, a2, a1 -; RV32ZBB-NEXT: srli a2, a1, 1 -; RV32ZBB-NEXT: and a2, a2, a5 -; RV32ZBB-NEXT: and a1, a1, a5 +; RV32ZBB-NEXT: and a2, a2, a3 +; RV32ZBB-NEXT: slli a0, a0, 1 +; RV32ZBB-NEXT: and a3, a4, a3 ; RV32ZBB-NEXT: slli a1, a1, 1 -; RV32ZBB-NEXT: or a1, a2, a1 +; RV32ZBB-NEXT: or a0, a2, a0 +; RV32ZBB-NEXT: or a1, a3, a1 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: test_bitreverse_bswap_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: srli a1, a0, 4 -; RV64ZBB-NEXT: lui a2, 61681 -; RV64ZBB-NEXT: addiw a2, a2, -241 -; RV64ZBB-NEXT: slli a3, a2, 32 -; RV64ZBB-NEXT: add a2, a2, a3 -; RV64ZBB-NEXT: and a1, a1, a2 -; RV64ZBB-NEXT: and a0, a0, a2 -; RV64ZBB-NEXT: slli a0, a0, 4 -; RV64ZBB-NEXT: or a0, a1, a0 -; RV64ZBB-NEXT: srli a1, a0, 2 +; RV64ZBB-NEXT: lui a1, 61681 ; RV64ZBB-NEXT: lui a2, 209715 +; RV64ZBB-NEXT: lui a3, 349525 +; RV64ZBB-NEXT: addiw a1, a1, -241 ; RV64ZBB-NEXT: addiw a2, a2, 819 -; RV64ZBB-NEXT: slli a3, a2, 32 -; RV64ZBB-NEXT: add a2, a2, a3 -; RV64ZBB-NEXT: and a1, a1, a2 +; RV64ZBB-NEXT: addiw a3, a3, 1365 +; RV64ZBB-NEXT: slli a4, a1, 32 +; RV64ZBB-NEXT: add a1, a1, a4 +; RV64ZBB-NEXT: slli a4, a2, 32 +; RV64ZBB-NEXT: add a2, a2, a4 +; RV64ZBB-NEXT: slli a4, a3, 32 +; RV64ZBB-NEXT: add a3, a3, a4 +; RV64ZBB-NEXT: srli a4, a0, 4 +; RV64ZBB-NEXT: and a4, a4, a1 +; RV64ZBB-NEXT: and a0, a0, a1 +; RV64ZBB-NEXT: slli a0, a0, 4 +; RV64ZBB-NEXT: or a0, a4, a0 +; RV64ZBB-NEXT: srli a1, a0, 2 ; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a1, a1, a2 ; RV64ZBB-NEXT: slli a0, a0, 2 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: srli a1, a0, 1 -; RV64ZBB-NEXT: lui a2, 349525 -; RV64ZBB-NEXT: addiw a2, a2, 1365 -; RV64ZBB-NEXT: slli a3, a2, 32 -; RV64ZBB-NEXT: add a2, a2, a3 -; RV64ZBB-NEXT: and a1, a1, a2 -; RV64ZBB-NEXT: and a0, a0, a2 +; RV64ZBB-NEXT: and a0, a0, a3 +; RV64ZBB-NEXT: and a1, a1, a3 ; RV64ZBB-NEXT: slli a0, a0, 1 ; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/calling-conv-half.ll b/llvm/test/CodeGen/RISCV/calling-conv-half.ll index cccb69d2e6986..541c9b4d40c7e 100644 --- 
a/llvm/test/CodeGen/RISCV/calling-conv-half.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-half.ll @@ -333,8 +333,7 @@ define i32 @caller_half_on_stack() nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a0, 5 -; RV32I-NEXT: addi t0, a0, -1792 +; RV32I-NEXT: lui a7, 5 ; RV32I-NEXT: li a0, 1 ; RV32I-NEXT: li a1, 2 ; RV32I-NEXT: li a2, 3 @@ -342,6 +341,7 @@ define i32 @caller_half_on_stack() nounwind { ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 6 ; RV32I-NEXT: li a6, 7 +; RV32I-NEXT: addi t0, a7, -1792 ; RV32I-NEXT: li a7, 8 ; RV32I-NEXT: sw t0, 0(sp) ; RV32I-NEXT: call callee_half_on_stack @@ -353,8 +353,7 @@ define i32 @caller_half_on_stack() nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a0, 5 -; RV64I-NEXT: addiw t0, a0, -1792 +; RV64I-NEXT: lui a7, 5 ; RV64I-NEXT: li a0, 1 ; RV64I-NEXT: li a1, 2 ; RV64I-NEXT: li a2, 3 @@ -362,6 +361,7 @@ define i32 @caller_half_on_stack() nounwind { ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: li a5, 6 ; RV64I-NEXT: li a6, 7 +; RV64I-NEXT: addiw t0, a7, -1792 ; RV64I-NEXT: li a7, 8 ; RV64I-NEXT: sd t0, 0(sp) ; RV64I-NEXT: call callee_half_on_stack @@ -373,8 +373,7 @@ define i32 @caller_half_on_stack() nounwind { ; RV32IF: # %bb.0: ; RV32IF-NEXT: addi sp, sp, -16 ; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IF-NEXT: lui a0, 1048565 -; RV32IF-NEXT: addi t0, a0, -1792 +; RV32IF-NEXT: lui a7, 1048565 ; RV32IF-NEXT: li a0, 1 ; RV32IF-NEXT: li a1, 2 ; RV32IF-NEXT: li a2, 3 @@ -382,6 +381,7 @@ define i32 @caller_half_on_stack() nounwind { ; RV32IF-NEXT: li a4, 5 ; RV32IF-NEXT: li a5, 6 ; RV32IF-NEXT: li a6, 7 +; RV32IF-NEXT: addi t0, a7, -1792 ; RV32IF-NEXT: li a7, 8 ; RV32IF-NEXT: sw t0, 0(sp) ; RV32IF-NEXT: call callee_half_on_stack @@ -393,8 +393,7 @@ define i32 @caller_half_on_stack() nounwind { ; RV64IF: # %bb.0: ; RV64IF-NEXT: addi sp, sp, -16 ; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64IF-NEXT: lui a0, 1048565 -; RV64IF-NEXT: addi t0, a0, -1792 +; RV64IF-NEXT: lui a7, 1048565 ; RV64IF-NEXT: li a0, 1 ; RV64IF-NEXT: li a1, 2 ; RV64IF-NEXT: li a2, 3 @@ -402,6 +401,7 @@ define i32 @caller_half_on_stack() nounwind { ; RV64IF-NEXT: li a4, 5 ; RV64IF-NEXT: li a5, 6 ; RV64IF-NEXT: li a6, 7 +; RV64IF-NEXT: addi t0, a7, -1792 ; RV64IF-NEXT: li a7, 8 ; RV64IF-NEXT: sw t0, 0(sp) ; RV64IF-NEXT: call callee_half_on_stack @@ -413,12 +413,12 @@ define i32 @caller_half_on_stack() nounwind { ; RV32-ILP32F: # %bb.0: ; RV32-ILP32F-NEXT: addi sp, sp, -16 ; RV32-ILP32F-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ILP32F-NEXT: lui a0, %hi(.LCPI3_0) -; RV32-ILP32F-NEXT: flw fa0, %lo(.LCPI3_0)(a0) +; RV32-ILP32F-NEXT: lui a4, %hi(.LCPI3_0) ; RV32-ILP32F-NEXT: li a0, 1 ; RV32-ILP32F-NEXT: li a1, 2 ; RV32-ILP32F-NEXT: li a2, 3 ; RV32-ILP32F-NEXT: li a3, 4 +; RV32-ILP32F-NEXT: flw fa0, %lo(.LCPI3_0)(a4) ; RV32-ILP32F-NEXT: li a4, 5 ; RV32-ILP32F-NEXT: li a5, 6 ; RV32-ILP32F-NEXT: li a6, 7 @@ -432,12 +432,12 @@ define i32 @caller_half_on_stack() nounwind { ; RV64-LP64F: # %bb.0: ; RV64-LP64F-NEXT: addi sp, sp, -16 ; RV64-LP64F-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-LP64F-NEXT: lui a0, %hi(.LCPI3_0) -; RV64-LP64F-NEXT: flw fa0, %lo(.LCPI3_0)(a0) +; RV64-LP64F-NEXT: lui a4, %hi(.LCPI3_0) ; RV64-LP64F-NEXT: li a0, 1 ; RV64-LP64F-NEXT: li a1, 2 ; RV64-LP64F-NEXT: li a2, 3 ; RV64-LP64F-NEXT: li a3, 4 +; RV64-LP64F-NEXT: flw fa0, %lo(.LCPI3_0)(a4) ; RV64-LP64F-NEXT: li a4, 5 ; RV64-LP64F-NEXT: li 
a5, 6 ; RV64-LP64F-NEXT: li a6, 7 @@ -451,12 +451,12 @@ define i32 @caller_half_on_stack() nounwind { ; RV32-ILP32ZFHMIN: # %bb.0: ; RV32-ILP32ZFHMIN-NEXT: addi sp, sp, -16 ; RV32-ILP32ZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ILP32ZFHMIN-NEXT: lui a0, %hi(.LCPI3_0) -; RV32-ILP32ZFHMIN-NEXT: flh fa0, %lo(.LCPI3_0)(a0) +; RV32-ILP32ZFHMIN-NEXT: lui a4, %hi(.LCPI3_0) ; RV32-ILP32ZFHMIN-NEXT: li a0, 1 ; RV32-ILP32ZFHMIN-NEXT: li a1, 2 ; RV32-ILP32ZFHMIN-NEXT: li a2, 3 ; RV32-ILP32ZFHMIN-NEXT: li a3, 4 +; RV32-ILP32ZFHMIN-NEXT: flh fa0, %lo(.LCPI3_0)(a4) ; RV32-ILP32ZFHMIN-NEXT: li a4, 5 ; RV32-ILP32ZFHMIN-NEXT: li a5, 6 ; RV32-ILP32ZFHMIN-NEXT: li a6, 7 @@ -470,12 +470,12 @@ define i32 @caller_half_on_stack() nounwind { ; RV64-LP64ZFHMIN: # %bb.0: ; RV64-LP64ZFHMIN-NEXT: addi sp, sp, -16 ; RV64-LP64ZFHMIN-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-LP64ZFHMIN-NEXT: lui a0, %hi(.LCPI3_0) -; RV64-LP64ZFHMIN-NEXT: flh fa0, %lo(.LCPI3_0)(a0) +; RV64-LP64ZFHMIN-NEXT: lui a4, %hi(.LCPI3_0) ; RV64-LP64ZFHMIN-NEXT: li a0, 1 ; RV64-LP64ZFHMIN-NEXT: li a1, 2 ; RV64-LP64ZFHMIN-NEXT: li a2, 3 ; RV64-LP64ZFHMIN-NEXT: li a3, 4 +; RV64-LP64ZFHMIN-NEXT: flh fa0, %lo(.LCPI3_0)(a4) ; RV64-LP64ZFHMIN-NEXT: li a4, 5 ; RV64-LP64ZFHMIN-NEXT: li a5, 6 ; RV64-LP64ZFHMIN-NEXT: li a6, 7 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll index e97a3bff32fac..9387b7ef4c32e 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll @@ -145,43 +145,45 @@ define void @caller_aligned_stack() nounwind { ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: addi sp, sp, -64 ; RV32I-FPELIM-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32I-FPELIM-NEXT: li a0, 18 -; RV32I-FPELIM-NEXT: li a1, 17 -; RV32I-FPELIM-NEXT: sw a1, 20(sp) -; RV32I-FPELIM-NEXT: sw a0, 24(sp) -; RV32I-FPELIM-NEXT: li a0, 16 -; RV32I-FPELIM-NEXT: lui a1, 262236 -; RV32I-FPELIM-NEXT: addi a1, a1, 655 -; RV32I-FPELIM-NEXT: lui a2, 377487 -; RV32I-FPELIM-NEXT: addi a2, a2, 1475 -; RV32I-FPELIM-NEXT: li a3, 15 -; RV32I-FPELIM-NEXT: sw a3, 0(sp) -; RV32I-FPELIM-NEXT: sw a2, 8(sp) -; RV32I-FPELIM-NEXT: sw a1, 12(sp) -; RV32I-FPELIM-NEXT: sw a0, 16(sp) -; RV32I-FPELIM-NEXT: lui a0, 262153 -; RV32I-FPELIM-NEXT: addi t0, a0, 491 -; RV32I-FPELIM-NEXT: lui a0, 545260 -; RV32I-FPELIM-NEXT: addi t1, a0, -1967 -; RV32I-FPELIM-NEXT: lui a0, 964690 -; RV32I-FPELIM-NEXT: addi t2, a0, -328 -; RV32I-FPELIM-NEXT: lui a0, 335544 -; RV32I-FPELIM-NEXT: addi t3, a0, 1311 -; RV32I-FPELIM-NEXT: lui a0, 688509 -; RV32I-FPELIM-NEXT: addi a5, a0, -2048 +; RV32I-FPELIM-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32I-FPELIM-NEXT: li a5, 18 +; RV32I-FPELIM-NEXT: li a6, 17 +; RV32I-FPELIM-NEXT: li a7, 16 +; RV32I-FPELIM-NEXT: lui t0, 262236 +; RV32I-FPELIM-NEXT: lui t1, 377487 +; RV32I-FPELIM-NEXT: li t2, 15 +; RV32I-FPELIM-NEXT: lui t3, 262153 +; RV32I-FPELIM-NEXT: lui t4, 545260 +; RV32I-FPELIM-NEXT: lui t5, 964690 +; RV32I-FPELIM-NEXT: lui t6, 335544 +; RV32I-FPELIM-NEXT: lui s0, 688509 ; RV32I-FPELIM-NEXT: li a0, 1 ; RV32I-FPELIM-NEXT: li a1, 11 ; RV32I-FPELIM-NEXT: addi a2, sp, 32 ; RV32I-FPELIM-NEXT: li a3, 12 ; RV32I-FPELIM-NEXT: li a4, 13 +; RV32I-FPELIM-NEXT: sw a6, 20(sp) +; RV32I-FPELIM-NEXT: sw a5, 24(sp) ; RV32I-FPELIM-NEXT: li a6, 4 +; RV32I-FPELIM-NEXT: addi a5, t0, 655 +; RV32I-FPELIM-NEXT: addi t0, t1, 1475 +; RV32I-FPELIM-NEXT: sw t2, 0(sp) +; RV32I-FPELIM-NEXT: sw t0, 8(sp) +; 
RV32I-FPELIM-NEXT: sw a5, 12(sp) +; RV32I-FPELIM-NEXT: sw a7, 16(sp) ; RV32I-FPELIM-NEXT: li a7, 14 +; RV32I-FPELIM-NEXT: addi t0, t3, 491 +; RV32I-FPELIM-NEXT: addi t1, t4, -1967 +; RV32I-FPELIM-NEXT: addi t2, t5, -328 +; RV32I-FPELIM-NEXT: addi t3, t6, 1311 +; RV32I-FPELIM-NEXT: addi a5, s0, -2048 ; RV32I-FPELIM-NEXT: sw t3, 32(sp) ; RV32I-FPELIM-NEXT: sw t2, 36(sp) ; RV32I-FPELIM-NEXT: sw t1, 40(sp) ; RV32I-FPELIM-NEXT: sw t0, 44(sp) ; RV32I-FPELIM-NEXT: call callee_aligned_stack ; RV32I-FPELIM-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32I-FPELIM-NEXT: lw s0, 56(sp) # 4-byte Folded Reload ; RV32I-FPELIM-NEXT: addi sp, sp, 64 ; RV32I-FPELIM-NEXT: ret ; @@ -190,38 +192,39 @@ define void @caller_aligned_stack() nounwind { ; RV32I-WITHFP-NEXT: addi sp, sp, -64 ; RV32I-WITHFP-NEXT: sw ra, 60(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32I-WITHFP-NEXT: sw s1, 52(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 64 -; RV32I-WITHFP-NEXT: li a0, 18 -; RV32I-WITHFP-NEXT: li a1, 17 -; RV32I-WITHFP-NEXT: sw a1, 20(sp) -; RV32I-WITHFP-NEXT: sw a0, 24(sp) -; RV32I-WITHFP-NEXT: li a0, 16 -; RV32I-WITHFP-NEXT: lui a1, 262236 -; RV32I-WITHFP-NEXT: addi a1, a1, 655 -; RV32I-WITHFP-NEXT: lui a2, 377487 -; RV32I-WITHFP-NEXT: addi a2, a2, 1475 -; RV32I-WITHFP-NEXT: li a3, 15 -; RV32I-WITHFP-NEXT: sw a3, 0(sp) -; RV32I-WITHFP-NEXT: sw a2, 8(sp) -; RV32I-WITHFP-NEXT: sw a1, 12(sp) -; RV32I-WITHFP-NEXT: sw a0, 16(sp) -; RV32I-WITHFP-NEXT: lui a0, 262153 -; RV32I-WITHFP-NEXT: addi t0, a0, 491 -; RV32I-WITHFP-NEXT: lui a0, 545260 -; RV32I-WITHFP-NEXT: addi t1, a0, -1967 -; RV32I-WITHFP-NEXT: lui a0, 964690 -; RV32I-WITHFP-NEXT: addi t2, a0, -328 -; RV32I-WITHFP-NEXT: lui a0, 335544 -; RV32I-WITHFP-NEXT: addi t3, a0, 1311 -; RV32I-WITHFP-NEXT: lui a0, 688509 -; RV32I-WITHFP-NEXT: addi a5, a0, -2048 +; RV32I-WITHFP-NEXT: li a5, 18 +; RV32I-WITHFP-NEXT: li a6, 17 +; RV32I-WITHFP-NEXT: li a7, 16 +; RV32I-WITHFP-NEXT: lui t0, 262236 +; RV32I-WITHFP-NEXT: lui t1, 377487 +; RV32I-WITHFP-NEXT: li t2, 15 +; RV32I-WITHFP-NEXT: lui t3, 262153 +; RV32I-WITHFP-NEXT: lui t4, 545260 +; RV32I-WITHFP-NEXT: lui t5, 964690 +; RV32I-WITHFP-NEXT: lui t6, 335544 +; RV32I-WITHFP-NEXT: lui s1, 688509 ; RV32I-WITHFP-NEXT: li a0, 1 ; RV32I-WITHFP-NEXT: li a1, 11 ; RV32I-WITHFP-NEXT: addi a2, s0, -32 ; RV32I-WITHFP-NEXT: li a3, 12 ; RV32I-WITHFP-NEXT: li a4, 13 +; RV32I-WITHFP-NEXT: sw a6, 20(sp) +; RV32I-WITHFP-NEXT: sw a5, 24(sp) ; RV32I-WITHFP-NEXT: li a6, 4 +; RV32I-WITHFP-NEXT: addi a5, t0, 655 +; RV32I-WITHFP-NEXT: addi t0, t1, 1475 +; RV32I-WITHFP-NEXT: sw t2, 0(sp) +; RV32I-WITHFP-NEXT: sw t0, 8(sp) +; RV32I-WITHFP-NEXT: sw a5, 12(sp) +; RV32I-WITHFP-NEXT: sw a7, 16(sp) ; RV32I-WITHFP-NEXT: li a7, 14 +; RV32I-WITHFP-NEXT: addi t0, t3, 491 +; RV32I-WITHFP-NEXT: addi t1, t4, -1967 +; RV32I-WITHFP-NEXT: addi t2, t5, -328 +; RV32I-WITHFP-NEXT: addi t3, t6, 1311 +; RV32I-WITHFP-NEXT: addi a5, s1, -2048 ; RV32I-WITHFP-NEXT: sw t3, -32(s0) ; RV32I-WITHFP-NEXT: sw t2, -28(s0) ; RV32I-WITHFP-NEXT: sw t1, -24(s0) @@ -229,6 +232,7 @@ define void @caller_aligned_stack() nounwind { ; RV32I-WITHFP-NEXT: call callee_aligned_stack ; RV32I-WITHFP-NEXT: lw ra, 60(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32I-WITHFP-NEXT: lw s1, 52(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: addi sp, sp, 64 ; RV32I-WITHFP-NEXT: ret %1 = call i32 @callee_aligned_stack(i32 1, i32 11, diff --git 
a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll index 9e4c8a6e3320c..18916dd69eb43 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll @@ -86,15 +86,15 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i ; RV32I-FPELIM-NEXT: lw t1, 0(sp) ; RV32I-FPELIM-NEXT: andi a0, a0, 255 ; RV32I-FPELIM-NEXT: slli a1, a1, 16 +; RV32I-FPELIM-NEXT: xor a3, a3, a7 ; RV32I-FPELIM-NEXT: srli a1, a1, 16 ; RV32I-FPELIM-NEXT: add a0, a0, a2 ; RV32I-FPELIM-NEXT: add a0, a0, a1 -; RV32I-FPELIM-NEXT: xor a1, a4, t1 -; RV32I-FPELIM-NEXT: xor a2, a3, a7 -; RV32I-FPELIM-NEXT: or a1, a2, a1 -; RV32I-FPELIM-NEXT: seqz a1, a1 ; RV32I-FPELIM-NEXT: add a0, a0, a5 +; RV32I-FPELIM-NEXT: xor a1, a4, t1 ; RV32I-FPELIM-NEXT: add a0, a0, a6 +; RV32I-FPELIM-NEXT: or a1, a3, a1 +; RV32I-FPELIM-NEXT: seqz a1, a1 ; RV32I-FPELIM-NEXT: add a0, a0, t0 ; RV32I-FPELIM-NEXT: add a0, a1, a0 ; RV32I-FPELIM-NEXT: ret @@ -109,15 +109,15 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i ; RV32I-WITHFP-NEXT: lw t1, 0(s0) ; RV32I-WITHFP-NEXT: andi a0, a0, 255 ; RV32I-WITHFP-NEXT: slli a1, a1, 16 +; RV32I-WITHFP-NEXT: xor a3, a3, a7 ; RV32I-WITHFP-NEXT: srli a1, a1, 16 ; RV32I-WITHFP-NEXT: add a0, a0, a2 ; RV32I-WITHFP-NEXT: add a0, a0, a1 -; RV32I-WITHFP-NEXT: xor a1, a4, t1 -; RV32I-WITHFP-NEXT: xor a2, a3, a7 -; RV32I-WITHFP-NEXT: or a1, a2, a1 -; RV32I-WITHFP-NEXT: seqz a1, a1 ; RV32I-WITHFP-NEXT: add a0, a0, a5 +; RV32I-WITHFP-NEXT: xor a1, a4, t1 ; RV32I-WITHFP-NEXT: add a0, a0, a6 +; RV32I-WITHFP-NEXT: or a1, a3, a1 +; RV32I-WITHFP-NEXT: seqz a1, a1 ; RV32I-WITHFP-NEXT: add a0, a0, t0 ; RV32I-WITHFP-NEXT: add a0, a1, a0 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -190,21 +190,21 @@ define i32 @caller_many_scalars() nounwind { define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind { ; RV32I-FPELIM-LABEL: callee_large_scalars: ; RV32I-FPELIM: # %bb.0: -; RV32I-FPELIM-NEXT: lw a2, 0(a0) -; RV32I-FPELIM-NEXT: lw a3, 4(a0) -; RV32I-FPELIM-NEXT: lw a4, 12(a1) +; RV32I-FPELIM-NEXT: lw a2, 0(a1) +; RV32I-FPELIM-NEXT: lw a3, 4(a1) +; RV32I-FPELIM-NEXT: lw a4, 8(a1) +; RV32I-FPELIM-NEXT: lw a1, 12(a1) ; RV32I-FPELIM-NEXT: lw a5, 12(a0) -; RV32I-FPELIM-NEXT: lw a6, 0(a1) -; RV32I-FPELIM-NEXT: lw a7, 4(a1) -; RV32I-FPELIM-NEXT: lw a1, 8(a1) -; RV32I-FPELIM-NEXT: lw a0, 8(a0) -; RV32I-FPELIM-NEXT: xor a4, a5, a4 -; RV32I-FPELIM-NEXT: xor a3, a3, a7 -; RV32I-FPELIM-NEXT: or a3, a3, a4 -; RV32I-FPELIM-NEXT: xor a0, a0, a1 -; RV32I-FPELIM-NEXT: xor a1, a2, a6 -; RV32I-FPELIM-NEXT: or a0, a1, a0 -; RV32I-FPELIM-NEXT: or a0, a0, a3 +; RV32I-FPELIM-NEXT: lw a6, 4(a0) +; RV32I-FPELIM-NEXT: lw a7, 8(a0) +; RV32I-FPELIM-NEXT: lw a0, 0(a0) +; RV32I-FPELIM-NEXT: xor a1, a5, a1 +; RV32I-FPELIM-NEXT: xor a3, a6, a3 +; RV32I-FPELIM-NEXT: xor a4, a7, a4 +; RV32I-FPELIM-NEXT: xor a0, a0, a2 +; RV32I-FPELIM-NEXT: or a1, a3, a1 +; RV32I-FPELIM-NEXT: or a0, a0, a4 +; RV32I-FPELIM-NEXT: or a0, a0, a1 ; RV32I-FPELIM-NEXT: seqz a0, a0 ; RV32I-FPELIM-NEXT: ret ; @@ -214,21 +214,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind { ; RV32I-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 16 -; RV32I-WITHFP-NEXT: lw a2, 0(a0) -; RV32I-WITHFP-NEXT: lw a3, 4(a0) -; RV32I-WITHFP-NEXT: lw a4, 12(a1) +; 
RV32I-WITHFP-NEXT: lw a2, 0(a1) +; RV32I-WITHFP-NEXT: lw a3, 4(a1) +; RV32I-WITHFP-NEXT: lw a4, 8(a1) +; RV32I-WITHFP-NEXT: lw a1, 12(a1) ; RV32I-WITHFP-NEXT: lw a5, 12(a0) -; RV32I-WITHFP-NEXT: lw a6, 0(a1) -; RV32I-WITHFP-NEXT: lw a7, 4(a1) -; RV32I-WITHFP-NEXT: lw a1, 8(a1) -; RV32I-WITHFP-NEXT: lw a0, 8(a0) -; RV32I-WITHFP-NEXT: xor a4, a5, a4 -; RV32I-WITHFP-NEXT: xor a3, a3, a7 -; RV32I-WITHFP-NEXT: or a3, a3, a4 -; RV32I-WITHFP-NEXT: xor a0, a0, a1 -; RV32I-WITHFP-NEXT: xor a1, a2, a6 -; RV32I-WITHFP-NEXT: or a0, a1, a0 -; RV32I-WITHFP-NEXT: or a0, a0, a3 +; RV32I-WITHFP-NEXT: lw a6, 4(a0) +; RV32I-WITHFP-NEXT: lw a7, 8(a0) +; RV32I-WITHFP-NEXT: lw a0, 0(a0) +; RV32I-WITHFP-NEXT: xor a1, a5, a1 +; RV32I-WITHFP-NEXT: xor a3, a6, a3 +; RV32I-WITHFP-NEXT: xor a4, a7, a4 +; RV32I-WITHFP-NEXT: xor a0, a0, a2 +; RV32I-WITHFP-NEXT: or a1, a3, a1 +; RV32I-WITHFP-NEXT: or a0, a0, a4 +; RV32I-WITHFP-NEXT: or a0, a0, a1 ; RV32I-WITHFP-NEXT: seqz a0, a0 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -245,13 +245,13 @@ define i32 @caller_large_scalars() nounwind { ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: addi sp, sp, -48 ; RV32I-FPELIM-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32I-FPELIM-NEXT: lui a0, 524272 +; RV32I-FPELIM-NEXT: lui a1, 524272 +; RV32I-FPELIM-NEXT: li a2, 1 +; RV32I-FPELIM-NEXT: addi a0, sp, 24 ; RV32I-FPELIM-NEXT: sw zero, 0(sp) ; RV32I-FPELIM-NEXT: sw zero, 4(sp) ; RV32I-FPELIM-NEXT: sw zero, 8(sp) -; RV32I-FPELIM-NEXT: sw a0, 12(sp) -; RV32I-FPELIM-NEXT: li a2, 1 -; RV32I-FPELIM-NEXT: addi a0, sp, 24 +; RV32I-FPELIM-NEXT: sw a1, 12(sp) ; RV32I-FPELIM-NEXT: mv a1, sp ; RV32I-FPELIM-NEXT: sw a2, 24(sp) ; RV32I-FPELIM-NEXT: sw zero, 28(sp) @@ -268,13 +268,13 @@ define i32 @caller_large_scalars() nounwind { ; RV32I-WITHFP-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: sw s0, 40(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 48 -; RV32I-WITHFP-NEXT: lui a0, 524272 +; RV32I-WITHFP-NEXT: lui a1, 524272 +; RV32I-WITHFP-NEXT: li a2, 1 +; RV32I-WITHFP-NEXT: addi a0, s0, -24 ; RV32I-WITHFP-NEXT: sw zero, -48(s0) ; RV32I-WITHFP-NEXT: sw zero, -44(s0) ; RV32I-WITHFP-NEXT: sw zero, -40(s0) -; RV32I-WITHFP-NEXT: sw a0, -36(s0) -; RV32I-WITHFP-NEXT: li a2, 1 -; RV32I-WITHFP-NEXT: addi a0, s0, -24 +; RV32I-WITHFP-NEXT: sw a1, -36(s0) ; RV32I-WITHFP-NEXT: addi a1, s0, -48 ; RV32I-WITHFP-NEXT: sw a2, -24(s0) ; RV32I-WITHFP-NEXT: sw zero, -20(s0) @@ -299,18 +299,18 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; RV32I-FPELIM-NEXT: lw a0, 4(sp) ; RV32I-FPELIM-NEXT: lw a1, 0(a7) ; RV32I-FPELIM-NEXT: lw a2, 4(a7) -; RV32I-FPELIM-NEXT: lw a3, 12(a0) +; RV32I-FPELIM-NEXT: lw a3, 8(a7) ; RV32I-FPELIM-NEXT: lw a4, 12(a7) -; RV32I-FPELIM-NEXT: lw a5, 0(a0) +; RV32I-FPELIM-NEXT: lw a5, 12(a0) ; RV32I-FPELIM-NEXT: lw a6, 4(a0) -; RV32I-FPELIM-NEXT: lw a0, 8(a0) -; RV32I-FPELIM-NEXT: lw a7, 8(a7) -; RV32I-FPELIM-NEXT: xor a3, a4, a3 +; RV32I-FPELIM-NEXT: lw a7, 8(a0) +; RV32I-FPELIM-NEXT: lw a0, 0(a0) +; RV32I-FPELIM-NEXT: xor a4, a4, a5 ; RV32I-FPELIM-NEXT: xor a2, a2, a6 -; RV32I-FPELIM-NEXT: or a2, a2, a3 -; RV32I-FPELIM-NEXT: xor a0, a7, a0 -; RV32I-FPELIM-NEXT: xor a1, a1, a5 -; RV32I-FPELIM-NEXT: or a0, a1, a0 +; RV32I-FPELIM-NEXT: xor a3, a3, a7 +; RV32I-FPELIM-NEXT: xor a0, a1, a0 +; RV32I-FPELIM-NEXT: or a2, a2, a4 +; RV32I-FPELIM-NEXT: or a0, a0, a3 ; RV32I-FPELIM-NEXT: or a0, a0, a2 ; RV32I-FPELIM-NEXT: seqz a0, a0 ; RV32I-FPELIM-NEXT: ret @@ -324,18 
+324,18 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; RV32I-WITHFP-NEXT: lw a0, 4(s0) ; RV32I-WITHFP-NEXT: lw a1, 0(a7) ; RV32I-WITHFP-NEXT: lw a2, 4(a7) -; RV32I-WITHFP-NEXT: lw a3, 12(a0) +; RV32I-WITHFP-NEXT: lw a3, 8(a7) ; RV32I-WITHFP-NEXT: lw a4, 12(a7) -; RV32I-WITHFP-NEXT: lw a5, 0(a0) +; RV32I-WITHFP-NEXT: lw a5, 12(a0) ; RV32I-WITHFP-NEXT: lw a6, 4(a0) -; RV32I-WITHFP-NEXT: lw a0, 8(a0) -; RV32I-WITHFP-NEXT: lw a7, 8(a7) -; RV32I-WITHFP-NEXT: xor a3, a4, a3 +; RV32I-WITHFP-NEXT: lw a7, 8(a0) +; RV32I-WITHFP-NEXT: lw a0, 0(a0) +; RV32I-WITHFP-NEXT: xor a4, a4, a5 ; RV32I-WITHFP-NEXT: xor a2, a2, a6 -; RV32I-WITHFP-NEXT: or a2, a2, a3 -; RV32I-WITHFP-NEXT: xor a0, a7, a0 -; RV32I-WITHFP-NEXT: xor a1, a1, a5 -; RV32I-WITHFP-NEXT: or a0, a1, a0 +; RV32I-WITHFP-NEXT: xor a3, a3, a7 +; RV32I-WITHFP-NEXT: xor a0, a1, a0 +; RV32I-WITHFP-NEXT: or a2, a2, a4 +; RV32I-WITHFP-NEXT: or a0, a0, a3 ; RV32I-WITHFP-NEXT: or a0, a0, a2 ; RV32I-WITHFP-NEXT: seqz a0, a0 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -353,25 +353,25 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind { ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: addi sp, sp, -64 ; RV32I-FPELIM-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32I-FPELIM-NEXT: addi a0, sp, 16 -; RV32I-FPELIM-NEXT: li a1, 9 -; RV32I-FPELIM-NEXT: sw a1, 0(sp) -; RV32I-FPELIM-NEXT: sw a0, 4(sp) -; RV32I-FPELIM-NEXT: lui a0, 524272 -; RV32I-FPELIM-NEXT: sw zero, 16(sp) -; RV32I-FPELIM-NEXT: sw zero, 20(sp) -; RV32I-FPELIM-NEXT: sw zero, 24(sp) -; RV32I-FPELIM-NEXT: sw a0, 28(sp) -; RV32I-FPELIM-NEXT: li t0, 8 +; RV32I-FPELIM-NEXT: addi a6, sp, 16 +; RV32I-FPELIM-NEXT: li a7, 9 +; RV32I-FPELIM-NEXT: lui t0, 524272 +; RV32I-FPELIM-NEXT: li t1, 8 ; RV32I-FPELIM-NEXT: li a0, 1 ; RV32I-FPELIM-NEXT: li a1, 2 ; RV32I-FPELIM-NEXT: li a2, 3 ; RV32I-FPELIM-NEXT: li a3, 4 ; RV32I-FPELIM-NEXT: li a4, 5 ; RV32I-FPELIM-NEXT: li a5, 6 +; RV32I-FPELIM-NEXT: sw a7, 0(sp) +; RV32I-FPELIM-NEXT: sw a6, 4(sp) ; RV32I-FPELIM-NEXT: li a6, 7 +; RV32I-FPELIM-NEXT: sw zero, 16(sp) +; RV32I-FPELIM-NEXT: sw zero, 20(sp) +; RV32I-FPELIM-NEXT: sw zero, 24(sp) +; RV32I-FPELIM-NEXT: sw t0, 28(sp) ; RV32I-FPELIM-NEXT: addi a7, sp, 40 -; RV32I-FPELIM-NEXT: sw t0, 40(sp) +; RV32I-FPELIM-NEXT: sw t1, 40(sp) ; RV32I-FPELIM-NEXT: sw zero, 44(sp) ; RV32I-FPELIM-NEXT: sw zero, 48(sp) ; RV32I-FPELIM-NEXT: sw zero, 52(sp) @@ -386,25 +386,25 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind { ; RV32I-WITHFP-NEXT: sw ra, 60(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 64 -; RV32I-WITHFP-NEXT: addi a0, s0, -48 -; RV32I-WITHFP-NEXT: li a1, 9 -; RV32I-WITHFP-NEXT: sw a1, 0(sp) -; RV32I-WITHFP-NEXT: sw a0, 4(sp) -; RV32I-WITHFP-NEXT: lui a0, 524272 -; RV32I-WITHFP-NEXT: sw zero, -48(s0) -; RV32I-WITHFP-NEXT: sw zero, -44(s0) -; RV32I-WITHFP-NEXT: sw zero, -40(s0) -; RV32I-WITHFP-NEXT: sw a0, -36(s0) -; RV32I-WITHFP-NEXT: li t0, 8 +; RV32I-WITHFP-NEXT: addi a6, s0, -48 +; RV32I-WITHFP-NEXT: li a7, 9 +; RV32I-WITHFP-NEXT: lui t0, 524272 +; RV32I-WITHFP-NEXT: li t1, 8 ; RV32I-WITHFP-NEXT: li a0, 1 ; RV32I-WITHFP-NEXT: li a1, 2 ; RV32I-WITHFP-NEXT: li a2, 3 ; RV32I-WITHFP-NEXT: li a3, 4 ; RV32I-WITHFP-NEXT: li a4, 5 ; RV32I-WITHFP-NEXT: li a5, 6 +; RV32I-WITHFP-NEXT: sw a7, 0(sp) +; RV32I-WITHFP-NEXT: sw a6, 4(sp) ; RV32I-WITHFP-NEXT: li a6, 7 +; RV32I-WITHFP-NEXT: sw zero, -48(s0) +; RV32I-WITHFP-NEXT: sw zero, -44(s0) +; RV32I-WITHFP-NEXT: sw zero, -40(s0) +; 
RV32I-WITHFP-NEXT: sw t0, -36(s0) ; RV32I-WITHFP-NEXT: addi a7, s0, -24 -; RV32I-WITHFP-NEXT: sw t0, -24(s0) +; RV32I-WITHFP-NEXT: sw t1, -24(s0) ; RV32I-WITHFP-NEXT: sw zero, -20(s0) ; RV32I-WITHFP-NEXT: sw zero, -16(s0) ; RV32I-WITHFP-NEXT: sw zero, -12(s0) @@ -664,34 +664,34 @@ define void @caller_aligned_stack() nounwind { ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: addi sp, sp, -64 ; RV32I-FPELIM-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32I-FPELIM-NEXT: li a0, 19 -; RV32I-FPELIM-NEXT: li a1, 18 -; RV32I-FPELIM-NEXT: sw a1, 20(sp) -; RV32I-FPELIM-NEXT: sw a0, 24(sp) -; RV32I-FPELIM-NEXT: li a0, 17 -; RV32I-FPELIM-NEXT: li a1, 16 -; RV32I-FPELIM-NEXT: li a2, 15 -; RV32I-FPELIM-NEXT: sw a2, 0(sp) -; RV32I-FPELIM-NEXT: sw a1, 8(sp) -; RV32I-FPELIM-NEXT: sw zero, 12(sp) -; RV32I-FPELIM-NEXT: sw a0, 16(sp) -; RV32I-FPELIM-NEXT: lui a0, 262153 -; RV32I-FPELIM-NEXT: addi t0, a0, 491 -; RV32I-FPELIM-NEXT: lui a0, 545260 -; RV32I-FPELIM-NEXT: addi t1, a0, -1967 -; RV32I-FPELIM-NEXT: lui a0, 964690 -; RV32I-FPELIM-NEXT: addi t2, a0, -328 -; RV32I-FPELIM-NEXT: lui a0, 335544 -; RV32I-FPELIM-NEXT: addi t3, a0, 1311 -; RV32I-FPELIM-NEXT: lui a0, 688509 -; RV32I-FPELIM-NEXT: addi a5, a0, -2048 +; RV32I-FPELIM-NEXT: li a5, 19 +; RV32I-FPELIM-NEXT: li a6, 18 +; RV32I-FPELIM-NEXT: li a7, 17 +; RV32I-FPELIM-NEXT: li t0, 16 +; RV32I-FPELIM-NEXT: li t1, 15 +; RV32I-FPELIM-NEXT: lui t2, 262153 +; RV32I-FPELIM-NEXT: lui t3, 545260 +; RV32I-FPELIM-NEXT: lui t4, 964690 +; RV32I-FPELIM-NEXT: lui t5, 335544 +; RV32I-FPELIM-NEXT: lui t6, 688509 ; RV32I-FPELIM-NEXT: li a0, 1 ; RV32I-FPELIM-NEXT: li a1, 11 ; RV32I-FPELIM-NEXT: addi a2, sp, 32 ; RV32I-FPELIM-NEXT: li a3, 12 ; RV32I-FPELIM-NEXT: li a4, 13 +; RV32I-FPELIM-NEXT: sw a6, 20(sp) +; RV32I-FPELIM-NEXT: sw a5, 24(sp) ; RV32I-FPELIM-NEXT: li a6, 4 +; RV32I-FPELIM-NEXT: sw t1, 0(sp) +; RV32I-FPELIM-NEXT: sw t0, 8(sp) +; RV32I-FPELIM-NEXT: sw zero, 12(sp) +; RV32I-FPELIM-NEXT: sw a7, 16(sp) ; RV32I-FPELIM-NEXT: li a7, 14 +; RV32I-FPELIM-NEXT: addi t0, t2, 491 +; RV32I-FPELIM-NEXT: addi t1, t3, -1967 +; RV32I-FPELIM-NEXT: addi t2, t4, -328 +; RV32I-FPELIM-NEXT: addi t3, t5, 1311 +; RV32I-FPELIM-NEXT: addi a5, t6, -2048 ; RV32I-FPELIM-NEXT: sw t3, 32(sp) ; RV32I-FPELIM-NEXT: sw t2, 36(sp) ; RV32I-FPELIM-NEXT: sw t1, 40(sp) @@ -707,34 +707,34 @@ define void @caller_aligned_stack() nounwind { ; RV32I-WITHFP-NEXT: sw ra, 60(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 64 -; RV32I-WITHFP-NEXT: li a0, 19 -; RV32I-WITHFP-NEXT: li a1, 18 -; RV32I-WITHFP-NEXT: sw a1, 20(sp) -; RV32I-WITHFP-NEXT: sw a0, 24(sp) -; RV32I-WITHFP-NEXT: li a0, 17 -; RV32I-WITHFP-NEXT: li a1, 16 -; RV32I-WITHFP-NEXT: li a2, 15 -; RV32I-WITHFP-NEXT: sw a2, 0(sp) -; RV32I-WITHFP-NEXT: sw a1, 8(sp) -; RV32I-WITHFP-NEXT: sw zero, 12(sp) -; RV32I-WITHFP-NEXT: sw a0, 16(sp) -; RV32I-WITHFP-NEXT: lui a0, 262153 -; RV32I-WITHFP-NEXT: addi t0, a0, 491 -; RV32I-WITHFP-NEXT: lui a0, 545260 -; RV32I-WITHFP-NEXT: addi t1, a0, -1967 -; RV32I-WITHFP-NEXT: lui a0, 964690 -; RV32I-WITHFP-NEXT: addi t2, a0, -328 -; RV32I-WITHFP-NEXT: lui a0, 335544 -; RV32I-WITHFP-NEXT: addi t3, a0, 1311 -; RV32I-WITHFP-NEXT: lui a0, 688509 -; RV32I-WITHFP-NEXT: addi a5, a0, -2048 +; RV32I-WITHFP-NEXT: li a5, 19 +; RV32I-WITHFP-NEXT: li a6, 18 +; RV32I-WITHFP-NEXT: li a7, 17 +; RV32I-WITHFP-NEXT: li t0, 16 +; RV32I-WITHFP-NEXT: li t1, 15 +; RV32I-WITHFP-NEXT: lui t2, 262153 +; RV32I-WITHFP-NEXT: lui t3, 545260 +; RV32I-WITHFP-NEXT: lui t4, 964690 +; 
RV32I-WITHFP-NEXT: lui t5, 335544 +; RV32I-WITHFP-NEXT: lui t6, 688509 ; RV32I-WITHFP-NEXT: li a0, 1 ; RV32I-WITHFP-NEXT: li a1, 11 ; RV32I-WITHFP-NEXT: addi a2, s0, -32 ; RV32I-WITHFP-NEXT: li a3, 12 ; RV32I-WITHFP-NEXT: li a4, 13 +; RV32I-WITHFP-NEXT: sw a6, 20(sp) +; RV32I-WITHFP-NEXT: sw a5, 24(sp) ; RV32I-WITHFP-NEXT: li a6, 4 +; RV32I-WITHFP-NEXT: sw t1, 0(sp) +; RV32I-WITHFP-NEXT: sw t0, 8(sp) +; RV32I-WITHFP-NEXT: sw zero, 12(sp) +; RV32I-WITHFP-NEXT: sw a7, 16(sp) ; RV32I-WITHFP-NEXT: li a7, 14 +; RV32I-WITHFP-NEXT: addi t0, t2, 491 +; RV32I-WITHFP-NEXT: addi t1, t3, -1967 +; RV32I-WITHFP-NEXT: addi t2, t4, -328 +; RV32I-WITHFP-NEXT: addi t3, t5, 1311 +; RV32I-WITHFP-NEXT: addi a5, t6, -2048 ; RV32I-WITHFP-NEXT: sw t3, -32(s0) ; RV32I-WITHFP-NEXT: sw t2, -28(s0) ; RV32I-WITHFP-NEXT: sw t1, -24(s0) diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll index 1321413fbc57e..7630d5b8f77ef 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll @@ -97,21 +97,21 @@ define i32 @caller_double_in_gpr_exhausted_fprs() nounwind { ; RV32-ILP32D-NEXT: addi sp, sp, -16 ; RV32-ILP32D-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI5_0) +; RV32-ILP32D-NEXT: lui a1, %hi(.LCPI5_1) ; RV32-ILP32D-NEXT: fld fa0, %lo(.LCPI5_0)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI5_1) -; RV32-ILP32D-NEXT: fld fa1, %lo(.LCPI5_1)(a0) ; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI5_2) +; RV32-ILP32D-NEXT: fld fa1, %lo(.LCPI5_1)(a1) +; RV32-ILP32D-NEXT: lui a1, %hi(.LCPI5_3) ; RV32-ILP32D-NEXT: fld fa2, %lo(.LCPI5_2)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI5_3) -; RV32-ILP32D-NEXT: fld fa3, %lo(.LCPI5_3)(a0) ; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI5_4) +; RV32-ILP32D-NEXT: fld fa3, %lo(.LCPI5_3)(a1) +; RV32-ILP32D-NEXT: lui a1, %hi(.LCPI5_5) ; RV32-ILP32D-NEXT: fld fa4, %lo(.LCPI5_4)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI5_5) -; RV32-ILP32D-NEXT: fld fa5, %lo(.LCPI5_5)(a0) ; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI5_6) +; RV32-ILP32D-NEXT: fld fa5, %lo(.LCPI5_5)(a1) +; RV32-ILP32D-NEXT: lui a1, %hi(.LCPI5_7) ; RV32-ILP32D-NEXT: fld fa6, %lo(.LCPI5_6)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI5_7) -; RV32-ILP32D-NEXT: fld fa7, %lo(.LCPI5_7)(a0) +; RV32-ILP32D-NEXT: fld fa7, %lo(.LCPI5_7)(a1) ; RV32-ILP32D-NEXT: lui a1, 262688 ; RV32-ILP32D-NEXT: li a0, 0 ; RV32-ILP32D-NEXT: call callee_double_in_gpr_exhausted_fprs @@ -149,20 +149,20 @@ define i32 @caller_double_in_gpr_and_stack_almost_exhausted_gprs_fprs() nounwind ; RV32-ILP32D-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-ILP32D-NEXT: lui a1, 262816 ; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI7_0) +; RV32-ILP32D-NEXT: lui a2, %hi(.LCPI7_1) +; RV32-ILP32D-NEXT: lui a3, %hi(.LCPI7_2) +; RV32-ILP32D-NEXT: lui a4, %hi(.LCPI7_3) +; RV32-ILP32D-NEXT: lui a5, %hi(.LCPI7_4) +; RV32-ILP32D-NEXT: lui a6, %hi(.LCPI7_5) +; RV32-ILP32D-NEXT: lui a7, %hi(.LCPI7_6) ; RV32-ILP32D-NEXT: fld fa0, %lo(.LCPI7_0)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI7_1) -; RV32-ILP32D-NEXT: fld fa1, %lo(.LCPI7_1)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI7_2) -; RV32-ILP32D-NEXT: fld fa2, %lo(.LCPI7_2)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI7_3) -; RV32-ILP32D-NEXT: fld fa3, %lo(.LCPI7_3)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI7_4) -; RV32-ILP32D-NEXT: fld fa4, %lo(.LCPI7_4)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI7_5) -; RV32-ILP32D-NEXT: fld fa5, %lo(.LCPI7_5)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI7_6) -; RV32-ILP32D-NEXT: fld fa6, %lo(.LCPI7_6)(a0) ; 
RV32-ILP32D-NEXT: lui a0, %hi(.LCPI7_7) +; RV32-ILP32D-NEXT: fld fa1, %lo(.LCPI7_1)(a2) +; RV32-ILP32D-NEXT: fld fa2, %lo(.LCPI7_2)(a3) +; RV32-ILP32D-NEXT: fld fa3, %lo(.LCPI7_3)(a4) +; RV32-ILP32D-NEXT: fld fa4, %lo(.LCPI7_4)(a5) +; RV32-ILP32D-NEXT: fld fa5, %lo(.LCPI7_5)(a6) +; RV32-ILP32D-NEXT: fld fa6, %lo(.LCPI7_6)(a7) ; RV32-ILP32D-NEXT: fld fa7, %lo(.LCPI7_7)(a0) ; RV32-ILP32D-NEXT: li a0, 1 ; RV32-ILP32D-NEXT: li a2, 3 @@ -205,22 +205,22 @@ define i32 @caller_double_on_stack_exhausted_gprs_fprs() nounwind { ; RV32-ILP32D-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-ILP32D-NEXT: lui a1, 262816 ; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI9_0) +; RV32-ILP32D-NEXT: lui a2, %hi(.LCPI9_1) +; RV32-ILP32D-NEXT: lui a3, %hi(.LCPI9_2) +; RV32-ILP32D-NEXT: lui a4, %hi(.LCPI9_3) +; RV32-ILP32D-NEXT: lui a5, %hi(.LCPI9_4) +; RV32-ILP32D-NEXT: lui a6, %hi(.LCPI9_5) +; RV32-ILP32D-NEXT: lui a7, %hi(.LCPI9_6) ; RV32-ILP32D-NEXT: fld fa0, %lo(.LCPI9_0)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI9_1) -; RV32-ILP32D-NEXT: fld fa1, %lo(.LCPI9_1)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI9_2) -; RV32-ILP32D-NEXT: fld fa2, %lo(.LCPI9_2)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI9_3) -; RV32-ILP32D-NEXT: fld fa3, %lo(.LCPI9_3)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI9_4) -; RV32-ILP32D-NEXT: fld fa4, %lo(.LCPI9_4)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI9_5) -; RV32-ILP32D-NEXT: fld fa5, %lo(.LCPI9_5)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI9_6) -; RV32-ILP32D-NEXT: fld fa6, %lo(.LCPI9_6)(a0) -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI9_7) -; RV32-ILP32D-NEXT: fld fa7, %lo(.LCPI9_7)(a0) +; RV32-ILP32D-NEXT: lui t0, %hi(.LCPI9_7) +; RV32-ILP32D-NEXT: fld fa1, %lo(.LCPI9_1)(a2) ; RV32-ILP32D-NEXT: li a0, 1 +; RV32-ILP32D-NEXT: fld fa2, %lo(.LCPI9_2)(a3) +; RV32-ILP32D-NEXT: fld fa3, %lo(.LCPI9_3)(a4) +; RV32-ILP32D-NEXT: fld fa4, %lo(.LCPI9_4)(a5) +; RV32-ILP32D-NEXT: fld fa5, %lo(.LCPI9_5)(a6) +; RV32-ILP32D-NEXT: fld fa6, %lo(.LCPI9_6)(a7) +; RV32-ILP32D-NEXT: fld fa7, %lo(.LCPI9_7)(t0) ; RV32-ILP32D-NEXT: li a2, 3 ; RV32-ILP32D-NEXT: li a4, 5 ; RV32-ILP32D-NEXT: li a6, 7 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll index 2b779cd34a807..e16bed5400300 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll @@ -694,39 +694,39 @@ define void @caller_aligned_stack() { ; ILP32E-FPELIM-NEXT: addi s0, sp, 64 ; ILP32E-FPELIM-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-NEXT: li a0, 18 -; ILP32E-FPELIM-NEXT: li a1, 17 -; ILP32E-FPELIM-NEXT: li a2, 16 -; ILP32E-FPELIM-NEXT: lui a3, 262236 -; ILP32E-FPELIM-NEXT: addi a3, a3, 655 -; ILP32E-FPELIM-NEXT: sw a3, 16(sp) -; ILP32E-FPELIM-NEXT: sw a2, 20(sp) -; ILP32E-FPELIM-NEXT: sw a1, 24(sp) -; ILP32E-FPELIM-NEXT: sw a0, 28(sp) -; ILP32E-FPELIM-NEXT: lui a0, 377487 -; ILP32E-FPELIM-NEXT: addi a0, a0, 1475 -; ILP32E-FPELIM-NEXT: li a1, 15 -; ILP32E-FPELIM-NEXT: li a2, 14 -; ILP32E-FPELIM-NEXT: li a3, 4 -; ILP32E-FPELIM-NEXT: sw a3, 0(sp) -; ILP32E-FPELIM-NEXT: sw a2, 4(sp) -; ILP32E-FPELIM-NEXT: sw a1, 8(sp) -; ILP32E-FPELIM-NEXT: sw a0, 12(sp) -; ILP32E-FPELIM-NEXT: lui a0, 262153 -; ILP32E-FPELIM-NEXT: addi a6, a0, 491 -; ILP32E-FPELIM-NEXT: lui a0, 545260 -; ILP32E-FPELIM-NEXT: addi a7, a0, -1967 -; ILP32E-FPELIM-NEXT: lui a0, 964690 -; ILP32E-FPELIM-NEXT: addi t0, a0, -328 -; ILP32E-FPELIM-NEXT: lui a0, 335544 -; ILP32E-FPELIM-NEXT: addi t1, a0, 1311 -; ILP32E-FPELIM-NEXT: lui a0, 688509 -; ILP32E-FPELIM-NEXT: 
addi a5, a0, -2048 +; ILP32E-FPELIM-NEXT: li a3, 18 +; ILP32E-FPELIM-NEXT: li a4, 17 +; ILP32E-FPELIM-NEXT: li a5, 16 +; ILP32E-FPELIM-NEXT: lui a6, 262236 +; ILP32E-FPELIM-NEXT: lui a7, 377487 +; ILP32E-FPELIM-NEXT: li t0, 15 +; ILP32E-FPELIM-NEXT: li t1, 14 +; ILP32E-FPELIM-NEXT: li t2, 4 +; ILP32E-FPELIM-NEXT: lui t3, 262153 +; ILP32E-FPELIM-NEXT: lui t4, 545260 +; ILP32E-FPELIM-NEXT: lui t5, 964690 +; ILP32E-FPELIM-NEXT: lui t6, 335544 +; ILP32E-FPELIM-NEXT: lui s2, 688509 ; ILP32E-FPELIM-NEXT: li a0, 1 ; ILP32E-FPELIM-NEXT: li a1, 11 ; ILP32E-FPELIM-NEXT: addi a2, sp, 32 +; ILP32E-FPELIM-NEXT: addi a6, a6, 655 +; ILP32E-FPELIM-NEXT: sw a6, 16(sp) +; ILP32E-FPELIM-NEXT: sw a5, 20(sp) +; ILP32E-FPELIM-NEXT: sw a4, 24(sp) +; ILP32E-FPELIM-NEXT: sw a3, 28(sp) ; ILP32E-FPELIM-NEXT: li a3, 12 +; ILP32E-FPELIM-NEXT: addi a4, a7, 1475 +; ILP32E-FPELIM-NEXT: sw t2, 0(sp) +; ILP32E-FPELIM-NEXT: sw t1, 4(sp) +; ILP32E-FPELIM-NEXT: sw t0, 8(sp) +; ILP32E-FPELIM-NEXT: sw a4, 12(sp) ; ILP32E-FPELIM-NEXT: li a4, 13 +; ILP32E-FPELIM-NEXT: addi a6, t3, 491 +; ILP32E-FPELIM-NEXT: addi a7, t4, -1967 +; ILP32E-FPELIM-NEXT: addi t0, t5, -328 +; ILP32E-FPELIM-NEXT: addi t1, t6, 1311 +; ILP32E-FPELIM-NEXT: addi a5, s2, -2048 ; ILP32E-FPELIM-NEXT: sw t1, 32(sp) ; ILP32E-FPELIM-NEXT: sw t0, 36(sp) ; ILP32E-FPELIM-NEXT: sw a7, 40(sp) @@ -753,39 +753,39 @@ define void @caller_aligned_stack() { ; ILP32E-WITHFP-NEXT: addi s0, sp, 64 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-NEXT: li a0, 18 -; ILP32E-WITHFP-NEXT: li a1, 17 -; ILP32E-WITHFP-NEXT: li a2, 16 -; ILP32E-WITHFP-NEXT: lui a3, 262236 -; ILP32E-WITHFP-NEXT: addi a3, a3, 655 -; ILP32E-WITHFP-NEXT: sw a3, 16(sp) -; ILP32E-WITHFP-NEXT: sw a2, 20(sp) -; ILP32E-WITHFP-NEXT: sw a1, 24(sp) -; ILP32E-WITHFP-NEXT: sw a0, 28(sp) -; ILP32E-WITHFP-NEXT: lui a0, 377487 -; ILP32E-WITHFP-NEXT: addi a0, a0, 1475 -; ILP32E-WITHFP-NEXT: li a1, 15 -; ILP32E-WITHFP-NEXT: li a2, 14 -; ILP32E-WITHFP-NEXT: li a3, 4 -; ILP32E-WITHFP-NEXT: sw a3, 0(sp) -; ILP32E-WITHFP-NEXT: sw a2, 4(sp) -; ILP32E-WITHFP-NEXT: sw a1, 8(sp) -; ILP32E-WITHFP-NEXT: sw a0, 12(sp) -; ILP32E-WITHFP-NEXT: lui a0, 262153 -; ILP32E-WITHFP-NEXT: addi a6, a0, 491 -; ILP32E-WITHFP-NEXT: lui a0, 545260 -; ILP32E-WITHFP-NEXT: addi a7, a0, -1967 -; ILP32E-WITHFP-NEXT: lui a0, 964690 -; ILP32E-WITHFP-NEXT: addi t0, a0, -328 -; ILP32E-WITHFP-NEXT: lui a0, 335544 -; ILP32E-WITHFP-NEXT: addi t1, a0, 1311 -; ILP32E-WITHFP-NEXT: lui a0, 688509 -; ILP32E-WITHFP-NEXT: addi a5, a0, -2048 +; ILP32E-WITHFP-NEXT: li a3, 18 +; ILP32E-WITHFP-NEXT: li a4, 17 +; ILP32E-WITHFP-NEXT: li a5, 16 +; ILP32E-WITHFP-NEXT: lui a6, 262236 +; ILP32E-WITHFP-NEXT: lui a7, 377487 +; ILP32E-WITHFP-NEXT: li t0, 15 +; ILP32E-WITHFP-NEXT: li t1, 14 +; ILP32E-WITHFP-NEXT: li t2, 4 +; ILP32E-WITHFP-NEXT: lui t3, 262153 +; ILP32E-WITHFP-NEXT: lui t4, 545260 +; ILP32E-WITHFP-NEXT: lui t5, 964690 +; ILP32E-WITHFP-NEXT: lui t6, 335544 +; ILP32E-WITHFP-NEXT: lui s2, 688509 ; ILP32E-WITHFP-NEXT: li a0, 1 ; ILP32E-WITHFP-NEXT: li a1, 11 ; ILP32E-WITHFP-NEXT: addi a2, sp, 32 +; ILP32E-WITHFP-NEXT: addi a6, a6, 655 +; ILP32E-WITHFP-NEXT: sw a6, 16(sp) +; ILP32E-WITHFP-NEXT: sw a5, 20(sp) +; ILP32E-WITHFP-NEXT: sw a4, 24(sp) +; ILP32E-WITHFP-NEXT: sw a3, 28(sp) ; ILP32E-WITHFP-NEXT: li a3, 12 +; ILP32E-WITHFP-NEXT: addi a4, a7, 1475 +; ILP32E-WITHFP-NEXT: sw t2, 0(sp) +; ILP32E-WITHFP-NEXT: sw t1, 4(sp) +; ILP32E-WITHFP-NEXT: sw t0, 8(sp) +; ILP32E-WITHFP-NEXT: sw a4, 12(sp) ; ILP32E-WITHFP-NEXT: li a4, 
13 +; ILP32E-WITHFP-NEXT: addi a6, t3, 491 +; ILP32E-WITHFP-NEXT: addi a7, t4, -1967 +; ILP32E-WITHFP-NEXT: addi t0, t5, -328 +; ILP32E-WITHFP-NEXT: addi t1, t6, 1311 +; ILP32E-WITHFP-NEXT: addi a5, s2, -2048 ; ILP32E-WITHFP-NEXT: sw t1, 32(sp) ; ILP32E-WITHFP-NEXT: sw t0, 36(sp) ; ILP32E-WITHFP-NEXT: sw a7, 40(sp) @@ -812,39 +812,39 @@ define void @caller_aligned_stack() { ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi s0, sp, 64 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a0, 18 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a1, 17 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a2, 16 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a3, 262236 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a3, a3, 655 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a3, 16(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a2, 20(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a1, 24(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a0, 28(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a0, 377487 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a0, a0, 1475 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a1, 15 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a2, 14 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a3, 0(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a2, 4(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a1, 8(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a0, 12(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a0, 262153 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a6, a0, 491 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a0, 545260 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a7, a0, -1967 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a0, 964690 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi t0, a0, -328 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a0, 335544 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi t1, a0, 1311 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a0, 688509 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a5, a0, -2048 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 18 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 17 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a5, 16 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a6, 262236 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a7, 377487 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li t0, 15 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li t1, 14 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li t2, 4 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t3, 262153 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t4, 545260 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t5, 964690 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t6, 335544 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui s2, 688509 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a1, 11 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a2, sp, 32 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a6, a6, 655 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a6, 16(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a5, 20(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a4, 24(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a3, 28(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 12 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a4, a7, 1475 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t2, 0(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t1, 4(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t0, 8(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a4, 12(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 13 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a6, t3, 491 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a7, t4, -1967 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi t0, t5, -328 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi t1, t6, 1311 +; 
ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a5, s2, -2048 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t1, 32(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t0, 36(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a7, 40(sp) @@ -867,39 +867,39 @@ define void @caller_aligned_stack() { ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi s0, sp, 64 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a0, 18 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a1, 17 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a2, 16 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a3, 262236 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a3, a3, 655 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a3, 16(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a2, 20(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a1, 24(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a0, 28(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a0, 377487 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a0, a0, 1475 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a1, 15 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a2, 14 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a3, 0(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a2, 4(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a1, 8(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a0, 12(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a0, 262153 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a6, a0, 491 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a0, 545260 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a7, a0, -1967 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a0, 964690 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi t0, a0, -328 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a0, 335544 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi t1, a0, 1311 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a0, 688509 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a5, a0, -2048 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 18 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 17 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a5, 16 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a6, 262236 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a7, 377487 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li t0, 15 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li t1, 14 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li t2, 4 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t3, 262153 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t4, 545260 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t5, 964690 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t6, 335544 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui s2, 688509 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a1, 11 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a2, sp, 32 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a6, a6, 655 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a6, 16(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a5, 20(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a4, 24(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a3, 28(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 12 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a4, a7, 1475 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t2, 0(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t1, 4(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t0, 8(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a4, 12(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 13 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a6, t3, 491 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a7, t4, -1967 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi t0, t5, -328 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi t1, t6, 1311 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a5, s2, -2048 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t1, 32(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t0, 
36(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a7, 40(sp) @@ -1157,12 +1157,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i ; ILP32E-FPELIM-NEXT: srli a1, a1, 16 ; ILP32E-FPELIM-NEXT: add a0, a0, a2 ; ILP32E-FPELIM-NEXT: add a0, a0, a1 +; ILP32E-FPELIM-NEXT: add a0, a0, a5 ; ILP32E-FPELIM-NEXT: xor a1, a4, t1 ; ILP32E-FPELIM-NEXT: xor a2, a3, t0 +; ILP32E-FPELIM-NEXT: add a0, a0, a7 ; ILP32E-FPELIM-NEXT: or a1, a2, a1 ; ILP32E-FPELIM-NEXT: seqz a1, a1 -; ILP32E-FPELIM-NEXT: add a0, a0, a5 -; ILP32E-FPELIM-NEXT: add a0, a0, a7 ; ILP32E-FPELIM-NEXT: add a0, a0, a6 ; ILP32E-FPELIM-NEXT: add a0, a1, a0 ; ILP32E-FPELIM-NEXT: ret @@ -1186,12 +1186,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i ; ILP32E-WITHFP-NEXT: srli a1, a1, 16 ; ILP32E-WITHFP-NEXT: add a0, a0, a2 ; ILP32E-WITHFP-NEXT: add a0, a0, a1 +; ILP32E-WITHFP-NEXT: add a0, a0, a5 ; ILP32E-WITHFP-NEXT: xor a1, a4, t1 ; ILP32E-WITHFP-NEXT: xor a2, a3, t0 +; ILP32E-WITHFP-NEXT: add a0, a0, a7 ; ILP32E-WITHFP-NEXT: or a1, a2, a1 ; ILP32E-WITHFP-NEXT: seqz a1, a1 -; ILP32E-WITHFP-NEXT: add a0, a0, a5 -; ILP32E-WITHFP-NEXT: add a0, a0, a7 ; ILP32E-WITHFP-NEXT: add a0, a0, a6 ; ILP32E-WITHFP-NEXT: add a0, a1, a0 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa sp, 8 @@ -1214,12 +1214,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: srli a1, a1, 16 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a0, a2 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a0, a1 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a0, a5 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a1, a4, t1 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a2, a3, t0 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a0, a7 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a1, a2, a1 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: seqz a1, a1 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a0, a5 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a0, a7 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a0, a6 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a1, a0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: ret @@ -1241,12 +1241,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: srli a1, a1, 16 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a0, a2 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a0, a1 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a0, a5 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a1, a4, t1 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a2, a3, t0 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a0, a7 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a1, a2, a1 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: seqz a1, a1 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a0, a5 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a0, a7 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a0, a6 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a1, a0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 8 @@ -1386,21 +1386,21 @@ define i32 @caller_many_scalars() { define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ILP32E-FPELIM-LABEL: callee_large_scalars: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: lw a2, 0(a0) -; ILP32E-FPELIM-NEXT: lw a3, 4(a0) -; ILP32E-FPELIM-NEXT: lw a4, 12(a1) +; ILP32E-FPELIM-NEXT: lw a2, 0(a1) +; ILP32E-FPELIM-NEXT: lw a3, 4(a1) +; ILP32E-FPELIM-NEXT: lw a4, 8(a1) +; ILP32E-FPELIM-NEXT: lw a1, 12(a1) ; ILP32E-FPELIM-NEXT: lw a5, 12(a0) -; ILP32E-FPELIM-NEXT: lw a6, 0(a1) -; ILP32E-FPELIM-NEXT: lw a7, 4(a1) -; ILP32E-FPELIM-NEXT: lw a1, 8(a1) -; ILP32E-FPELIM-NEXT: lw a0, 8(a0) -; ILP32E-FPELIM-NEXT: xor a4, 
a5, a4 -; ILP32E-FPELIM-NEXT: xor a3, a3, a7 -; ILP32E-FPELIM-NEXT: or a3, a3, a4 -; ILP32E-FPELIM-NEXT: xor a0, a0, a1 -; ILP32E-FPELIM-NEXT: xor a1, a2, a6 -; ILP32E-FPELIM-NEXT: or a0, a1, a0 -; ILP32E-FPELIM-NEXT: or a0, a0, a3 +; ILP32E-FPELIM-NEXT: lw a6, 4(a0) +; ILP32E-FPELIM-NEXT: lw a7, 8(a0) +; ILP32E-FPELIM-NEXT: lw a0, 0(a0) +; ILP32E-FPELIM-NEXT: xor a1, a5, a1 +; ILP32E-FPELIM-NEXT: xor a3, a6, a3 +; ILP32E-FPELIM-NEXT: xor a4, a7, a4 +; ILP32E-FPELIM-NEXT: xor a0, a0, a2 +; ILP32E-FPELIM-NEXT: or a1, a3, a1 +; ILP32E-FPELIM-NEXT: or a0, a0, a4 +; ILP32E-FPELIM-NEXT: or a0, a0, a1 ; ILP32E-FPELIM-NEXT: seqz a0, a0 ; ILP32E-FPELIM-NEXT: ret ; @@ -1414,21 +1414,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 ; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 -; ILP32E-WITHFP-NEXT: lw a2, 0(a0) -; ILP32E-WITHFP-NEXT: lw a3, 4(a0) -; ILP32E-WITHFP-NEXT: lw a4, 12(a1) +; ILP32E-WITHFP-NEXT: lw a2, 0(a1) +; ILP32E-WITHFP-NEXT: lw a3, 4(a1) +; ILP32E-WITHFP-NEXT: lw a4, 8(a1) +; ILP32E-WITHFP-NEXT: lw a1, 12(a1) ; ILP32E-WITHFP-NEXT: lw a5, 12(a0) -; ILP32E-WITHFP-NEXT: lw a6, 0(a1) -; ILP32E-WITHFP-NEXT: lw a7, 4(a1) -; ILP32E-WITHFP-NEXT: lw a1, 8(a1) -; ILP32E-WITHFP-NEXT: lw a0, 8(a0) -; ILP32E-WITHFP-NEXT: xor a4, a5, a4 -; ILP32E-WITHFP-NEXT: xor a3, a3, a7 -; ILP32E-WITHFP-NEXT: or a3, a3, a4 -; ILP32E-WITHFP-NEXT: xor a0, a0, a1 -; ILP32E-WITHFP-NEXT: xor a1, a2, a6 -; ILP32E-WITHFP-NEXT: or a0, a1, a0 -; ILP32E-WITHFP-NEXT: or a0, a0, a3 +; ILP32E-WITHFP-NEXT: lw a6, 4(a0) +; ILP32E-WITHFP-NEXT: lw a7, 8(a0) +; ILP32E-WITHFP-NEXT: lw a0, 0(a0) +; ILP32E-WITHFP-NEXT: xor a1, a5, a1 +; ILP32E-WITHFP-NEXT: xor a3, a6, a3 +; ILP32E-WITHFP-NEXT: xor a4, a7, a4 +; ILP32E-WITHFP-NEXT: xor a0, a0, a2 +; ILP32E-WITHFP-NEXT: or a1, a3, a1 +; ILP32E-WITHFP-NEXT: or a0, a0, a4 +; ILP32E-WITHFP-NEXT: or a0, a0, a1 ; ILP32E-WITHFP-NEXT: seqz a0, a0 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa sp, 8 ; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload @@ -1441,21 +1441,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars: ; ILP32E-FPELIM-SAVE-RESTORE: # %bb.0: -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a2, 0(a0) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a3, 4(a0) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a4, 12(a1) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a2, 0(a1) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a3, 4(a1) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a4, 8(a1) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a1, 12(a1) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a5, 12(a0) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a6, 0(a1) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a7, 4(a1) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a1, 8(a1) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a0, 8(a0) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a4, a5, a4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a3, a3, a7 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a3, a3, a4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a0, a0, a1 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a1, a2, a6 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a1, a0 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a0, a3 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a6, 4(a0) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a7, 8(a0) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a0, 0(a0) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a1, a5, a1 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a3, a6, a3 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a4, a7, a4 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a0, a0, a2 +; 
ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a1, a3, a1 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a0, a4 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a0, a1 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: seqz a0, a0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: ret ; @@ -1467,21 +1467,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_offset s0, -8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a2, 0(a0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a3, 4(a0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a4, 12(a1) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a2, 0(a1) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a3, 4(a1) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a4, 8(a1) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a1, 12(a1) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a5, 12(a0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a6, 0(a1) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a7, 4(a1) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a1, 8(a1) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a0, 8(a0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a4, a5, a4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a3, a3, a7 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a3, a3, a4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a0, a0, a1 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a1, a2, a6 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a1, a0 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a0, a3 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a6, 4(a0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a7, 8(a0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a0, 0(a0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a1, a5, a1 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a3, a6, a3 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a4, a7, a4 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a0, a0, a2 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a1, a3, a1 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a0, a4 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a0, a1 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: seqz a0, a0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: tail __riscv_restore_1 @@ -1503,13 +1503,13 @@ define i32 @caller_large_scalars() { ; ILP32E-FPELIM-NEXT: addi s0, sp, 48 ; ILP32E-FPELIM-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-NEXT: lui a0, 524272 +; ILP32E-FPELIM-NEXT: lui a1, 524272 +; ILP32E-FPELIM-NEXT: li a2, 1 +; ILP32E-FPELIM-NEXT: addi a0, sp, 24 ; ILP32E-FPELIM-NEXT: sw zero, 0(sp) ; ILP32E-FPELIM-NEXT: sw zero, 4(sp) ; ILP32E-FPELIM-NEXT: sw zero, 8(sp) -; ILP32E-FPELIM-NEXT: sw a0, 12(sp) -; ILP32E-FPELIM-NEXT: li a2, 1 -; ILP32E-FPELIM-NEXT: addi a0, sp, 24 +; ILP32E-FPELIM-NEXT: sw a1, 12(sp) ; ILP32E-FPELIM-NEXT: mv a1, sp ; ILP32E-FPELIM-NEXT: sw a2, 24(sp) ; ILP32E-FPELIM-NEXT: sw zero, 28(sp) @@ -1537,13 +1537,13 @@ define i32 @caller_large_scalars() { ; ILP32E-WITHFP-NEXT: addi s0, sp, 48 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-NEXT: lui a0, 524272 +; ILP32E-WITHFP-NEXT: lui a1, 524272 +; ILP32E-WITHFP-NEXT: li a2, 1 +; ILP32E-WITHFP-NEXT: addi a0, sp, 24 ; ILP32E-WITHFP-NEXT: sw zero, 0(sp) ; ILP32E-WITHFP-NEXT: sw zero, 4(sp) ; ILP32E-WITHFP-NEXT: sw zero, 8(sp) -; ILP32E-WITHFP-NEXT: sw a0, 12(sp) -; ILP32E-WITHFP-NEXT: li a2, 1 -; ILP32E-WITHFP-NEXT: addi a0, sp, 24 +; ILP32E-WITHFP-NEXT: sw a1, 12(sp) ; ILP32E-WITHFP-NEXT: mv a1, sp ; ILP32E-WITHFP-NEXT: sw a2, 24(sp) ; ILP32E-WITHFP-NEXT: sw zero, 28(sp) @@ -1571,13 +1571,13 @@ define i32 @caller_large_scalars() { ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi 
s0, sp, 48 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a0, 524272 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a1, 524272 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a2, 1 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a0, sp, 24 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 0(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 4(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 8(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a0, 12(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a2, 1 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a0, sp, 24 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a1, 12(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: mv a1, sp ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a2, 24(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 28(sp) @@ -1601,13 +1601,13 @@ define i32 @caller_large_scalars() { ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi s0, sp, 48 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a0, 524272 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a1, 524272 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a2, 1 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a0, sp, 24 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 0(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 4(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 8(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a0, 12(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a2, 1 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a0, sp, 24 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a1, 12(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: mv a1, sp ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a2, 24(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 28(sp) @@ -1630,23 +1630,23 @@ define i32 @caller_large_scalars() { define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i128 %h, i32 %i, fp128 %j) { ; ILP32E-FPELIM-LABEL: callee_large_scalars_exhausted_regs: ; ILP32E-FPELIM: # %bb.0: -; ILP32E-FPELIM-NEXT: lw a0, 4(sp) -; ILP32E-FPELIM-NEXT: lw a1, 12(sp) +; ILP32E-FPELIM-NEXT: lw a0, 12(sp) +; ILP32E-FPELIM-NEXT: lw a1, 4(sp) ; ILP32E-FPELIM-NEXT: lw a2, 0(a0) ; ILP32E-FPELIM-NEXT: lw a3, 4(a0) -; ILP32E-FPELIM-NEXT: lw a4, 12(a1) -; ILP32E-FPELIM-NEXT: lw a5, 12(a0) -; ILP32E-FPELIM-NEXT: lw a6, 0(a1) -; ILP32E-FPELIM-NEXT: lw a7, 4(a1) -; ILP32E-FPELIM-NEXT: lw a1, 8(a1) -; ILP32E-FPELIM-NEXT: lw a0, 8(a0) -; ILP32E-FPELIM-NEXT: xor a4, a5, a4 -; ILP32E-FPELIM-NEXT: xor a3, a3, a7 -; ILP32E-FPELIM-NEXT: or a3, a3, a4 -; ILP32E-FPELIM-NEXT: xor a0, a0, a1 -; ILP32E-FPELIM-NEXT: xor a1, a2, a6 +; ILP32E-FPELIM-NEXT: lw a4, 8(a0) +; ILP32E-FPELIM-NEXT: lw a0, 12(a0) +; ILP32E-FPELIM-NEXT: lw a5, 12(a1) +; ILP32E-FPELIM-NEXT: lw a6, 4(a1) +; ILP32E-FPELIM-NEXT: lw a7, 8(a1) +; ILP32E-FPELIM-NEXT: lw a1, 0(a1) +; ILP32E-FPELIM-NEXT: xor a0, a5, a0 +; ILP32E-FPELIM-NEXT: xor a3, a6, a3 +; ILP32E-FPELIM-NEXT: xor a4, a7, a4 +; ILP32E-FPELIM-NEXT: xor a1, a1, a2 +; ILP32E-FPELIM-NEXT: or a0, a3, a0 +; ILP32E-FPELIM-NEXT: or a1, a1, a4 ; ILP32E-FPELIM-NEXT: or a0, a1, a0 -; ILP32E-FPELIM-NEXT: or a0, a0, a3 ; ILP32E-FPELIM-NEXT: seqz a0, a0 ; ILP32E-FPELIM-NEXT: ret ; @@ -1660,23 +1660,23 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8 ; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 -; ILP32E-WITHFP-NEXT: lw a0, 4(s0) -; ILP32E-WITHFP-NEXT: lw a1, 12(s0) +; ILP32E-WITHFP-NEXT: lw a0, 12(s0) +; ILP32E-WITHFP-NEXT: 
lw a1, 4(s0) ; ILP32E-WITHFP-NEXT: lw a2, 0(a0) ; ILP32E-WITHFP-NEXT: lw a3, 4(a0) -; ILP32E-WITHFP-NEXT: lw a4, 12(a1) -; ILP32E-WITHFP-NEXT: lw a5, 12(a0) -; ILP32E-WITHFP-NEXT: lw a6, 0(a1) -; ILP32E-WITHFP-NEXT: lw a7, 4(a1) -; ILP32E-WITHFP-NEXT: lw a1, 8(a1) -; ILP32E-WITHFP-NEXT: lw a0, 8(a0) -; ILP32E-WITHFP-NEXT: xor a4, a5, a4 -; ILP32E-WITHFP-NEXT: xor a3, a3, a7 -; ILP32E-WITHFP-NEXT: or a3, a3, a4 -; ILP32E-WITHFP-NEXT: xor a0, a0, a1 -; ILP32E-WITHFP-NEXT: xor a1, a2, a6 +; ILP32E-WITHFP-NEXT: lw a4, 8(a0) +; ILP32E-WITHFP-NEXT: lw a0, 12(a0) +; ILP32E-WITHFP-NEXT: lw a5, 12(a1) +; ILP32E-WITHFP-NEXT: lw a6, 4(a1) +; ILP32E-WITHFP-NEXT: lw a7, 8(a1) +; ILP32E-WITHFP-NEXT: lw a1, 0(a1) +; ILP32E-WITHFP-NEXT: xor a0, a5, a0 +; ILP32E-WITHFP-NEXT: xor a3, a6, a3 +; ILP32E-WITHFP-NEXT: xor a4, a7, a4 +; ILP32E-WITHFP-NEXT: xor a1, a1, a2 +; ILP32E-WITHFP-NEXT: or a0, a3, a0 +; ILP32E-WITHFP-NEXT: or a1, a1, a4 ; ILP32E-WITHFP-NEXT: or a0, a1, a0 -; ILP32E-WITHFP-NEXT: or a0, a0, a3 ; ILP32E-WITHFP-NEXT: seqz a0, a0 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa sp, 8 ; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload @@ -1689,23 +1689,23 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars_exhausted_regs: ; ILP32E-FPELIM-SAVE-RESTORE: # %bb.0: -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a0, 4(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a1, 12(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a0, 12(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a1, 4(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a2, 0(a0) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a3, 4(a0) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a4, 12(a1) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a5, 12(a0) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a6, 0(a1) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a7, 4(a1) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a1, 8(a1) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a0, 8(a0) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a4, a5, a4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a3, a3, a7 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a3, a3, a4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a0, a0, a1 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a1, a2, a6 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a4, 8(a0) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a0, 12(a0) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a5, 12(a1) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a6, 4(a1) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a7, 8(a1) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a1, 0(a1) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a0, a5, a0 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a3, a6, a3 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a4, a7, a4 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a1, a1, a2 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a3, a0 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a1, a1, a4 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a1, a0 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a0, a3 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: seqz a0, a0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: ret ; @@ -1717,23 +1717,23 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_offset s0, -8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a0, 4(s0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a1, 12(s0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a0, 12(s0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a1, 4(s0) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a2, 0(a0) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a3, 4(a0) -; 
ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a4, 12(a1) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a5, 12(a0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a6, 0(a1) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a7, 4(a1) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a1, 8(a1) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a0, 8(a0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a4, a5, a4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a3, a3, a7 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a3, a3, a4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a0, a0, a1 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a1, a2, a6 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a4, 8(a0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a0, 12(a0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a5, 12(a1) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a6, 4(a1) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a7, 8(a1) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a1, 0(a1) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a0, a5, a0 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a3, a6, a3 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a4, a7, a4 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a1, a1, a2 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a3, a0 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a1, a1, a4 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a1, a0 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a0, a3 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: seqz a0, a0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: tail __riscv_restore_1 @@ -1755,27 +1755,27 @@ define i32 @caller_large_scalars_exhausted_regs() { ; ILP32E-FPELIM-NEXT: addi s0, sp, 64 ; ILP32E-FPELIM-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-NEXT: addi a0, sp, 16 -; ILP32E-FPELIM-NEXT: li a1, 9 -; ILP32E-FPELIM-NEXT: addi a2, sp, 40 -; ILP32E-FPELIM-NEXT: li a3, 7 -; ILP32E-FPELIM-NEXT: sw a3, 0(sp) -; ILP32E-FPELIM-NEXT: sw a2, 4(sp) -; ILP32E-FPELIM-NEXT: sw a1, 8(sp) -; ILP32E-FPELIM-NEXT: sw a0, 12(sp) -; ILP32E-FPELIM-NEXT: lui a0, 524272 -; ILP32E-FPELIM-NEXT: sw zero, 16(sp) -; ILP32E-FPELIM-NEXT: sw zero, 20(sp) -; ILP32E-FPELIM-NEXT: sw zero, 24(sp) -; ILP32E-FPELIM-NEXT: sw a0, 28(sp) -; ILP32E-FPELIM-NEXT: li a6, 8 +; ILP32E-FPELIM-NEXT: addi a4, sp, 16 +; ILP32E-FPELIM-NEXT: li a5, 9 +; ILP32E-FPELIM-NEXT: addi a6, sp, 40 +; ILP32E-FPELIM-NEXT: li a7, 7 +; ILP32E-FPELIM-NEXT: lui t0, 524272 +; ILP32E-FPELIM-NEXT: li t1, 8 ; ILP32E-FPELIM-NEXT: li a0, 1 ; ILP32E-FPELIM-NEXT: li a1, 2 ; ILP32E-FPELIM-NEXT: li a2, 3 ; ILP32E-FPELIM-NEXT: li a3, 4 +; ILP32E-FPELIM-NEXT: sw a7, 0(sp) +; ILP32E-FPELIM-NEXT: sw a6, 4(sp) +; ILP32E-FPELIM-NEXT: sw a5, 8(sp) +; ILP32E-FPELIM-NEXT: sw a4, 12(sp) ; ILP32E-FPELIM-NEXT: li a4, 5 +; ILP32E-FPELIM-NEXT: sw zero, 16(sp) +; ILP32E-FPELIM-NEXT: sw zero, 20(sp) +; ILP32E-FPELIM-NEXT: sw zero, 24(sp) +; ILP32E-FPELIM-NEXT: sw t0, 28(sp) ; ILP32E-FPELIM-NEXT: li a5, 6 -; ILP32E-FPELIM-NEXT: sw a6, 40(sp) +; ILP32E-FPELIM-NEXT: sw t1, 40(sp) ; ILP32E-FPELIM-NEXT: sw zero, 44(sp) ; ILP32E-FPELIM-NEXT: sw zero, 48(sp) ; ILP32E-FPELIM-NEXT: sw zero, 52(sp) @@ -1801,27 +1801,27 @@ define i32 @caller_large_scalars_exhausted_regs() { ; ILP32E-WITHFP-NEXT: addi s0, sp, 64 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-NEXT: addi a0, sp, 16 -; ILP32E-WITHFP-NEXT: li a1, 9 -; ILP32E-WITHFP-NEXT: addi a2, sp, 40 -; ILP32E-WITHFP-NEXT: li a3, 7 -; ILP32E-WITHFP-NEXT: sw a3, 0(sp) -; ILP32E-WITHFP-NEXT: sw a2, 4(sp) -; ILP32E-WITHFP-NEXT: sw a1, 8(sp) -; ILP32E-WITHFP-NEXT: sw a0, 12(sp) -; ILP32E-WITHFP-NEXT: lui a0, 524272 -; ILP32E-WITHFP-NEXT: sw zero, 16(sp) 
-; ILP32E-WITHFP-NEXT: sw zero, 20(sp) -; ILP32E-WITHFP-NEXT: sw zero, 24(sp) -; ILP32E-WITHFP-NEXT: sw a0, 28(sp) -; ILP32E-WITHFP-NEXT: li a6, 8 +; ILP32E-WITHFP-NEXT: addi a4, sp, 16 +; ILP32E-WITHFP-NEXT: li a5, 9 +; ILP32E-WITHFP-NEXT: addi a6, sp, 40 +; ILP32E-WITHFP-NEXT: li a7, 7 +; ILP32E-WITHFP-NEXT: lui t0, 524272 +; ILP32E-WITHFP-NEXT: li t1, 8 ; ILP32E-WITHFP-NEXT: li a0, 1 ; ILP32E-WITHFP-NEXT: li a1, 2 ; ILP32E-WITHFP-NEXT: li a2, 3 ; ILP32E-WITHFP-NEXT: li a3, 4 +; ILP32E-WITHFP-NEXT: sw a7, 0(sp) +; ILP32E-WITHFP-NEXT: sw a6, 4(sp) +; ILP32E-WITHFP-NEXT: sw a5, 8(sp) +; ILP32E-WITHFP-NEXT: sw a4, 12(sp) ; ILP32E-WITHFP-NEXT: li a4, 5 +; ILP32E-WITHFP-NEXT: sw zero, 16(sp) +; ILP32E-WITHFP-NEXT: sw zero, 20(sp) +; ILP32E-WITHFP-NEXT: sw zero, 24(sp) +; ILP32E-WITHFP-NEXT: sw t0, 28(sp) ; ILP32E-WITHFP-NEXT: li a5, 6 -; ILP32E-WITHFP-NEXT: sw a6, 40(sp) +; ILP32E-WITHFP-NEXT: sw t1, 40(sp) ; ILP32E-WITHFP-NEXT: sw zero, 44(sp) ; ILP32E-WITHFP-NEXT: sw zero, 48(sp) ; ILP32E-WITHFP-NEXT: sw zero, 52(sp) @@ -1847,27 +1847,27 @@ define i32 @caller_large_scalars_exhausted_regs() { ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi s0, sp, 64 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a0, sp, 16 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a1, 9 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a2, sp, 40 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 7 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a3, 0(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a2, 4(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a1, 8(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a0, 12(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a0, 524272 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 16(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 20(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 24(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a0, 28(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a6, 8 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a4, sp, 16 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a5, 9 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a6, sp, 40 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a7, 7 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t0, 524272 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li t1, 8 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a1, 2 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a2, 3 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 4 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a7, 0(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a6, 4(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a5, 8(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a4, 12(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 5 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 16(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 20(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 24(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t0, 28(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a5, 6 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a6, 40(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t1, 40(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 44(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 48(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 52(sp) @@ -1889,27 +1889,27 @@ define i32 @caller_large_scalars_exhausted_regs() { ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi s0, sp, 64 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a0, sp, 16 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a1, 9 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a2, sp, 40 -; 
ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 7 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a3, 0(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a2, 4(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a1, 8(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a0, 12(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a0, 524272 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 16(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 20(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 24(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a0, 28(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a6, 8 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a4, sp, 16 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a5, 9 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a6, sp, 40 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a7, 7 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t0, 524272 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li t1, 8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a1, 2 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a2, 3 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 4 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a7, 0(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a6, 4(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a5, 8(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a4, 12(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 5 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 16(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 20(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 24(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t0, 28(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a5, 6 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a6, 40(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t1, 40(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 44(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 48(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 52(sp) diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll index b0d60a7aaa235..dabd2a7ce9a73 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll @@ -55,11 +55,11 @@ define i32 @caller_float_in_fpr_exhausted_gprs() nounwind { ; RV32-ILP32FD-NEXT: addi sp, sp, -16 ; RV32-ILP32FD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-ILP32FD-NEXT: li a1, 5 -; RV32-ILP32FD-NEXT: lui a0, 265216 -; RV32-ILP32FD-NEXT: fmv.w.x fa0, a0 +; RV32-ILP32FD-NEXT: lui a3, 265216 ; RV32-ILP32FD-NEXT: li a0, 1 ; RV32-ILP32FD-NEXT: li a2, 2 ; RV32-ILP32FD-NEXT: li a4, 3 +; RV32-ILP32FD-NEXT: fmv.w.x fa0, a3 ; RV32-ILP32FD-NEXT: li a6, 4 ; RV32-ILP32FD-NEXT: sw a1, 0(sp) ; RV32-ILP32FD-NEXT: li a1, 0 @@ -96,21 +96,21 @@ define i32 @caller_float_in_gpr_exhausted_fprs() nounwind { ; RV32-ILP32FD-NEXT: addi sp, sp, -16 ; RV32-ILP32FD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-ILP32FD-NEXT: lui a0, 260096 +; RV32-ILP32FD-NEXT: lui a1, 262144 ; RV32-ILP32FD-NEXT: fmv.w.x fa0, a0 -; RV32-ILP32FD-NEXT: lui a0, 262144 -; RV32-ILP32FD-NEXT: fmv.w.x fa1, a0 ; RV32-ILP32FD-NEXT: lui a0, 263168 +; RV32-ILP32FD-NEXT: fmv.w.x fa1, a1 +; RV32-ILP32FD-NEXT: lui a1, 264192 ; RV32-ILP32FD-NEXT: fmv.w.x fa2, a0 -; RV32-ILP32FD-NEXT: lui a0, 264192 -; RV32-ILP32FD-NEXT: fmv.w.x fa3, a0 ; RV32-ILP32FD-NEXT: lui a0, 264704 +; RV32-ILP32FD-NEXT: fmv.w.x fa3, a1 +; RV32-ILP32FD-NEXT: lui a1, 265216 ; RV32-ILP32FD-NEXT: fmv.w.x fa4, a0 -; RV32-ILP32FD-NEXT: lui a0, 265216 -; RV32-ILP32FD-NEXT: fmv.w.x fa5, a0 ; RV32-ILP32FD-NEXT: lui a0, 265728 +; RV32-ILP32FD-NEXT: fmv.w.x fa5, a1 +; RV32-ILP32FD-NEXT: lui a1, 266240 ; RV32-ILP32FD-NEXT: fmv.w.x fa6, 
a0 -; RV32-ILP32FD-NEXT: lui a0, 266240 -; RV32-ILP32FD-NEXT: fmv.w.x fa7, a0 +; RV32-ILP32FD-NEXT: fmv.w.x fa7, a1 ; RV32-ILP32FD-NEXT: lui a0, 266496 ; RV32-ILP32FD-NEXT: call callee_float_in_gpr_exhausted_fprs ; RV32-ILP32FD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -143,24 +143,24 @@ define i32 @caller_float_on_stack_exhausted_gprs_fprs() nounwind { ; RV32-ILP32FD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-ILP32FD-NEXT: lui a1, 267520 ; RV32-ILP32FD-NEXT: lui a0, 262144 +; RV32-ILP32FD-NEXT: lui a2, 264192 +; RV32-ILP32FD-NEXT: lui a3, 265216 +; RV32-ILP32FD-NEXT: lui a4, 266240 +; RV32-ILP32FD-NEXT: lui a5, 266496 +; RV32-ILP32FD-NEXT: lui a6, 266752 +; RV32-ILP32FD-NEXT: lui a7, 267008 ; RV32-ILP32FD-NEXT: fmv.w.x fa0, a0 -; RV32-ILP32FD-NEXT: lui a0, 264192 -; RV32-ILP32FD-NEXT: fmv.w.x fa1, a0 -; RV32-ILP32FD-NEXT: lui a0, 265216 -; RV32-ILP32FD-NEXT: fmv.w.x fa2, a0 -; RV32-ILP32FD-NEXT: lui a0, 266240 -; RV32-ILP32FD-NEXT: fmv.w.x fa3, a0 -; RV32-ILP32FD-NEXT: lui a0, 266496 -; RV32-ILP32FD-NEXT: fmv.w.x fa4, a0 -; RV32-ILP32FD-NEXT: lui a0, 266752 -; RV32-ILP32FD-NEXT: fmv.w.x fa5, a0 -; RV32-ILP32FD-NEXT: lui a0, 267008 -; RV32-ILP32FD-NEXT: fmv.w.x fa6, a0 -; RV32-ILP32FD-NEXT: lui a0, 267264 -; RV32-ILP32FD-NEXT: fmv.w.x fa7, a0 +; RV32-ILP32FD-NEXT: lui t0, 267264 +; RV32-ILP32FD-NEXT: fmv.w.x fa1, a2 ; RV32-ILP32FD-NEXT: li a0, 1 +; RV32-ILP32FD-NEXT: fmv.w.x fa2, a3 ; RV32-ILP32FD-NEXT: li a2, 3 +; RV32-ILP32FD-NEXT: fmv.w.x fa3, a4 ; RV32-ILP32FD-NEXT: li a4, 5 +; RV32-ILP32FD-NEXT: fmv.w.x fa4, a5 +; RV32-ILP32FD-NEXT: fmv.w.x fa5, a6 +; RV32-ILP32FD-NEXT: fmv.w.x fa6, a7 +; RV32-ILP32FD-NEXT: fmv.w.x fa7, t0 ; RV32-ILP32FD-NEXT: li a6, 7 ; RV32-ILP32FD-NEXT: sw a1, 0(sp) ; RV32-ILP32FD-NEXT: li a1, 0 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll index cbd2cef981d71..746b71a08a30b 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll @@ -52,15 +52,15 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i128 %d, i32 %e, i32 %f, ; RV64I-NEXT: ld t1, 0(sp) ; RV64I-NEXT: andi a0, a0, 255 ; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: xor a3, a3, a7 ; RV64I-NEXT: srli a1, a1, 48 ; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: xor a1, a4, t1 -; RV64I-NEXT: xor a2, a3, a7 -; RV64I-NEXT: or a1, a2, a1 -; RV64I-NEXT: seqz a1, a1 ; RV64I-NEXT: add a0, a0, a5 +; RV64I-NEXT: xor a1, a4, t1 ; RV64I-NEXT: add a0, a0, a6 +; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: seqz a1, a1 ; RV64I-NEXT: add a0, a0, t0 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: ret @@ -106,21 +106,21 @@ define i32 @caller_many_scalars() nounwind { define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind { ; RV64I-LABEL: callee_large_scalars: ; RV64I: # %bb.0: -; RV64I-NEXT: ld a2, 0(a0) -; RV64I-NEXT: ld a3, 8(a0) -; RV64I-NEXT: ld a4, 24(a1) +; RV64I-NEXT: ld a2, 0(a1) +; RV64I-NEXT: ld a3, 8(a1) +; RV64I-NEXT: ld a4, 16(a1) +; RV64I-NEXT: ld a1, 24(a1) ; RV64I-NEXT: ld a5, 24(a0) -; RV64I-NEXT: ld a6, 0(a1) -; RV64I-NEXT: ld a7, 8(a1) -; RV64I-NEXT: ld a1, 16(a1) -; RV64I-NEXT: ld a0, 16(a0) -; RV64I-NEXT: xor a4, a5, a4 -; RV64I-NEXT: xor a3, a3, a7 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: xor a1, a2, a6 -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: ld a6, 8(a0) +; RV64I-NEXT: ld a7, 16(a0) +; RV64I-NEXT: ld a0, 0(a0) +; 
RV64I-NEXT: xor a1, a5, a1 +; RV64I-NEXT: xor a3, a6, a3 +; RV64I-NEXT: xor a4, a7, a4 +; RV64I-NEXT: xor a0, a0, a2 +; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: or a0, a0, a4 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: ret %1 = icmp eq i256 %a, %b @@ -133,15 +133,15 @@ define i64 @caller_large_scalars() nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -80 ; RV64I-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; RV64I-NEXT: li a0, 2 -; RV64I-NEXT: sd a0, 0(sp) +; RV64I-NEXT: li a2, 2 +; RV64I-NEXT: li a3, 1 +; RV64I-NEXT: addi a0, sp, 32 +; RV64I-NEXT: mv a1, sp +; RV64I-NEXT: sd a2, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) -; RV64I-NEXT: li a2, 1 -; RV64I-NEXT: addi a0, sp, 32 -; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: sd a2, 32(sp) +; RV64I-NEXT: sd a3, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) @@ -163,18 +163,18 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d, ; RV64I-NEXT: ld a0, 8(sp) ; RV64I-NEXT: ld a1, 0(a7) ; RV64I-NEXT: ld a2, 8(a7) -; RV64I-NEXT: ld a3, 24(a0) +; RV64I-NEXT: ld a3, 16(a7) ; RV64I-NEXT: ld a4, 24(a7) -; RV64I-NEXT: ld a5, 0(a0) +; RV64I-NEXT: ld a5, 24(a0) ; RV64I-NEXT: ld a6, 8(a0) -; RV64I-NEXT: ld a0, 16(a0) -; RV64I-NEXT: ld a7, 16(a7) -; RV64I-NEXT: xor a3, a4, a3 +; RV64I-NEXT: ld a7, 16(a0) +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: xor a4, a4, a5 ; RV64I-NEXT: xor a2, a2, a6 -; RV64I-NEXT: or a2, a2, a3 -; RV64I-NEXT: xor a0, a7, a0 -; RV64I-NEXT: xor a1, a1, a5 -; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: xor a3, a3, a7 +; RV64I-NEXT: xor a0, a1, a0 +; RV64I-NEXT: or a2, a2, a4 +; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: ret @@ -188,16 +188,10 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -96 ; RV64I-NEXT: sd ra, 88(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a0, sp, 16 -; RV64I-NEXT: li a1, 9 -; RV64I-NEXT: sd a1, 0(sp) -; RV64I-NEXT: sd a0, 8(sp) -; RV64I-NEXT: li a0, 10 -; RV64I-NEXT: sd a0, 16(sp) -; RV64I-NEXT: sd zero, 24(sp) -; RV64I-NEXT: sd zero, 32(sp) -; RV64I-NEXT: sd zero, 40(sp) -; RV64I-NEXT: li t0, 8 +; RV64I-NEXT: addi a7, sp, 16 +; RV64I-NEXT: li t0, 9 +; RV64I-NEXT: li t1, 10 +; RV64I-NEXT: li t2, 8 ; RV64I-NEXT: li a0, 1 ; RV64I-NEXT: li a1, 2 ; RV64I-NEXT: li a2, 3 @@ -205,8 +199,14 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind { ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: li a5, 6 ; RV64I-NEXT: li a6, 7 +; RV64I-NEXT: sd t0, 0(sp) +; RV64I-NEXT: sd a7, 8(sp) ; RV64I-NEXT: addi a7, sp, 48 -; RV64I-NEXT: sd t0, 48(sp) +; RV64I-NEXT: sd t1, 16(sp) +; RV64I-NEXT: sd zero, 24(sp) +; RV64I-NEXT: sd zero, 32(sp) +; RV64I-NEXT: sd zero, 40(sp) +; RV64I-NEXT: sd t2, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) ; RV64I-NEXT: sd zero, 64(sp) ; RV64I-NEXT: sd zero, 72(sp) @@ -356,24 +356,24 @@ define void @caller_aligned_stack() nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -64 ; RV64I-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: li a0, 12 -; RV64I-NEXT: li a1, 11 -; RV64I-NEXT: sd a1, 40(sp) -; RV64I-NEXT: sd a0, 48(sp) -; RV64I-NEXT: li a6, 10 -; RV64I-NEXT: li t0, 9 -; RV64I-NEXT: li t1, 8 +; RV64I-NEXT: li a6, 12 +; RV64I-NEXT: li a7, 11 +; RV64I-NEXT: li t0, 10 +; RV64I-NEXT: li t1, 9 +; RV64I-NEXT: li t2, 8 ; RV64I-NEXT: li a0, 1 ; RV64I-NEXT: li a1, 2 ; RV64I-NEXT: li a2, 3 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: li 
a5, 6 +; RV64I-NEXT: sd a7, 40(sp) +; RV64I-NEXT: sd a6, 48(sp) ; RV64I-NEXT: li a7, 7 -; RV64I-NEXT: sd t1, 0(sp) -; RV64I-NEXT: sd t0, 16(sp) +; RV64I-NEXT: sd t2, 0(sp) +; RV64I-NEXT: sd t1, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) -; RV64I-NEXT: sd a6, 32(sp) +; RV64I-NEXT: sd t0, 32(sp) ; RV64I-NEXT: li a6, 0 ; RV64I-NEXT: call callee_aligned_stack ; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload @@ -480,15 +480,15 @@ define void @callee_large_struct_ret(ptr noalias sret(%struct.large) %agg.result ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, 1 ; RV64I-NEXT: li a2, 2 +; RV64I-NEXT: li a3, 3 +; RV64I-NEXT: li a4, 4 ; RV64I-NEXT: sw a1, 0(a0) ; RV64I-NEXT: sw zero, 4(a0) ; RV64I-NEXT: sw a2, 8(a0) ; RV64I-NEXT: sw zero, 12(a0) -; RV64I-NEXT: li a1, 3 -; RV64I-NEXT: li a2, 4 -; RV64I-NEXT: sw a1, 16(a0) +; RV64I-NEXT: sw a3, 16(a0) ; RV64I-NEXT: sw zero, 20(a0) -; RV64I-NEXT: sw a2, 24(a0) +; RV64I-NEXT: sw a4, 24(a0) ; RV64I-NEXT: sw zero, 28(a0) ; RV64I-NEXT: ret store i64 1, ptr %agg.result, align 4 diff --git a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll index b26bd7b889807..6608874286e34 100644 --- a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll +++ b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll @@ -87,8 +87,8 @@ define signext i32 @test3(i32 signext %v, i32 signext %w, i32 signext %x, i32 si ; NOCMOV-NEXT: seqz a4, a4 ; NOCMOV-NEXT: addi a4, a4, -1 ; NOCMOV-NEXT: and a1, a1, a4 -; NOCMOV-NEXT: xor a0, a0, a1 ; NOCMOV-NEXT: and a3, a3, a4 +; NOCMOV-NEXT: xor a0, a0, a1 ; NOCMOV-NEXT: xor a2, a2, a3 ; NOCMOV-NEXT: addw a0, a0, a2 ; NOCMOV-NEXT: ret @@ -96,16 +96,16 @@ define signext i32 @test3(i32 signext %v, i32 signext %w, i32 signext %x, i32 si ; CMOV-LABEL: test3: ; CMOV: # %bb.0: ; CMOV-NEXT: xor a1, a1, a0 +; CMOV-NEXT: xor a3, a3, a2 ; CMOV-NEXT: bnez a4, .LBB2_2 ; CMOV-NEXT: # %bb.1: ; CMOV-NEXT: mv a1, a0 ; CMOV-NEXT: .LBB2_2: -; CMOV-NEXT: xor a0, a2, a3 ; CMOV-NEXT: bnez a4, .LBB2_4 ; CMOV-NEXT: # %bb.3: -; CMOV-NEXT: mv a0, a2 +; CMOV-NEXT: mv a3, a2 ; CMOV-NEXT: .LBB2_4: -; CMOV-NEXT: addw a0, a0, a1 +; CMOV-NEXT: addw a0, a1, a3 ; CMOV-NEXT: ret ; ; SHORT_FORWARD-LABEL: test3: diff --git a/llvm/test/CodeGen/RISCV/compress.ll b/llvm/test/CodeGen/RISCV/compress.ll index 8fb520fac41ee..c8803773d7630 100644 --- a/llvm/test/CodeGen/RISCV/compress.ll +++ b/llvm/test/CodeGen/RISCV/compress.ll @@ -32,9 +32,9 @@ define i32 @simple_arith(i32 %a, i32 %b) #0 { ; RV32IC-LABEL: : ; RV32IC: addi a2, a0, 0x1 +; RV32IC-NEXT: c.srai a1, 0x9 ; RV32IC-NEXT: c.andi a2, 0xb ; RV32IC-NEXT: c.slli a2, 0x7 -; RV32IC-NEXT: c.srai a1, 0x9 ; RV32IC-NEXT: sub a0, a1, a0 ; RV32IC-NEXT: c.add a0, a2 ; RV32IC-NEXT: c.jr ra diff --git a/llvm/test/CodeGen/RISCV/condbinops.ll b/llvm/test/CodeGen/RISCV/condbinops.ll index 1a661fddacfa0..dc81c13bfb6a3 100644 --- a/llvm/test/CodeGen/RISCV/condbinops.ll +++ b/llvm/test/CodeGen/RISCV/condbinops.ll @@ -453,19 +453,19 @@ define i64 @shl64(i64 %x, i64 %y, i1 %c) { ; RV32ZICOND-LABEL: shl64: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: andi a4, a4, 1 +; RV32ZICOND-NEXT: srli a3, a0, 1 ; RV32ZICOND-NEXT: czero.eqz a2, a2, a4 -; RV32ZICOND-NEXT: sll a3, a0, a2 +; RV32ZICOND-NEXT: sll a0, a0, a2 ; RV32ZICOND-NEXT: addi a4, a2, -32 -; RV32ZICOND-NEXT: slti a4, a4, 0 -; RV32ZICOND-NEXT: czero.nez a5, a3, a4 ; RV32ZICOND-NEXT: sll a1, a1, a2 ; RV32ZICOND-NEXT: not a2, a2 -; RV32ZICOND-NEXT: srli a0, a0, 1 -; RV32ZICOND-NEXT: srl a0, a0, a2 -; RV32ZICOND-NEXT: or a0, a1, a0 -; RV32ZICOND-NEXT: czero.eqz a1, a0, a4 -; RV32ZICOND-NEXT: or a1, 
a1, a5 -; RV32ZICOND-NEXT: czero.eqz a0, a3, a4 +; RV32ZICOND-NEXT: slti a4, a4, 0 +; RV32ZICOND-NEXT: srl a2, a3, a2 +; RV32ZICOND-NEXT: czero.nez a3, a0, a4 +; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: czero.eqz a1, a1, a4 +; RV32ZICOND-NEXT: or a1, a1, a3 +; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: shl64: @@ -527,22 +527,22 @@ define i64 @ashr64(i64 %x, i64 %y, i1 %c) { ; RV32ZICOND-LABEL: ashr64: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: andi a4, a4, 1 +; RV32ZICOND-NEXT: slli a3, a1, 1 +; RV32ZICOND-NEXT: srai a5, a1, 31 ; RV32ZICOND-NEXT: czero.eqz a2, a2, a4 -; RV32ZICOND-NEXT: sra a3, a1, a2 +; RV32ZICOND-NEXT: sra a1, a1, a2 ; RV32ZICOND-NEXT: addi a4, a2, -32 -; RV32ZICOND-NEXT: slti a4, a4, 0 -; RV32ZICOND-NEXT: czero.nez a5, a3, a4 ; RV32ZICOND-NEXT: srl a0, a0, a2 ; RV32ZICOND-NEXT: not a2, a2 -; RV32ZICOND-NEXT: slli a6, a1, 1 -; RV32ZICOND-NEXT: sll a2, a6, a2 +; RV32ZICOND-NEXT: slti a4, a4, 0 +; RV32ZICOND-NEXT: sll a2, a3, a2 +; RV32ZICOND-NEXT: czero.nez a3, a1, a4 ; RV32ZICOND-NEXT: or a0, a0, a2 +; RV32ZICOND-NEXT: czero.eqz a1, a1, a4 +; RV32ZICOND-NEXT: czero.nez a2, a5, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 -; RV32ZICOND-NEXT: or a0, a0, a5 -; RV32ZICOND-NEXT: czero.eqz a2, a3, a4 -; RV32ZICOND-NEXT: srai a1, a1, 31 -; RV32ZICOND-NEXT: czero.nez a1, a1, a4 -; RV32ZICOND-NEXT: or a1, a2, a1 +; RV32ZICOND-NEXT: or a0, a0, a3 +; RV32ZICOND-NEXT: or a1, a1, a2 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: ashr64: @@ -604,19 +604,19 @@ define i64 @lshr64(i64 %x, i64 %y, i1 %c) { ; RV32ZICOND-LABEL: lshr64: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: andi a4, a4, 1 +; RV32ZICOND-NEXT: slli a3, a1, 1 ; RV32ZICOND-NEXT: czero.eqz a2, a2, a4 -; RV32ZICOND-NEXT: srl a3, a1, a2 +; RV32ZICOND-NEXT: srl a1, a1, a2 ; RV32ZICOND-NEXT: addi a4, a2, -32 -; RV32ZICOND-NEXT: slti a4, a4, 0 -; RV32ZICOND-NEXT: czero.nez a5, a3, a4 ; RV32ZICOND-NEXT: srl a0, a0, a2 ; RV32ZICOND-NEXT: not a2, a2 -; RV32ZICOND-NEXT: slli a1, a1, 1 -; RV32ZICOND-NEXT: sll a1, a1, a2 -; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: slti a4, a4, 0 +; RV32ZICOND-NEXT: sll a2, a3, a2 +; RV32ZICOND-NEXT: czero.nez a3, a1, a4 +; RV32ZICOND-NEXT: or a0, a0, a2 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 -; RV32ZICOND-NEXT: or a0, a0, a5 -; RV32ZICOND-NEXT: czero.eqz a1, a3, a4 +; RV32ZICOND-NEXT: or a0, a0, a3 +; RV32ZICOND-NEXT: czero.eqz a1, a1, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: lshr64: @@ -636,10 +636,10 @@ define i64 @sub64(i64 %x, i64 %y, i1 %c) { ; RV32I-NEXT: slli a4, a4, 31 ; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: and a2, a4, a2 -; RV32I-NEXT: sltu a5, a0, a2 ; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: sltu a4, a0, a2 ; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: sub a1, a1, a5 +; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret ; @@ -669,10 +669,10 @@ define i64 @sub64(i64 %x, i64 %y, i1 %c) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: andi a4, a4, 1 ; RV32ZICOND-NEXT: czero.eqz a2, a2, a4 -; RV32ZICOND-NEXT: sltu a5, a0, a2 ; RV32ZICOND-NEXT: czero.eqz a3, a3, a4 +; RV32ZICOND-NEXT: sltu a4, a0, a2 ; RV32ZICOND-NEXT: sub a1, a1, a3 -; RV32ZICOND-NEXT: sub a1, a1, a5 +; RV32ZICOND-NEXT: sub a1, a1, a4 ; RV32ZICOND-NEXT: sub a0, a0, a2 ; RV32ZICOND-NEXT: ret ; @@ -728,8 +728,8 @@ define i64 @and64(i64 %x, i64 %y, i1 %c) { ; RV32ZICOND-NEXT: and a3, a1, a3 ; RV32ZICOND-NEXT: and a2, a0, a2 ; RV32ZICOND-NEXT: czero.nez a0, a0, a4 -; RV32ZICOND-NEXT: or a0, a2, a0 ; RV32ZICOND-NEXT: czero.nez a1, a1, a4 +; 
RV32ZICOND-NEXT: or a0, a2, a0 ; RV32ZICOND-NEXT: or a1, a3, a1 ; RV32ZICOND-NEXT: ret ; @@ -752,8 +752,8 @@ define i64 @add64(i64 %x, i64 %y, i1 %c) { ; RV32I-NEXT: slli a4, a4, 31 ; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: and a3, a4, a3 -; RV32I-NEXT: add a1, a1, a3 ; RV32I-NEXT: and a2, a4, a2 +; RV32I-NEXT: add a1, a1, a3 ; RV32I-NEXT: add a2, a0, a2 ; RV32I-NEXT: sltu a0, a2, a0 ; RV32I-NEXT: add a1, a1, a0 @@ -786,8 +786,8 @@ define i64 @add64(i64 %x, i64 %y, i1 %c) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: andi a4, a4, 1 ; RV32ZICOND-NEXT: czero.eqz a3, a3, a4 -; RV32ZICOND-NEXT: add a1, a1, a3 ; RV32ZICOND-NEXT: czero.eqz a2, a2, a4 +; RV32ZICOND-NEXT: add a1, a1, a3 ; RV32ZICOND-NEXT: add a2, a0, a2 ; RV32ZICOND-NEXT: sltu a0, a2, a0 ; RV32ZICOND-NEXT: add a1, a1, a0 @@ -812,8 +812,8 @@ define i64 @or64(i64 %x, i64 %y, i1 %c) { ; RV32I-NEXT: slli a4, a4, 31 ; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: and a2, a4, a2 -; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: ret ; @@ -843,9 +843,9 @@ define i64 @or64(i64 %x, i64 %y, i1 %c) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: andi a4, a4, 1 ; RV32ZICOND-NEXT: czero.eqz a2, a2, a4 +; RV32ZICOND-NEXT: czero.eqz a3, a3, a4 ; RV32ZICOND-NEXT: or a0, a0, a2 -; RV32ZICOND-NEXT: czero.eqz a2, a3, a4 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a1, a1, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: or64: @@ -865,8 +865,8 @@ define i64 @xor64(i64 %x, i64 %y, i1 %c) { ; RV32I-NEXT: slli a4, a4, 31 ; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: and a2, a4, a2 -; RV32I-NEXT: xor a0, a0, a2 ; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: xor a0, a0, a2 ; RV32I-NEXT: xor a1, a1, a3 ; RV32I-NEXT: ret ; @@ -896,9 +896,9 @@ define i64 @xor64(i64 %x, i64 %y, i1 %c) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: andi a4, a4, 1 ; RV32ZICOND-NEXT: czero.eqz a2, a2, a4 +; RV32ZICOND-NEXT: czero.eqz a3, a3, a4 ; RV32ZICOND-NEXT: xor a0, a0, a2 -; RV32ZICOND-NEXT: czero.eqz a2, a3, a4 -; RV32ZICOND-NEXT: xor a1, a1, a2 +; RV32ZICOND-NEXT: xor a1, a1, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: xor64: diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll index 622365cf13bce..6c2ba493ffcd5 100644 --- a/llvm/test/CodeGen/RISCV/condops.ll +++ b/llvm/test/CodeGen/RISCV/condops.ll @@ -208,8 +208,8 @@ define i64 @add1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: and a4, a0, a4 -; RV32I-NEXT: add a2, a2, a4 ; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: add a2, a2, a4 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: sltu a1, a0, a1 ; RV32I-NEXT: add a1, a2, a1 @@ -225,8 +225,8 @@ define i64 @add1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: add1: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a4, a0 -; RV32XVENTANACONDOPS-NEXT: add a2, a2, a4 ; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a3, a0 +; RV32XVENTANACONDOPS-NEXT: add a2, a2, a4 ; RV32XVENTANACONDOPS-NEXT: add a0, a1, a0 ; RV32XVENTANACONDOPS-NEXT: sltu a1, a0, a1 ; RV32XVENTANACONDOPS-NEXT: add a1, a2, a1 @@ -247,8 +247,8 @@ define i64 @add1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: add1: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.eqz a4, a4, a0 -; RV32ZICOND-NEXT: add a2, a2, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a3, a0 +; RV32ZICOND-NEXT: add a2, a2, a4 ; RV32ZICOND-NEXT: add a0, a1, a0 ; RV32ZICOND-NEXT: sltu a1, a0, a1 ; RV32ZICOND-NEXT: add a1, a2, a1 @@ -269,11 +269,11 @@ define 
i64 @add2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: and a2, a0, a2 -; RV32I-NEXT: add a2, a4, a2 ; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: add a1, a4, a2 ; RV32I-NEXT: add a0, a3, a0 -; RV32I-NEXT: sltu a1, a0, a3 -; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: sltu a2, a0, a3 +; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: add2: @@ -286,11 +286,11 @@ define i64 @add2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: add2: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a0 -; RV32XVENTANACONDOPS-NEXT: add a2, a4, a2 ; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a1, a0 +; RV32XVENTANACONDOPS-NEXT: add a1, a4, a2 ; RV32XVENTANACONDOPS-NEXT: add a0, a3, a0 -; RV32XVENTANACONDOPS-NEXT: sltu a1, a0, a3 -; RV32XVENTANACONDOPS-NEXT: add a1, a2, a1 +; RV32XVENTANACONDOPS-NEXT: sltu a2, a0, a3 +; RV32XVENTANACONDOPS-NEXT: add a1, a1, a2 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: add2: @@ -308,11 +308,11 @@ define i64 @add2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: add2: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.eqz a2, a2, a0 -; RV32ZICOND-NEXT: add a2, a4, a2 ; RV32ZICOND-NEXT: czero.eqz a0, a1, a0 +; RV32ZICOND-NEXT: add a1, a4, a2 ; RV32ZICOND-NEXT: add a0, a3, a0 -; RV32ZICOND-NEXT: sltu a1, a0, a3 -; RV32ZICOND-NEXT: add a1, a2, a1 +; RV32ZICOND-NEXT: sltu a2, a0, a3 +; RV32ZICOND-NEXT: add a1, a1, a2 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: add2: @@ -330,8 +330,8 @@ define i64 @add3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: addi a0, a0, -1 ; RV32I-NEXT: and a4, a0, a4 -; RV32I-NEXT: add a2, a2, a4 ; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: add a2, a2, a4 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: sltu a1, a0, a1 ; RV32I-NEXT: add a1, a2, a1 @@ -347,8 +347,8 @@ define i64 @add3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: add3: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a4, a0 -; RV32XVENTANACONDOPS-NEXT: add a2, a2, a4 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a3, a0 +; RV32XVENTANACONDOPS-NEXT: add a2, a2, a4 ; RV32XVENTANACONDOPS-NEXT: add a0, a1, a0 ; RV32XVENTANACONDOPS-NEXT: sltu a1, a0, a1 ; RV32XVENTANACONDOPS-NEXT: add a1, a2, a1 @@ -369,8 +369,8 @@ define i64 @add3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: add3: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.nez a4, a4, a0 -; RV32ZICOND-NEXT: add a2, a2, a4 ; RV32ZICOND-NEXT: czero.nez a0, a3, a0 +; RV32ZICOND-NEXT: add a2, a2, a4 ; RV32ZICOND-NEXT: add a0, a1, a0 ; RV32ZICOND-NEXT: sltu a1, a0, a1 ; RV32ZICOND-NEXT: add a1, a2, a1 @@ -391,11 +391,11 @@ define i64 @add4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: addi a0, a0, -1 ; RV32I-NEXT: and a2, a0, a2 -; RV32I-NEXT: add a2, a4, a2 ; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: add a1, a4, a2 ; RV32I-NEXT: add a0, a3, a0 -; RV32I-NEXT: sltu a1, a0, a3 -; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: sltu a2, a0, a3 +; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: add4: @@ -408,11 +408,11 @@ define i64 @add4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: add4: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a0 -; RV32XVENTANACONDOPS-NEXT: add a2, a4, a2 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a1, a0 +; RV32XVENTANACONDOPS-NEXT: add a1, a4, a2 ; RV32XVENTANACONDOPS-NEXT: add a0, a3, a0 -; RV32XVENTANACONDOPS-NEXT: sltu a1, a0, a3 -; 
RV32XVENTANACONDOPS-NEXT: add a1, a2, a1 +; RV32XVENTANACONDOPS-NEXT: sltu a2, a0, a3 +; RV32XVENTANACONDOPS-NEXT: add a1, a1, a2 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: add4: @@ -430,11 +430,11 @@ define i64 @add4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: add4: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.nez a2, a2, a0 -; RV32ZICOND-NEXT: add a2, a4, a2 ; RV32ZICOND-NEXT: czero.nez a0, a1, a0 +; RV32ZICOND-NEXT: add a1, a4, a2 ; RV32ZICOND-NEXT: add a0, a3, a0 -; RV32ZICOND-NEXT: sltu a1, a0, a3 -; RV32ZICOND-NEXT: add a1, a2, a1 +; RV32ZICOND-NEXT: sltu a2, a0, a3 +; RV32ZICOND-NEXT: add a1, a1, a2 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: add4: @@ -452,10 +452,10 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: and a3, a0, a3 -; RV32I-NEXT: sltu a5, a1, a3 ; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: sltu a4, a1, a3 ; RV32I-NEXT: sub a2, a2, a0 -; RV32I-NEXT: sub a2, a2, a5 +; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: sub a0, a1, a3 ; RV32I-NEXT: mv a1, a2 ; RV32I-NEXT: ret @@ -470,10 +470,10 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: sub1: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a3, a0 -; RV32XVENTANACONDOPS-NEXT: sltu a5, a1, a3 ; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a0 +; RV32XVENTANACONDOPS-NEXT: sltu a4, a1, a3 ; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a0 -; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a5 +; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a4 ; RV32XVENTANACONDOPS-NEXT: sub a0, a1, a3 ; RV32XVENTANACONDOPS-NEXT: mv a1, a2 ; RV32XVENTANACONDOPS-NEXT: ret @@ -493,10 +493,10 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: sub1: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.eqz a3, a3, a0 -; RV32ZICOND-NEXT: sltu a5, a1, a3 ; RV32ZICOND-NEXT: czero.eqz a0, a4, a0 +; RV32ZICOND-NEXT: sltu a4, a1, a3 ; RV32ZICOND-NEXT: sub a2, a2, a0 -; RV32ZICOND-NEXT: sub a2, a2, a5 +; RV32ZICOND-NEXT: sub a2, a2, a4 ; RV32ZICOND-NEXT: sub a0, a1, a3 ; RV32ZICOND-NEXT: mv a1, a2 ; RV32ZICOND-NEXT: ret @@ -516,10 +516,10 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: addi a0, a0, -1 ; RV32I-NEXT: and a3, a0, a3 -; RV32I-NEXT: sltu a5, a1, a3 ; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: sltu a4, a1, a3 ; RV32I-NEXT: sub a2, a2, a0 -; RV32I-NEXT: sub a2, a2, a5 +; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: sub a0, a1, a3 ; RV32I-NEXT: mv a1, a2 ; RV32I-NEXT: ret @@ -534,10 +534,10 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: sub2: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0 -; RV32XVENTANACONDOPS-NEXT: sltu a5, a1, a3 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a4, a0 +; RV32XVENTANACONDOPS-NEXT: sltu a4, a1, a3 ; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a0 -; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a5 +; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a4 ; RV32XVENTANACONDOPS-NEXT: sub a0, a1, a3 ; RV32XVENTANACONDOPS-NEXT: mv a1, a2 ; RV32XVENTANACONDOPS-NEXT: ret @@ -557,10 +557,10 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: sub2: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.nez a3, a3, a0 -; RV32ZICOND-NEXT: sltu a5, a1, a3 ; RV32ZICOND-NEXT: czero.nez a0, a4, a0 +; RV32ZICOND-NEXT: sltu a4, a1, a3 ; RV32ZICOND-NEXT: sub a2, a2, a0 -; RV32ZICOND-NEXT: sub a2, a2, a5 +; RV32ZICOND-NEXT: sub a2, a2, a4 ; RV32ZICOND-NEXT: sub a0, a1, a3 ; 
RV32ZICOND-NEXT: mv a1, a2 ; RV32ZICOND-NEXT: ret @@ -578,11 +578,11 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: or1: ; RV32I: # %bb.0: -; RV32I-NEXT: neg a5, a0 -; RV32I-NEXT: and a0, a5, a3 -; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: and a1, a5, a4 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: and a3, a0, a3 +; RV32I-NEXT: and a4, a0, a4 +; RV32I-NEXT: or a0, a1, a3 +; RV32I-NEXT: or a1, a2, a4 ; RV32I-NEXT: ret ; ; RV64I-LABEL: or1: @@ -595,10 +595,9 @@ define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: or1: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a3, a0 -; RV32XVENTANACONDOPS-NEXT: or a3, a1, a3 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a4, a0 -; RV32XVENTANACONDOPS-NEXT: or a1, a2, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a3 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a4, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a1, a3 +; RV32XVENTANACONDOPS-NEXT: or a1, a2, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: or1: @@ -616,10 +615,9 @@ define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: or1: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.eqz a3, a3, a0 -; RV32ZICOND-NEXT: or a3, a1, a3 -; RV32ZICOND-NEXT: czero.eqz a1, a4, a0 -; RV32ZICOND-NEXT: or a1, a2, a1 -; RV32ZICOND-NEXT: mv a0, a3 +; RV32ZICOND-NEXT: czero.eqz a4, a4, a0 +; RV32ZICOND-NEXT: or a0, a1, a3 +; RV32ZICOND-NEXT: or a1, a2, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: or1: @@ -635,11 +633,11 @@ define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: or2: ; RV32I: # %bb.0: -; RV32I-NEXT: neg a5, a0 -; RV32I-NEXT: and a0, a5, a1 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: and a1, a5, a2 -; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: or a0, a3, a1 +; RV32I-NEXT: or a1, a4, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: or2: @@ -652,10 +650,9 @@ define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: or2: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a0 -; RV32XVENTANACONDOPS-NEXT: or a3, a3, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a2, a0 -; RV32XVENTANACONDOPS-NEXT: or a1, a4, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a3 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a3, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a2 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: or2: @@ -673,10 +670,9 @@ define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: or2: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.eqz a1, a1, a0 -; RV32ZICOND-NEXT: or a3, a3, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a2, a0 -; RV32ZICOND-NEXT: or a1, a4, a1 -; RV32ZICOND-NEXT: mv a0, a3 +; RV32ZICOND-NEXT: czero.eqz a2, a2, a0 +; RV32ZICOND-NEXT: or a0, a3, a1 +; RV32ZICOND-NEXT: or a1, a4, a2 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: or2: @@ -692,11 +688,11 @@ define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: or3: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a5, a0, -1 -; RV32I-NEXT: and a0, a5, a3 -; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: and a1, a5, a4 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: and a3, a0, a3 +; RV32I-NEXT: and a4, a0, a4 +; RV32I-NEXT: or a0, a1, a3 +; RV32I-NEXT: or 
a1, a2, a4 ; RV32I-NEXT: ret ; ; RV64I-LABEL: or3: @@ -709,10 +705,9 @@ define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: or3: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0 -; RV32XVENTANACONDOPS-NEXT: or a3, a1, a3 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a4, a0 -; RV32XVENTANACONDOPS-NEXT: or a1, a2, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a3 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a4, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a1, a3 +; RV32XVENTANACONDOPS-NEXT: or a1, a2, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: or3: @@ -730,10 +725,9 @@ define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: or3: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.nez a3, a3, a0 -; RV32ZICOND-NEXT: or a3, a1, a3 -; RV32ZICOND-NEXT: czero.nez a1, a4, a0 -; RV32ZICOND-NEXT: or a1, a2, a1 -; RV32ZICOND-NEXT: mv a0, a3 +; RV32ZICOND-NEXT: czero.nez a4, a4, a0 +; RV32ZICOND-NEXT: or a0, a1, a3 +; RV32ZICOND-NEXT: or a1, a2, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: or3: @@ -749,11 +743,11 @@ define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: or4: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a5, a0, -1 -; RV32I-NEXT: and a0, a5, a1 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: and a1, a5, a2 -; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: or a0, a3, a1 +; RV32I-NEXT: or a1, a4, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: or4: @@ -766,10 +760,9 @@ define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: or4: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a1, a0 -; RV32XVENTANACONDOPS-NEXT: or a3, a3, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a2, a0 -; RV32XVENTANACONDOPS-NEXT: or a1, a4, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a3 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a3, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a2 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: or4: @@ -787,10 +780,9 @@ define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: or4: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.nez a1, a1, a0 -; RV32ZICOND-NEXT: or a3, a3, a1 -; RV32ZICOND-NEXT: czero.nez a1, a2, a0 -; RV32ZICOND-NEXT: or a1, a4, a1 -; RV32ZICOND-NEXT: mv a0, a3 +; RV32ZICOND-NEXT: czero.nez a2, a2, a0 +; RV32ZICOND-NEXT: or a0, a3, a1 +; RV32ZICOND-NEXT: or a1, a4, a2 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: or4: @@ -806,11 +798,11 @@ define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: xor1: ; RV32I: # %bb.0: -; RV32I-NEXT: neg a5, a0 -; RV32I-NEXT: and a0, a5, a3 -; RV32I-NEXT: xor a0, a1, a0 -; RV32I-NEXT: and a1, a5, a4 -; RV32I-NEXT: xor a1, a2, a1 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: and a3, a0, a3 +; RV32I-NEXT: and a4, a0, a4 +; RV32I-NEXT: xor a0, a1, a3 +; RV32I-NEXT: xor a1, a2, a4 ; RV32I-NEXT: ret ; ; RV64I-LABEL: xor1: @@ -823,10 +815,9 @@ define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: xor1: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a3, a0 -; RV32XVENTANACONDOPS-NEXT: xor a3, a1, a3 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a4, a0 -; RV32XVENTANACONDOPS-NEXT: xor a1, a2, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a3 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a4, a0 +; 
RV32XVENTANACONDOPS-NEXT: xor a0, a1, a3 +; RV32XVENTANACONDOPS-NEXT: xor a1, a2, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: xor1: @@ -844,10 +835,9 @@ define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: xor1: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.eqz a3, a3, a0 -; RV32ZICOND-NEXT: xor a3, a1, a3 -; RV32ZICOND-NEXT: czero.eqz a1, a4, a0 -; RV32ZICOND-NEXT: xor a1, a2, a1 -; RV32ZICOND-NEXT: mv a0, a3 +; RV32ZICOND-NEXT: czero.eqz a4, a4, a0 +; RV32ZICOND-NEXT: xor a0, a1, a3 +; RV32ZICOND-NEXT: xor a1, a2, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: xor1: @@ -863,11 +853,11 @@ define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: xor2: ; RV32I: # %bb.0: -; RV32I-NEXT: neg a5, a0 -; RV32I-NEXT: and a0, a5, a1 -; RV32I-NEXT: xor a0, a3, a0 -; RV32I-NEXT: and a1, a5, a2 -; RV32I-NEXT: xor a1, a4, a1 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: xor a0, a3, a1 +; RV32I-NEXT: xor a1, a4, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: xor2: @@ -880,10 +870,9 @@ define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: xor2: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a0 -; RV32XVENTANACONDOPS-NEXT: xor a3, a3, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a2, a0 -; RV32XVENTANACONDOPS-NEXT: xor a1, a4, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a3 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: xor a0, a3, a1 +; RV32XVENTANACONDOPS-NEXT: xor a1, a4, a2 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: xor2: @@ -901,10 +890,9 @@ define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: xor2: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.eqz a1, a1, a0 -; RV32ZICOND-NEXT: xor a3, a3, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a2, a0 -; RV32ZICOND-NEXT: xor a1, a4, a1 -; RV32ZICOND-NEXT: mv a0, a3 +; RV32ZICOND-NEXT: czero.eqz a2, a2, a0 +; RV32ZICOND-NEXT: xor a0, a3, a1 +; RV32ZICOND-NEXT: xor a1, a4, a2 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: xor2: @@ -920,11 +908,11 @@ define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: xor3: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a5, a0, -1 -; RV32I-NEXT: and a0, a5, a3 -; RV32I-NEXT: xor a0, a1, a0 -; RV32I-NEXT: and a1, a5, a4 -; RV32I-NEXT: xor a1, a2, a1 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: and a3, a0, a3 +; RV32I-NEXT: and a4, a0, a4 +; RV32I-NEXT: xor a0, a1, a3 +; RV32I-NEXT: xor a1, a2, a4 ; RV32I-NEXT: ret ; ; RV64I-LABEL: xor3: @@ -937,10 +925,9 @@ define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: xor3: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0 -; RV32XVENTANACONDOPS-NEXT: xor a3, a1, a3 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a4, a0 -; RV32XVENTANACONDOPS-NEXT: xor a1, a2, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a3 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a4, a0 +; RV32XVENTANACONDOPS-NEXT: xor a0, a1, a3 +; RV32XVENTANACONDOPS-NEXT: xor a1, a2, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: xor3: @@ -958,10 +945,9 @@ define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: xor3: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.nez a3, a3, a0 -; RV32ZICOND-NEXT: xor a3, a1, a3 -; RV32ZICOND-NEXT: czero.nez a1, a4, a0 -; RV32ZICOND-NEXT: xor a1, a2, a1 -; RV32ZICOND-NEXT: 
mv a0, a3 +; RV32ZICOND-NEXT: czero.nez a4, a4, a0 +; RV32ZICOND-NEXT: xor a0, a1, a3 +; RV32ZICOND-NEXT: xor a1, a2, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: xor3: @@ -977,11 +963,11 @@ define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { define i64 @xor4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: xor4: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a5, a0, -1 -; RV32I-NEXT: and a0, a5, a1 -; RV32I-NEXT: xor a0, a3, a0 -; RV32I-NEXT: and a1, a5, a2 -; RV32I-NEXT: xor a1, a4, a1 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: xor a0, a3, a1 +; RV32I-NEXT: xor a1, a4, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: xor4: @@ -994,10 +980,9 @@ define i64 @xor4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: xor4: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a1, a0 -; RV32XVENTANACONDOPS-NEXT: xor a3, a3, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a2, a0 -; RV32XVENTANACONDOPS-NEXT: xor a1, a4, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a3 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: xor a0, a3, a1 +; RV32XVENTANACONDOPS-NEXT: xor a1, a4, a2 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: xor4: @@ -1015,10 +1000,9 @@ define i64 @xor4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: xor4: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.nez a1, a1, a0 -; RV32ZICOND-NEXT: xor a3, a3, a1 -; RV32ZICOND-NEXT: czero.nez a1, a2, a0 -; RV32ZICOND-NEXT: xor a1, a4, a1 -; RV32ZICOND-NEXT: mv a0, a3 +; RV32ZICOND-NEXT: czero.nez a2, a2, a0 +; RV32ZICOND-NEXT: xor a0, a3, a1 +; RV32ZICOND-NEXT: xor a1, a4, a2 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: xor4: @@ -1057,10 +1041,9 @@ define i64 @and1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-NEXT: and a4, a2, a4 ; RV32XVENTANACONDOPS-NEXT: and a3, a1, a3 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a1, a0 -; RV32XVENTANACONDOPS-NEXT: or a3, a3, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a2, a0 -; RV32XVENTANACONDOPS-NEXT: or a1, a4, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a3 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a3, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a2 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: and1: @@ -1082,10 +1065,9 @@ define i64 @and1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-NEXT: and a4, a2, a4 ; RV32ZICOND-NEXT: and a3, a1, a3 ; RV32ZICOND-NEXT: czero.nez a1, a1, a0 -; RV32ZICOND-NEXT: or a3, a3, a1 -; RV32ZICOND-NEXT: czero.nez a1, a2, a0 -; RV32ZICOND-NEXT: or a1, a4, a1 -; RV32ZICOND-NEXT: mv a0, a3 +; RV32ZICOND-NEXT: czero.nez a2, a2, a0 +; RV32ZICOND-NEXT: or a0, a3, a1 +; RV32ZICOND-NEXT: or a1, a4, a2 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: and1: @@ -1122,13 +1104,12 @@ define i64 @and2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; ; RV32XVENTANACONDOPS-LABEL: and2: ; RV32XVENTANACONDOPS: # %bb.0: -; RV32XVENTANACONDOPS-NEXT: and a5, a2, a4 +; RV32XVENTANACONDOPS-NEXT: and a2, a2, a4 ; RV32XVENTANACONDOPS-NEXT: and a1, a1, a3 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a3, a0 -; RV32XVENTANACONDOPS-NEXT: or a2, a1, a2 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a4, a0 -; RV32XVENTANACONDOPS-NEXT: or a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a2 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a4, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a1, a3 +; RV32XVENTANACONDOPS-NEXT: or a1, a2, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; 
RV64XVENTANACONDOPS-LABEL: and2: @@ -1147,13 +1128,12 @@ define i64 @and2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; ; RV32ZICOND-LABEL: and2: ; RV32ZICOND: # %bb.0: -; RV32ZICOND-NEXT: and a5, a2, a4 +; RV32ZICOND-NEXT: and a2, a2, a4 ; RV32ZICOND-NEXT: and a1, a1, a3 -; RV32ZICOND-NEXT: czero.nez a2, a3, a0 -; RV32ZICOND-NEXT: or a2, a1, a2 -; RV32ZICOND-NEXT: czero.nez a1, a4, a0 -; RV32ZICOND-NEXT: or a1, a5, a1 -; RV32ZICOND-NEXT: mv a0, a2 +; RV32ZICOND-NEXT: czero.nez a3, a3, a0 +; RV32ZICOND-NEXT: czero.nez a4, a4, a0 +; RV32ZICOND-NEXT: or a0, a1, a3 +; RV32ZICOND-NEXT: or a1, a2, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: and2: @@ -1193,10 +1173,9 @@ define i64 @and3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-NEXT: and a4, a2, a4 ; RV32XVENTANACONDOPS-NEXT: and a3, a1, a3 ; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a0 -; RV32XVENTANACONDOPS-NEXT: or a3, a3, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a2, a0 -; RV32XVENTANACONDOPS-NEXT: or a1, a4, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a3 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a3, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a2 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: and3: @@ -1218,10 +1197,9 @@ define i64 @and3(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-NEXT: and a4, a2, a4 ; RV32ZICOND-NEXT: and a3, a1, a3 ; RV32ZICOND-NEXT: czero.eqz a1, a1, a0 -; RV32ZICOND-NEXT: or a3, a3, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a2, a0 -; RV32ZICOND-NEXT: or a1, a4, a1 -; RV32ZICOND-NEXT: mv a0, a3 +; RV32ZICOND-NEXT: czero.eqz a2, a2, a0 +; RV32ZICOND-NEXT: or a0, a3, a1 +; RV32ZICOND-NEXT: or a1, a4, a2 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: and3: @@ -1258,13 +1236,12 @@ define i64 @and4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; ; RV32XVENTANACONDOPS-LABEL: and4: ; RV32XVENTANACONDOPS: # %bb.0: -; RV32XVENTANACONDOPS-NEXT: and a5, a2, a4 +; RV32XVENTANACONDOPS-NEXT: and a2, a2, a4 ; RV32XVENTANACONDOPS-NEXT: and a1, a1, a3 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a3, a0 -; RV32XVENTANACONDOPS-NEXT: or a2, a1, a2 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a4, a0 -; RV32XVENTANACONDOPS-NEXT: or a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a2 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a3, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a4, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a1, a3 +; RV32XVENTANACONDOPS-NEXT: or a1, a2, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: and4: @@ -1283,13 +1260,12 @@ define i64 @and4(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; ; RV32ZICOND-LABEL: and4: ; RV32ZICOND: # %bb.0: -; RV32ZICOND-NEXT: and a5, a2, a4 +; RV32ZICOND-NEXT: and a2, a2, a4 ; RV32ZICOND-NEXT: and a1, a1, a3 -; RV32ZICOND-NEXT: czero.eqz a2, a3, a0 -; RV32ZICOND-NEXT: or a2, a1, a2 -; RV32ZICOND-NEXT: czero.eqz a1, a4, a0 -; RV32ZICOND-NEXT: or a1, a5, a1 -; RV32ZICOND-NEXT: mv a0, a2 +; RV32ZICOND-NEXT: czero.eqz a3, a3, a0 +; RV32ZICOND-NEXT: czero.eqz a4, a4, a0 +; RV32ZICOND-NEXT: or a0, a1, a3 +; RV32ZICOND-NEXT: or a1, a2, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: and4: @@ -1328,11 +1304,10 @@ define i64 @basic(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0 ; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a0 -; RV32XVENTANACONDOPS-NEXT: or a3, a1, a3 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a4, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: mv a0, a3 +; RV32XVENTANACONDOPS-NEXT: 
vt.maskcn a4, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a1, a3 +; RV32XVENTANACONDOPS-NEXT: or a1, a2, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: basic: @@ -1352,11 +1327,10 @@ define i64 @basic(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: czero.nez a3, a3, a0 ; RV32ZICOND-NEXT: czero.eqz a1, a1, a0 -; RV32ZICOND-NEXT: or a3, a1, a3 -; RV32ZICOND-NEXT: czero.nez a1, a4, a0 -; RV32ZICOND-NEXT: czero.eqz a0, a2, a0 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: mv a0, a3 +; RV32ZICOND-NEXT: czero.nez a4, a4, a0 +; RV32ZICOND-NEXT: czero.eqz a2, a2, a0 +; RV32ZICOND-NEXT: or a0, a1, a3 +; RV32ZICOND-NEXT: or a1, a2, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: basic: @@ -1374,14 +1348,14 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: xor a1, a1, a3 ; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: or a2, a0, a1 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: beqz a2, .LBB23_2 +; RV32I-NEXT: beqz a1, .LBB23_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a6 -; RV32I-NEXT: mv a1, a7 +; RV32I-NEXT: mv a5, a7 ; RV32I-NEXT: .LBB23_2: +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: ret ; ; RV64I-LABEL: seteq: @@ -1397,13 +1371,13 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xor a1, a1, a3 ; RV32XVENTANACONDOPS-NEXT: xor a0, a0, a2 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a6, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a7, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a6, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a7, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a3 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: seteq: @@ -1425,13 +1399,13 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xor a1, a1, a3 ; RV32ZICOND-NEXT: xor a0, a0, a2 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.eqz a0, a6, a1 -; RV32ZICOND-NEXT: czero.nez a2, a4, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.eqz a2, a7, a1 -; RV32ZICOND-NEXT: czero.nez a1, a5, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a1, a6, a0 +; RV32ZICOND-NEXT: czero.nez a2, a4, a0 +; RV32ZICOND-NEXT: czero.eqz a3, a7, a0 +; RV32ZICOND-NEXT: czero.nez a4, a5, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a4, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: seteq: @@ -1451,14 +1425,14 @@ define i64 @setne(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: xor a1, a1, a3 ; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: or a2, a0, a1 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: bnez a2, .LBB24_2 +; RV32I-NEXT: bnez a1, .LBB24_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a6 -; RV32I-NEXT: mv a1, a7 +; RV32I-NEXT: mv a5, a7 ; RV32I-NEXT: .LBB24_2: +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: ret ; ; RV64I-LABEL: setne: @@ -1474,13 +1448,13 @@ define i64 @setne(i64 %a, 
i64 %b, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xor a1, a1, a3 ; RV32XVENTANACONDOPS-NEXT: xor a0, a0, a2 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a6, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a7, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a6, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a7, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a3 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setne: @@ -1502,13 +1476,13 @@ define i64 @setne(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xor a1, a1, a3 ; RV32ZICOND-NEXT: xor a0, a0, a2 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.nez a0, a6, a1 -; RV32ZICOND-NEXT: czero.eqz a2, a4, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.nez a2, a7, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a5, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.nez a1, a6, a0 +; RV32ZICOND-NEXT: czero.eqz a2, a4, a0 +; RV32ZICOND-NEXT: czero.nez a3, a7, a0 +; RV32ZICOND-NEXT: czero.eqz a4, a5, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a4, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setne: @@ -1555,16 +1529,16 @@ define i64 @setgt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3 ; RV32XVENTANACONDOPS-NEXT: slt a1, a3, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: sltu a0, a2, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a6, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a7, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a6, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a7, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a3 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setgt: @@ -1586,16 +1560,16 @@ define i64 @setgt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xor t0, a1, a3 ; RV32ZICOND-NEXT: slt a1, a3, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: sltu a0, a2, a0 +; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: czero.nez a0, a0, t0 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.nez a0, a6, a1 -; RV32ZICOND-NEXT: czero.eqz a2, a4, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.nez a2, a7, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a5, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.nez a1, a6, a0 +; RV32ZICOND-NEXT: czero.eqz a2, a4, a0 +; RV32ZICOND-NEXT: czero.nez a3, a7, a0 +; RV32ZICOND-NEXT: czero.eqz a4, a5, a0 +; RV32ZICOND-NEXT: or 
a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a4, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setgt: @@ -1642,16 +1616,16 @@ define i64 @setge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3 ; RV32XVENTANACONDOPS-NEXT: slt a1, a1, a3 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: sltu a0, a0, a2 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a6, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a7, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a6, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a7, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a3 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setge: @@ -1673,16 +1647,16 @@ define i64 @setge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xor t0, a1, a3 ; RV32ZICOND-NEXT: slt a1, a1, a3 -; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: sltu a0, a0, a2 +; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: czero.nez a0, a0, t0 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.eqz a0, a6, a1 -; RV32ZICOND-NEXT: czero.nez a2, a4, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.eqz a2, a7, a1 -; RV32ZICOND-NEXT: czero.nez a1, a5, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a1, a6, a0 +; RV32ZICOND-NEXT: czero.nez a2, a4, a0 +; RV32ZICOND-NEXT: czero.eqz a3, a7, a0 +; RV32ZICOND-NEXT: czero.nez a4, a5, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a4, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setge: @@ -1729,16 +1703,16 @@ define i64 @setlt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3 ; RV32XVENTANACONDOPS-NEXT: slt a1, a1, a3 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: sltu a0, a0, a2 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a6, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a7, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a6, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a7, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a3 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setlt: @@ -1760,16 +1734,16 @@ define i64 @setlt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xor t0, a1, a3 ; RV32ZICOND-NEXT: slt a1, a1, a3 -; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: sltu a0, a0, a2 +; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: czero.nez 
a0, a0, t0 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.nez a0, a6, a1 -; RV32ZICOND-NEXT: czero.eqz a2, a4, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.nez a2, a7, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a5, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.nez a1, a6, a0 +; RV32ZICOND-NEXT: czero.eqz a2, a4, a0 +; RV32ZICOND-NEXT: czero.nez a3, a7, a0 +; RV32ZICOND-NEXT: czero.eqz a4, a5, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a4, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setlt: @@ -1816,16 +1790,16 @@ define i64 @setle(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3 ; RV32XVENTANACONDOPS-NEXT: slt a1, a3, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: sltu a0, a2, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a6, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a7, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a6, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a7, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a3 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setle: @@ -1847,16 +1821,16 @@ define i64 @setle(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xor t0, a1, a3 ; RV32ZICOND-NEXT: slt a1, a3, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: sltu a0, a2, a0 +; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: czero.nez a0, a0, t0 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.eqz a0, a6, a1 -; RV32ZICOND-NEXT: czero.nez a2, a4, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.eqz a2, a7, a1 -; RV32ZICOND-NEXT: czero.nez a1, a5, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a1, a6, a0 +; RV32ZICOND-NEXT: czero.nez a2, a4, a0 +; RV32ZICOND-NEXT: czero.eqz a3, a7, a0 +; RV32ZICOND-NEXT: czero.nez a4, a5, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a4, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setle: @@ -1903,16 +1877,16 @@ define i64 @setugt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3 ; RV32XVENTANACONDOPS-NEXT: sltu a1, a3, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: sltu a0, a2, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a6, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a7, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a6, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a7, a0 +; 
RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a3 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setugt: @@ -1934,16 +1908,16 @@ define i64 @setugt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xor t0, a1, a3 ; RV32ZICOND-NEXT: sltu a1, a3, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: sltu a0, a2, a0 +; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: czero.nez a0, a0, t0 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.nez a0, a6, a1 -; RV32ZICOND-NEXT: czero.eqz a2, a4, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.nez a2, a7, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a5, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.nez a1, a6, a0 +; RV32ZICOND-NEXT: czero.eqz a2, a4, a0 +; RV32ZICOND-NEXT: czero.nez a3, a7, a0 +; RV32ZICOND-NEXT: czero.eqz a4, a5, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a4, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setugt: @@ -1990,16 +1964,16 @@ define i64 @setuge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3 ; RV32XVENTANACONDOPS-NEXT: sltu a1, a1, a3 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: sltu a0, a0, a2 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a6, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a7, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a6, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a7, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a3 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setuge: @@ -2021,16 +1995,16 @@ define i64 @setuge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xor t0, a1, a3 ; RV32ZICOND-NEXT: sltu a1, a1, a3 -; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: sltu a0, a0, a2 +; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: czero.nez a0, a0, t0 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.eqz a0, a6, a1 -; RV32ZICOND-NEXT: czero.nez a2, a4, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.eqz a2, a7, a1 -; RV32ZICOND-NEXT: czero.nez a1, a5, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a1, a6, a0 +; RV32ZICOND-NEXT: czero.nez a2, a4, a0 +; RV32ZICOND-NEXT: czero.eqz a3, a7, a0 +; RV32ZICOND-NEXT: czero.nez a4, a5, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a4, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setuge: @@ -2077,16 +2051,16 @@ define i64 @setult(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3 ; RV32XVENTANACONDOPS-NEXT: sltu a1, a1, a3 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: sltu a0, a0, a2 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0 -; 
RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a6, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a7, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a6, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a7, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a3 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setult: @@ -2108,16 +2082,16 @@ define i64 @setult(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xor t0, a1, a3 ; RV32ZICOND-NEXT: sltu a1, a1, a3 -; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: sltu a0, a0, a2 +; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: czero.nez a0, a0, t0 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.nez a0, a6, a1 -; RV32ZICOND-NEXT: czero.eqz a2, a4, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.nez a2, a7, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a5, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.nez a1, a6, a0 +; RV32ZICOND-NEXT: czero.eqz a2, a4, a0 +; RV32ZICOND-NEXT: czero.nez a3, a7, a0 +; RV32ZICOND-NEXT: czero.eqz a4, a5, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a4, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setult: @@ -2164,16 +2138,16 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3 ; RV32XVENTANACONDOPS-NEXT: sltu a1, a3, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: sltu a0, a2, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a6, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a7, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a6, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a7, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a4, a3 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setule: @@ -2195,16 +2169,16 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xor t0, a1, a3 ; RV32ZICOND-NEXT: sltu a1, a3, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: sltu a0, a2, a0 +; RV32ZICOND-NEXT: czero.eqz a1, a1, t0 ; RV32ZICOND-NEXT: czero.nez a0, a0, t0 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.eqz a0, a6, a1 -; RV32ZICOND-NEXT: czero.nez a2, a4, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.eqz a2, a7, a1 -; RV32ZICOND-NEXT: czero.nez a1, a5, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a1, a6, a0 +; RV32ZICOND-NEXT: czero.nez a2, a4, a0 +; RV32ZICOND-NEXT: czero.eqz a3, a7, a0 +; RV32ZICOND-NEXT: czero.nez a4, a5, a0 +; 
RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a4, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setule: @@ -2222,14 +2196,14 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: seteq_zero: ; RV32I: # %bb.0: -; RV32I-NEXT: or a6, a0, a1 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: beqz a6, .LBB33_2 +; RV32I-NEXT: beqz a1, .LBB33_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB33_2: +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64I-LABEL: seteq_zero: @@ -2243,13 +2217,13 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) { ; ; RV32XVENTANACONDOPS-LABEL: seteq_zero: ; RV32XVENTANACONDOPS: # %bb.0: -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a5, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a3, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: seteq_zero: @@ -2267,13 +2241,13 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) { ; ; RV32ZICOND-LABEL: seteq_zero: ; RV32ZICOND: # %bb.0: -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.eqz a0, a4, a1 -; RV32ZICOND-NEXT: czero.nez a2, a2, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.eqz a2, a5, a1 -; RV32ZICOND-NEXT: czero.nez a1, a3, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a1, a4, a0 +; RV32ZICOND-NEXT: czero.nez a2, a2, a0 +; RV32ZICOND-NEXT: czero.eqz a4, a5, a0 +; RV32ZICOND-NEXT: czero.nez a3, a3, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a3, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: seteq_zero: @@ -2290,14 +2264,14 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) { define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: setne_zero: ; RV32I: # %bb.0: -; RV32I-NEXT: or a6, a0, a1 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a6, .LBB34_2 +; RV32I-NEXT: bnez a1, .LBB34_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB34_2: +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64I-LABEL: setne_zero: @@ -2311,13 +2285,13 @@ define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) { ; ; RV32XVENTANACONDOPS-LABEL: setne_zero: ; RV32XVENTANACONDOPS: # %bb.0: -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a4, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a5, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a3, a0 +; 
RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a3, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setne_zero: @@ -2335,13 +2309,13 @@ define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) { ; ; RV32ZICOND-LABEL: setne_zero: ; RV32ZICOND: # %bb.0: -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.nez a0, a4, a1 -; RV32ZICOND-NEXT: czero.eqz a2, a2, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.nez a2, a5, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a3, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.nez a1, a4, a0 +; RV32ZICOND-NEXT: czero.eqz a2, a2, a0 +; RV32ZICOND-NEXT: czero.nez a4, a5, a0 +; RV32ZICOND-NEXT: czero.eqz a3, a3, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a3, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setne_zero: @@ -2359,14 +2333,14 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: seteq_constant: ; RV32I: # %bb.0: ; RV32I-NEXT: xori a0, a0, 123 -; RV32I-NEXT: or a6, a0, a1 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: beqz a6, .LBB35_2 +; RV32I-NEXT: beqz a1, .LBB35_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB35_2: +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64I-LABEL: seteq_constant: @@ -2382,13 +2356,13 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: seteq_constant: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xori a0, a0, 123 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a5, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a3, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: seteq_constant: @@ -2409,13 +2383,13 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: seteq_constant: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xori a0, a0, 123 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.eqz a0, a4, a1 -; RV32ZICOND-NEXT: czero.nez a2, a2, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.eqz a2, a5, a1 -; RV32ZICOND-NEXT: czero.nez a1, a3, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a1, a4, a0 +; RV32ZICOND-NEXT: czero.nez a2, a2, a0 +; RV32ZICOND-NEXT: czero.eqz a4, a5, a0 +; RV32ZICOND-NEXT: czero.nez a3, a3, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a3, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: seteq_constant: @@ -2434,14 +2408,14 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: setne_constant: ; RV32I: # %bb.0: ; RV32I-NEXT: xori a0, a0, 456 -; RV32I-NEXT: or a6, a0, a1 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a6, .LBB36_2 +; RV32I-NEXT: bnez a1, .LBB36_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a3, a5 ; 
RV32I-NEXT: .LBB36_2: +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64I-LABEL: setne_constant: @@ -2457,13 +2431,13 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: setne_constant: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: xori a0, a0, 456 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a4, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a5, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a3, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a3, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setne_constant: @@ -2484,13 +2458,13 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: setne_constant: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: xori a0, a0, 456 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.nez a0, a4, a1 -; RV32ZICOND-NEXT: czero.eqz a2, a2, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.nez a2, a5, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a3, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.nez a1, a4, a0 +; RV32ZICOND-NEXT: czero.eqz a2, a2, a0 +; RV32ZICOND-NEXT: czero.nez a4, a5, a0 +; RV32ZICOND-NEXT: czero.eqz a3, a3, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a3, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setne_constant: @@ -2509,14 +2483,14 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: seteq_2048: ; RV32I: # %bb.0: ; RV32I-NEXT: binvi a0, a0, 11 -; RV32I-NEXT: or a6, a0, a1 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: beqz a6, .LBB37_2 +; RV32I-NEXT: beqz a1, .LBB37_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB37_2: +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64I-LABEL: seteq_2048: @@ -2532,13 +2506,13 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-LABEL: seteq_2048: ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: binvi a0, a0, 11 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a5, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a3, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: seteq_2048: @@ -2559,13 +2533,13 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32ZICOND-LABEL: seteq_2048: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: binvi a0, a0, 11 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.eqz a0, a4, a1 -; RV32ZICOND-NEXT: czero.nez a2, a2, a1 -; RV32ZICOND-NEXT: or 
a0, a2, a0 -; RV32ZICOND-NEXT: czero.eqz a2, a5, a1 -; RV32ZICOND-NEXT: czero.nez a1, a3, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a1, a4, a0 +; RV32ZICOND-NEXT: czero.nez a2, a2, a0 +; RV32ZICOND-NEXT: czero.eqz a4, a5, a0 +; RV32ZICOND-NEXT: czero.nez a3, a3, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a3, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: seteq_2048: @@ -2585,14 +2559,14 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: not a1, a1 ; RV32I-NEXT: xori a0, a0, -2048 -; RV32I-NEXT: or a6, a0, a1 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: beqz a6, .LBB38_2 +; RV32I-NEXT: beqz a1, .LBB38_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB38_2: +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64I-LABEL: seteq_neg2048: @@ -2609,13 +2583,13 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: not a1, a1 ; RV32XVENTANACONDOPS-NEXT: xori a0, a0, -2048 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a1 -; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a5, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a3, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: seteq_neg2048: @@ -2637,13 +2611,13 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: not a1, a1 ; RV32ZICOND-NEXT: xori a0, a0, -2048 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.eqz a0, a4, a1 -; RV32ZICOND-NEXT: czero.nez a2, a2, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.eqz a2, a5, a1 -; RV32ZICOND-NEXT: czero.nez a1, a3, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a1, a4, a0 +; RV32ZICOND-NEXT: czero.nez a2, a2, a0 +; RV32ZICOND-NEXT: czero.eqz a4, a5, a0 +; RV32ZICOND-NEXT: czero.nez a3, a3, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a3, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: seteq_neg2048: @@ -2663,14 +2637,14 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: not a1, a1 ; RV32I-NEXT: xori a0, a0, -2048 -; RV32I-NEXT: or a6, a0, a1 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a6, .LBB39_2 +; RV32I-NEXT: bnez a1, .LBB39_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB39_2: +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64I-LABEL: setne_neg2048: @@ -2687,13 +2661,13 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS: # %bb.0: ; RV32XVENTANACONDOPS-NEXT: not a1, a1 ; RV32XVENTANACONDOPS-NEXT: xori a0, a0, -2048 -; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a4, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a1 -; 
RV32XVENTANACONDOPS-NEXT: or a0, a2, a0 -; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a5, a1 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a4, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a5, a0 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a3, a0 +; RV32XVENTANACONDOPS-NEXT: or a0, a2, a1 +; RV32XVENTANACONDOPS-NEXT: or a1, a3, a4 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setne_neg2048: @@ -2715,13 +2689,13 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: not a1, a1 ; RV32ZICOND-NEXT: xori a0, a0, -2048 -; RV32ZICOND-NEXT: or a1, a0, a1 -; RV32ZICOND-NEXT: czero.nez a0, a4, a1 -; RV32ZICOND-NEXT: czero.eqz a2, a2, a1 -; RV32ZICOND-NEXT: or a0, a2, a0 -; RV32ZICOND-NEXT: czero.nez a2, a5, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a3, a1 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.nez a1, a4, a0 +; RV32ZICOND-NEXT: czero.eqz a2, a2, a0 +; RV32ZICOND-NEXT: czero.nez a4, a5, a0 +; RV32ZICOND-NEXT: czero.eqz a3, a3, a0 +; RV32ZICOND-NEXT: or a0, a2, a1 +; RV32ZICOND-NEXT: or a1, a3, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setne_neg2048: @@ -4097,10 +4071,10 @@ define i64 @setune_64(float %a, float %b, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-NEXT: feq.s a4, fa0, fa1 ; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a4 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, a4 -; RV32XVENTANACONDOPS-NEXT: or a0, a0, a2 -; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a3, a4 +; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a3, a4 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a1, a4 -; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2 +; RV32XVENTANACONDOPS-NEXT: or a0, a0, a2 +; RV32XVENTANACONDOPS-NEXT: or a1, a1, a3 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: setune_64: @@ -4122,10 +4096,10 @@ define i64 @setune_64(float %a, float %b, i64 %rs1, i64 %rs2) { ; RV32ZICOND-NEXT: feq.s a4, fa0, fa1 ; RV32ZICOND-NEXT: czero.eqz a2, a2, a4 ; RV32ZICOND-NEXT: czero.nez a0, a0, a4 -; RV32ZICOND-NEXT: or a0, a0, a2 -; RV32ZICOND-NEXT: czero.eqz a2, a3, a4 +; RV32ZICOND-NEXT: czero.eqz a3, a3, a4 ; RV32ZICOND-NEXT: czero.nez a1, a1, a4 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a2 +; RV32ZICOND-NEXT: or a1, a1, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: setune_64: diff --git a/llvm/test/CodeGen/RISCV/copysign-casts.ll b/llvm/test/CodeGen/RISCV/copysign-casts.ll index 3b376626a783d..53de36f1699a9 100644 --- a/llvm/test/CodeGen/RISCV/copysign-casts.ll +++ b/llvm/test/CodeGen/RISCV/copysign-casts.ll @@ -45,8 +45,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind { ; RV32I-LABEL: fold_promote_d_s: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a3, 524288 -; RV32I-NEXT: and a2, a2, a3 ; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: and a2, a2, a3 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: ret @@ -54,9 +54,9 @@ define double @fold_promote_d_s(double %a, float %b) nounwind { ; RV64I-LABEL: fold_promote_d_s: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 524288 +; RV64I-NEXT: slli a0, a0, 1 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: slli a0, a0, 1 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -65,8 +65,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind { ; RV32IF: # %bb.0: ; RV32IF-NEXT: fmv.x.w a2, fa0 ; 
RV32IF-NEXT: lui a3, 524288 -; RV32IF-NEXT: and a2, a2, a3 ; RV32IF-NEXT: slli a1, a1, 1 +; RV32IF-NEXT: and a2, a2, a3 ; RV32IF-NEXT: srli a1, a1, 1 ; RV32IF-NEXT: or a1, a1, a2 ; RV32IF-NEXT: ret @@ -87,8 +87,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind { ; RV32IFZFH: # %bb.0: ; RV32IFZFH-NEXT: fmv.x.w a2, fa0 ; RV32IFZFH-NEXT: lui a3, 524288 -; RV32IFZFH-NEXT: and a2, a2, a3 ; RV32IFZFH-NEXT: slli a1, a1, 1 +; RV32IFZFH-NEXT: and a2, a2, a3 ; RV32IFZFH-NEXT: srli a1, a1, 1 ; RV32IFZFH-NEXT: or a1, a1, a2 ; RV32IFZFH-NEXT: ret @@ -109,8 +109,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind { ; RV32IFZFHMIN: # %bb.0: ; RV32IFZFHMIN-NEXT: fmv.x.w a2, fa0 ; RV32IFZFHMIN-NEXT: lui a3, 524288 -; RV32IFZFHMIN-NEXT: and a2, a2, a3 ; RV32IFZFHMIN-NEXT: slli a1, a1, 1 +; RV32IFZFHMIN-NEXT: and a2, a2, a3 ; RV32IFZFHMIN-NEXT: srli a1, a1, 1 ; RV32IFZFHMIN-NEXT: or a1, a1, a2 ; RV32IFZFHMIN-NEXT: ret @@ -147,9 +147,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind { ; RV32I-LABEL: fold_promote_d_h: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a3, 8 +; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: and a2, a2, a3 ; RV32I-NEXT: slli a2, a2, 16 -; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: ret @@ -157,9 +157,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind { ; RV64I-LABEL: fold_promote_d_h: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 8 +; RV64I-NEXT: slli a0, a0, 1 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a1, a1, 48 -; RV64I-NEXT: slli a0, a0, 1 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -168,9 +168,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind { ; RV32IF: # %bb.0: ; RV32IF-NEXT: fmv.x.w a2, fa0 ; RV32IF-NEXT: lui a3, 8 +; RV32IF-NEXT: slli a1, a1, 1 ; RV32IF-NEXT: and a2, a2, a3 ; RV32IF-NEXT: slli a2, a2, 16 -; RV32IF-NEXT: slli a1, a1, 1 ; RV32IF-NEXT: srli a1, a1, 1 ; RV32IF-NEXT: or a1, a1, a2 ; RV32IF-NEXT: ret @@ -209,9 +209,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind { ; RV32IFZFH: # %bb.0: ; RV32IFZFH-NEXT: fmv.x.h a2, fa0 ; RV32IFZFH-NEXT: lui a3, 8 +; RV32IFZFH-NEXT: slli a1, a1, 1 ; RV32IFZFH-NEXT: and a2, a2, a3 ; RV32IFZFH-NEXT: slli a2, a2, 16 -; RV32IFZFH-NEXT: slli a1, a1, 1 ; RV32IFZFH-NEXT: srli a1, a1, 1 ; RV32IFZFH-NEXT: or a1, a1, a2 ; RV32IFZFH-NEXT: ret @@ -232,9 +232,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind { ; RV32IFZFHMIN: # %bb.0: ; RV32IFZFHMIN-NEXT: fmv.x.h a2, fa0 ; RV32IFZFHMIN-NEXT: lui a3, 8 +; RV32IFZFHMIN-NEXT: slli a1, a1, 1 ; RV32IFZFHMIN-NEXT: and a2, a2, a3 ; RV32IFZFHMIN-NEXT: slli a2, a2, 16 -; RV32IFZFHMIN-NEXT: slli a1, a1, 1 ; RV32IFZFHMIN-NEXT: srli a1, a1, 1 ; RV32IFZFHMIN-NEXT: or a1, a1, a2 ; RV32IFZFHMIN-NEXT: ret @@ -292,9 +292,9 @@ define float @fold_promote_f_h(float %a, half %b) nounwind { ; RV32I-LABEL: fold_promote_f_h: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 8 +; RV32I-NEXT: slli a0, a0, 1 ; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: slli a0, a0, 1 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret @@ -302,9 +302,9 @@ define float @fold_promote_f_h(float %a, half %b) nounwind { ; RV64I-LABEL: fold_promote_f_h: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 8 +; RV64I-NEXT: slli a0, a0, 33 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slliw a1, a1, 16 -; RV64I-NEXT: slli a0, a0, 33 ; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -423,8 +423,8 @@ define float 
@fold_demote_s_d(float %a, double %b) nounwind { ; RV32I-LABEL: fold_demote_s_d: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: and a1, a2, a1 ; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: and a1, a2, a1 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret @@ -432,8 +432,8 @@ define float @fold_demote_s_d(float %a, double %b) nounwind { ; RV64I-LABEL: fold_demote_s_d: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 33 -; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: srli a1, a1, 63 +; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: slli a1, a1, 63 ; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a1 @@ -515,9 +515,9 @@ define half @fold_demote_h_s(half %a, float %b) nounwind { ; RV32I-LABEL: fold_demote_h_s: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 524288 +; RV32I-NEXT: slli a0, a0, 17 ; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: srli a1, a1, 16 -; RV32I-NEXT: slli a0, a0, 17 ; RV32I-NEXT: srli a0, a0, 17 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret @@ -525,8 +525,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind { ; RV64I-LABEL: fold_demote_h_s: ; RV64I: # %bb.0: ; RV64I-NEXT: srliw a1, a1, 31 -; RV64I-NEXT: slli a1, a1, 15 ; RV64I-NEXT: slli a0, a0, 49 +; RV64I-NEXT: slli a1, a1, 15 ; RV64I-NEXT: srli a0, a0, 49 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -537,8 +537,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind { ; RV32IF-NEXT: fmv.x.w a1, fa1 ; RV32IF-NEXT: lui a2, 524288 ; RV32IF-NEXT: and a1, a1, a2 -; RV32IF-NEXT: srli a1, a1, 16 ; RV32IF-NEXT: slli a0, a0, 17 +; RV32IF-NEXT: srli a1, a1, 16 ; RV32IF-NEXT: srli a0, a0, 17 ; RV32IF-NEXT: or a0, a0, a1 ; RV32IF-NEXT: lui a1, 1048560 @@ -552,8 +552,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind { ; RV32IFD-NEXT: fmv.x.w a1, fa1 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: and a1, a1, a2 -; RV32IFD-NEXT: srli a1, a1, 16 ; RV32IFD-NEXT: slli a0, a0, 17 +; RV32IFD-NEXT: srli a1, a1, 16 ; RV32IFD-NEXT: srli a0, a0, 17 ; RV32IFD-NEXT: or a0, a0, a1 ; RV32IFD-NEXT: lui a1, 1048560 @@ -567,8 +567,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind { ; RV64IFD-NEXT: fmv.x.w a1, fa1 ; RV64IFD-NEXT: lui a2, 524288 ; RV64IFD-NEXT: and a1, a1, a2 -; RV64IFD-NEXT: srli a1, a1, 16 ; RV64IFD-NEXT: slli a0, a0, 49 +; RV64IFD-NEXT: srli a1, a1, 16 ; RV64IFD-NEXT: srli a0, a0, 49 ; RV64IFD-NEXT: or a0, a0, a1 ; RV64IFD-NEXT: lui a1, 1048560 @@ -597,10 +597,10 @@ define half @fold_demote_h_s(half %a, float %b) nounwind { ; RV32IFZFHMIN-LABEL: fold_demote_h_s: ; RV32IFZFHMIN: # %bb.0: ; RV32IFZFHMIN-NEXT: fmv.x.w a0, fa1 -; RV32IFZFHMIN-NEXT: srli a0, a0, 31 -; RV32IFZFHMIN-NEXT: slli a0, a0, 15 ; RV32IFZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV32IFZFHMIN-NEXT: srli a0, a0, 31 ; RV32IFZFHMIN-NEXT: slli a1, a1, 17 +; RV32IFZFHMIN-NEXT: slli a0, a0, 15 ; RV32IFZFHMIN-NEXT: srli a1, a1, 17 ; RV32IFZFHMIN-NEXT: or a0, a1, a0 ; RV32IFZFHMIN-NEXT: fmv.h.x fa0, a0 @@ -609,10 +609,10 @@ define half @fold_demote_h_s(half %a, float %b) nounwind { ; RV32IFDZFHMIN-LABEL: fold_demote_h_s: ; RV32IFDZFHMIN: # %bb.0: ; RV32IFDZFHMIN-NEXT: fmv.x.w a0, fa1 -; RV32IFDZFHMIN-NEXT: srli a0, a0, 31 -; RV32IFDZFHMIN-NEXT: slli a0, a0, 15 ; RV32IFDZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV32IFDZFHMIN-NEXT: srli a0, a0, 31 ; RV32IFDZFHMIN-NEXT: slli a1, a1, 17 +; RV32IFDZFHMIN-NEXT: slli a0, a0, 15 ; RV32IFDZFHMIN-NEXT: srli a1, a1, 17 ; RV32IFDZFHMIN-NEXT: or a0, a1, a0 ; RV32IFDZFHMIN-NEXT: fmv.h.x fa0, a0 @@ -621,10 +621,10 @@ define half @fold_demote_h_s(half %a, float %b) nounwind { ; RV64IFDZFHMIN-LABEL: 
fold_demote_h_s: ; RV64IFDZFHMIN: # %bb.0: ; RV64IFDZFHMIN-NEXT: fmv.x.w a0, fa1 -; RV64IFDZFHMIN-NEXT: srli a0, a0, 31 -; RV64IFDZFHMIN-NEXT: slli a0, a0, 15 ; RV64IFDZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV64IFDZFHMIN-NEXT: srli a0, a0, 31 ; RV64IFDZFHMIN-NEXT: slli a1, a1, 49 +; RV64IFDZFHMIN-NEXT: slli a0, a0, 15 ; RV64IFDZFHMIN-NEXT: srli a1, a1, 49 ; RV64IFDZFHMIN-NEXT: or a0, a1, a0 ; RV64IFDZFHMIN-NEXT: fmv.h.x fa0, a0 @@ -635,11 +635,11 @@ define half @fold_demote_h_s(half %a, float %b) nounwind { ; RV32IZDINX-NEXT: # kill: def $x11_w killed $x11_w def $x11 ; RV32IZDINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 ; RV32IZDINX-NEXT: lui a2, 524288 -; RV32IZDINX-NEXT: and a1, a1, a2 -; RV32IZDINX-NEXT: srli a1, a1, 16 ; RV32IZDINX-NEXT: slli a0, a0, 17 -; RV32IZDINX-NEXT: srli a0, a0, 17 +; RV32IZDINX-NEXT: and a1, a1, a2 ; RV32IZDINX-NEXT: lui a2, 1048560 +; RV32IZDINX-NEXT: srli a0, a0, 17 +; RV32IZDINX-NEXT: srli a1, a1, 16 ; RV32IZDINX-NEXT: or a0, a0, a2 ; RV32IZDINX-NEXT: or a0, a0, a1 ; RV32IZDINX-NEXT: # kill: def $x10_w killed $x10_w killed $x10 @@ -650,11 +650,11 @@ define half @fold_demote_h_s(half %a, float %b) nounwind { ; RV64IZDINX-NEXT: # kill: def $x11_w killed $x11_w def $x11 ; RV64IZDINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 ; RV64IZDINX-NEXT: lui a2, 524288 -; RV64IZDINX-NEXT: and a1, a1, a2 -; RV64IZDINX-NEXT: srli a1, a1, 16 ; RV64IZDINX-NEXT: slli a0, a0, 49 -; RV64IZDINX-NEXT: srli a0, a0, 49 +; RV64IZDINX-NEXT: and a1, a1, a2 ; RV64IZDINX-NEXT: lui a2, 1048560 +; RV64IZDINX-NEXT: srli a0, a0, 49 +; RV64IZDINX-NEXT: srli a1, a1, 16 ; RV64IZDINX-NEXT: or a0, a0, a2 ; RV64IZDINX-NEXT: or a0, a0, a1 ; RV64IZDINX-NEXT: # kill: def $x10_w killed $x10_w killed $x10 @@ -668,9 +668,9 @@ define half @fold_demote_h_d(half %a, double %b) nounwind { ; RV32I-LABEL: fold_demote_h_d: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 524288 +; RV32I-NEXT: slli a0, a0, 17 ; RV32I-NEXT: and a1, a2, a1 ; RV32I-NEXT: srli a1, a1, 16 -; RV32I-NEXT: slli a0, a0, 17 ; RV32I-NEXT: srli a0, a0, 17 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret @@ -678,8 +678,8 @@ define half @fold_demote_h_d(half %a, double %b) nounwind { ; RV64I-LABEL: fold_demote_h_d: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 49 -; RV64I-NEXT: srli a0, a0, 49 ; RV64I-NEXT: srli a1, a1, 63 +; RV64I-NEXT: srli a0, a0, 49 ; RV64I-NEXT: slli a1, a1, 63 ; RV64I-NEXT: srli a1, a1, 48 ; RV64I-NEXT: or a0, a0, a1 @@ -690,8 +690,8 @@ define half @fold_demote_h_d(half %a, double %b) nounwind { ; RV32IF-NEXT: fmv.x.w a0, fa0 ; RV32IF-NEXT: lui a2, 524288 ; RV32IF-NEXT: and a1, a1, a2 -; RV32IF-NEXT: srli a1, a1, 16 ; RV32IF-NEXT: slli a0, a0, 17 +; RV32IF-NEXT: srli a1, a1, 16 ; RV32IF-NEXT: srli a0, a0, 17 ; RV32IF-NEXT: or a0, a0, a1 ; RV32IF-NEXT: lui a1, 1048560 @@ -707,10 +707,10 @@ define half @fold_demote_h_d(half %a, double %b) nounwind { ; RV32IFD-NEXT: fmv.x.w a1, fa0 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: and a0, a0, a2 -; RV32IFD-NEXT: srli a0, a0, 16 +; RV32IFD-NEXT: lui a2, 1048560 ; RV32IFD-NEXT: slli a1, a1, 17 ; RV32IFD-NEXT: srli a1, a1, 17 -; RV32IFD-NEXT: lui a2, 1048560 +; RV32IFD-NEXT: srli a0, a0, 16 ; RV32IFD-NEXT: or a1, a1, a2 ; RV32IFD-NEXT: or a0, a1, a0 ; RV32IFD-NEXT: fmv.w.x fa0, a0 @@ -721,12 +721,12 @@ define half @fold_demote_h_d(half %a, double %b) nounwind { ; RV64IFD: # %bb.0: ; RV64IFD-NEXT: fmv.x.d a0, fa1 ; RV64IFD-NEXT: fmv.x.w a1, fa0 +; RV64IFD-NEXT: lui a2, 1048560 ; RV64IFD-NEXT: slli a1, a1, 49 -; RV64IFD-NEXT: srli a1, a1, 49 ; RV64IFD-NEXT: srli a0, a0, 63 +; 
RV64IFD-NEXT: srli a1, a1, 49 ; RV64IFD-NEXT: slli a0, a0, 63 ; RV64IFD-NEXT: srli a0, a0, 48 -; RV64IFD-NEXT: lui a2, 1048560 ; RV64IFD-NEXT: or a1, a1, a2 ; RV64IFD-NEXT: or a0, a1, a0 ; RV64IFD-NEXT: fmv.w.x fa0, a0 @@ -754,8 +754,8 @@ define half @fold_demote_h_d(half %a, double %b) nounwind { ; RV32IFZFHMIN-LABEL: fold_demote_h_d: ; RV32IFZFHMIN: # %bb.0: ; RV32IFZFHMIN-NEXT: srli a1, a1, 31 -; RV32IFZFHMIN-NEXT: slli a1, a1, 15 ; RV32IFZFHMIN-NEXT: fmv.x.h a0, fa0 +; RV32IFZFHMIN-NEXT: slli a1, a1, 15 ; RV32IFZFHMIN-NEXT: slli a0, a0, 17 ; RV32IFZFHMIN-NEXT: srli a0, a0, 17 ; RV32IFZFHMIN-NEXT: or a0, a0, a1 @@ -767,10 +767,10 @@ define half @fold_demote_h_d(half %a, double %b) nounwind { ; RV32IFDZFHMIN-NEXT: addi sp, sp, -16 ; RV32IFDZFHMIN-NEXT: fsd fa1, 8(sp) ; RV32IFDZFHMIN-NEXT: lw a0, 12(sp) -; RV32IFDZFHMIN-NEXT: srli a0, a0, 31 -; RV32IFDZFHMIN-NEXT: slli a0, a0, 15 ; RV32IFDZFHMIN-NEXT: fmv.x.h a1, fa0 ; RV32IFDZFHMIN-NEXT: slli a1, a1, 17 +; RV32IFDZFHMIN-NEXT: srli a0, a0, 31 +; RV32IFDZFHMIN-NEXT: slli a0, a0, 15 ; RV32IFDZFHMIN-NEXT: srli a1, a1, 17 ; RV32IFDZFHMIN-NEXT: or a0, a1, a0 ; RV32IFDZFHMIN-NEXT: fmv.h.x fa0, a0 @@ -780,10 +780,10 @@ define half @fold_demote_h_d(half %a, double %b) nounwind { ; RV64IFDZFHMIN-LABEL: fold_demote_h_d: ; RV64IFDZFHMIN: # %bb.0: ; RV64IFDZFHMIN-NEXT: fmv.x.d a0, fa1 -; RV64IFDZFHMIN-NEXT: srli a0, a0, 63 -; RV64IFDZFHMIN-NEXT: slli a0, a0, 15 ; RV64IFDZFHMIN-NEXT: fmv.x.h a1, fa0 +; RV64IFDZFHMIN-NEXT: srli a0, a0, 63 ; RV64IFDZFHMIN-NEXT: slli a1, a1, 49 +; RV64IFDZFHMIN-NEXT: slli a0, a0, 15 ; RV64IFDZFHMIN-NEXT: srli a1, a1, 49 ; RV64IFDZFHMIN-NEXT: or a0, a1, a0 ; RV64IFDZFHMIN-NEXT: fmv.h.x fa0, a0 @@ -793,11 +793,11 @@ define half @fold_demote_h_d(half %a, double %b) nounwind { ; RV32IZDINX: # %bb.0: ; RV32IZDINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 ; RV32IZDINX-NEXT: lui a1, 524288 -; RV32IZDINX-NEXT: and a1, a2, a1 -; RV32IZDINX-NEXT: srli a1, a1, 16 ; RV32IZDINX-NEXT: slli a0, a0, 17 -; RV32IZDINX-NEXT: srli a0, a0, 17 +; RV32IZDINX-NEXT: and a1, a2, a1 ; RV32IZDINX-NEXT: lui a2, 1048560 +; RV32IZDINX-NEXT: srli a0, a0, 17 +; RV32IZDINX-NEXT: srli a1, a1, 16 ; RV32IZDINX-NEXT: or a0, a0, a2 ; RV32IZDINX-NEXT: or a0, a0, a1 ; RV32IZDINX-NEXT: # kill: def $x10_w killed $x10_w killed $x10 @@ -807,11 +807,11 @@ define half @fold_demote_h_d(half %a, double %b) nounwind { ; RV64IZDINX: # %bb.0: ; RV64IZDINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 ; RV64IZDINX-NEXT: slli a0, a0, 49 -; RV64IZDINX-NEXT: srli a0, a0, 49 ; RV64IZDINX-NEXT: srli a1, a1, 63 +; RV64IZDINX-NEXT: lui a2, 1048560 +; RV64IZDINX-NEXT: srli a0, a0, 49 ; RV64IZDINX-NEXT: slli a1, a1, 63 ; RV64IZDINX-NEXT: srli a1, a1, 48 -; RV64IZDINX-NEXT: lui a2, 1048560 ; RV64IZDINX-NEXT: or a0, a0, a2 ; RV64IZDINX-NEXT: or a0, a0, a1 ; RV64IZDINX-NEXT: # kill: def $x10_w killed $x10_w killed $x10 diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll index 2c691a2de4c4d..da97ac0d74237 100644 --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -131,18 +131,18 @@ define i16 @test_cttz_i16(i16 %a) nounwind { ; RV32_NOZBB-NEXT: # %bb.1: # %cond.false ; RV32_NOZBB-NEXT: addi a1, a0, -1 ; RV32_NOZBB-NEXT: not a0, a0 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: srli a1, a0, 1 ; RV32_NOZBB-NEXT: lui a2, 5 -; RV32_NOZBB-NEXT: addi a2, a2, 1365 -; RV32_NOZBB-NEXT: and a1, a1, a2 +; RV32_NOZBB-NEXT: and a0, a0, a1 +; RV32_NOZBB-NEXT: addi a1, a2, 1365 +; 
RV32_NOZBB-NEXT: srli a2, a0, 1 +; RV32_NOZBB-NEXT: and a1, a2, a1 +; RV32_NOZBB-NEXT: lui a2, 3 +; RV32_NOZBB-NEXT: addi a2, a2, 819 ; RV32_NOZBB-NEXT: sub a0, a0, a1 -; RV32_NOZBB-NEXT: lui a1, 3 -; RV32_NOZBB-NEXT: addi a1, a1, 819 -; RV32_NOZBB-NEXT: and a2, a0, a1 +; RV32_NOZBB-NEXT: and a1, a0, a2 ; RV32_NOZBB-NEXT: srli a0, a0, 2 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: add a0, a2, a0 +; RV32_NOZBB-NEXT: and a0, a0, a2 +; RV32_NOZBB-NEXT: add a0, a1, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 ; RV32_NOZBB-NEXT: andi a1, a0, 15 @@ -161,18 +161,18 @@ define i16 @test_cttz_i16(i16 %a) nounwind { ; RV64NOZBB-NEXT: # %bb.1: # %cond.false ; RV64NOZBB-NEXT: addi a1, a0, -1 ; RV64NOZBB-NEXT: not a0, a0 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: srli a1, a0, 1 ; RV64NOZBB-NEXT: lui a2, 5 -; RV64NOZBB-NEXT: addiw a2, a2, 1365 -; RV64NOZBB-NEXT: and a1, a1, a2 +; RV64NOZBB-NEXT: and a0, a0, a1 +; RV64NOZBB-NEXT: addiw a1, a2, 1365 +; RV64NOZBB-NEXT: srli a2, a0, 1 +; RV64NOZBB-NEXT: and a1, a2, a1 +; RV64NOZBB-NEXT: lui a2, 3 +; RV64NOZBB-NEXT: addiw a2, a2, 819 ; RV64NOZBB-NEXT: sub a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 3 -; RV64NOZBB-NEXT: addiw a1, a1, 819 -; RV64NOZBB-NEXT: and a2, a0, a1 +; RV64NOZBB-NEXT: and a1, a0, a2 ; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: add a0, a2, a0 +; RV64NOZBB-NEXT: and a0, a0, a2 +; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 ; RV64NOZBB-NEXT: add a0, a0, a1 ; RV64NOZBB-NEXT: andi a1, a0, 15 @@ -620,18 +620,18 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { ; RV32_NOZBB: # %bb.0: ; RV32_NOZBB-NEXT: addi a1, a0, -1 ; RV32_NOZBB-NEXT: not a0, a0 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: srli a1, a0, 1 ; RV32_NOZBB-NEXT: lui a2, 5 -; RV32_NOZBB-NEXT: addi a2, a2, 1365 -; RV32_NOZBB-NEXT: and a1, a1, a2 +; RV32_NOZBB-NEXT: and a0, a0, a1 +; RV32_NOZBB-NEXT: addi a1, a2, 1365 +; RV32_NOZBB-NEXT: srli a2, a0, 1 +; RV32_NOZBB-NEXT: and a1, a2, a1 +; RV32_NOZBB-NEXT: lui a2, 3 +; RV32_NOZBB-NEXT: addi a2, a2, 819 ; RV32_NOZBB-NEXT: sub a0, a0, a1 -; RV32_NOZBB-NEXT: lui a1, 3 -; RV32_NOZBB-NEXT: addi a1, a1, 819 -; RV32_NOZBB-NEXT: and a2, a0, a1 +; RV32_NOZBB-NEXT: and a1, a0, a2 ; RV32_NOZBB-NEXT: srli a0, a0, 2 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: add a0, a2, a0 +; RV32_NOZBB-NEXT: and a0, a0, a2 +; RV32_NOZBB-NEXT: add a0, a1, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 ; RV32_NOZBB-NEXT: andi a1, a0, 15 @@ -644,18 +644,18 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { ; RV64NOZBB: # %bb.0: ; RV64NOZBB-NEXT: addi a1, a0, -1 ; RV64NOZBB-NEXT: not a0, a0 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: srli a1, a0, 1 ; RV64NOZBB-NEXT: lui a2, 5 -; RV64NOZBB-NEXT: addiw a2, a2, 1365 -; RV64NOZBB-NEXT: and a1, a1, a2 +; RV64NOZBB-NEXT: and a0, a0, a1 +; RV64NOZBB-NEXT: addiw a1, a2, 1365 +; RV64NOZBB-NEXT: srli a2, a0, 1 +; RV64NOZBB-NEXT: and a1, a2, a1 +; RV64NOZBB-NEXT: lui a2, 3 +; RV64NOZBB-NEXT: addiw a2, a2, 819 ; RV64NOZBB-NEXT: sub a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 3 -; RV64NOZBB-NEXT: addiw a1, a1, 819 -; RV64NOZBB-NEXT: and a2, a0, a1 +; RV64NOZBB-NEXT: and a1, a0, a2 ; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: add a0, a2, a0 +; RV64NOZBB-NEXT: and a0, a0, a2 +; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 ; RV64NOZBB-NEXT: add a0, a0, a1 ; RV64NOZBB-NEXT: andi a1, a0, 15 @@ -1052,28 +1052,28 
@@ define i16 @test_ctlz_i16(i16 %a) nounwind { ; RV32_NOZBB-NEXT: beqz a1, .LBB9_2 ; RV32_NOZBB-NEXT: # %bb.1: # %cond.false ; RV32_NOZBB-NEXT: srli a1, a1, 17 +; RV32_NOZBB-NEXT: lui a2, 5 ; RV32_NOZBB-NEXT: or a0, a0, a1 -; RV32_NOZBB-NEXT: slli a1, a0, 16 -; RV32_NOZBB-NEXT: srli a1, a1, 18 -; RV32_NOZBB-NEXT: or a0, a0, a1 -; RV32_NOZBB-NEXT: slli a1, a0, 16 -; RV32_NOZBB-NEXT: srli a1, a1, 20 -; RV32_NOZBB-NEXT: or a0, a0, a1 -; RV32_NOZBB-NEXT: slli a1, a0, 16 -; RV32_NOZBB-NEXT: srli a1, a1, 24 -; RV32_NOZBB-NEXT: or a0, a0, a1 +; RV32_NOZBB-NEXT: addi a1, a2, 1365 +; RV32_NOZBB-NEXT: slli a2, a0, 16 +; RV32_NOZBB-NEXT: srli a2, a2, 18 +; RV32_NOZBB-NEXT: or a0, a0, a2 +; RV32_NOZBB-NEXT: slli a2, a0, 16 +; RV32_NOZBB-NEXT: srli a2, a2, 20 +; RV32_NOZBB-NEXT: or a0, a0, a2 +; RV32_NOZBB-NEXT: slli a2, a0, 16 +; RV32_NOZBB-NEXT: srli a2, a2, 24 +; RV32_NOZBB-NEXT: or a0, a0, a2 ; RV32_NOZBB-NEXT: not a0, a0 -; RV32_NOZBB-NEXT: srli a1, a0, 1 -; RV32_NOZBB-NEXT: lui a2, 5 -; RV32_NOZBB-NEXT: addi a2, a2, 1365 -; RV32_NOZBB-NEXT: and a1, a1, a2 +; RV32_NOZBB-NEXT: srli a2, a0, 1 +; RV32_NOZBB-NEXT: and a1, a2, a1 +; RV32_NOZBB-NEXT: lui a2, 3 +; RV32_NOZBB-NEXT: addi a2, a2, 819 ; RV32_NOZBB-NEXT: sub a0, a0, a1 -; RV32_NOZBB-NEXT: lui a1, 3 -; RV32_NOZBB-NEXT: addi a1, a1, 819 -; RV32_NOZBB-NEXT: and a2, a0, a1 +; RV32_NOZBB-NEXT: and a1, a0, a2 ; RV32_NOZBB-NEXT: srli a0, a0, 2 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: add a0, a2, a0 +; RV32_NOZBB-NEXT: and a0, a0, a2 +; RV32_NOZBB-NEXT: add a0, a1, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 ; RV32_NOZBB-NEXT: andi a1, a0, 15 @@ -1091,28 +1091,28 @@ define i16 @test_ctlz_i16(i16 %a) nounwind { ; RV64NOZBB-NEXT: beqz a1, .LBB9_2 ; RV64NOZBB-NEXT: # %bb.1: # %cond.false ; RV64NOZBB-NEXT: srli a1, a1, 49 +; RV64NOZBB-NEXT: lui a2, 5 ; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: srli a1, a1, 50 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: srli a1, a1, 52 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: srli a1, a1, 56 -; RV64NOZBB-NEXT: or a0, a0, a1 +; RV64NOZBB-NEXT: addiw a1, a2, 1365 +; RV64NOZBB-NEXT: slli a2, a0, 48 +; RV64NOZBB-NEXT: srli a2, a2, 50 +; RV64NOZBB-NEXT: or a0, a0, a2 +; RV64NOZBB-NEXT: slli a2, a0, 48 +; RV64NOZBB-NEXT: srli a2, a2, 52 +; RV64NOZBB-NEXT: or a0, a0, a2 +; RV64NOZBB-NEXT: slli a2, a0, 48 +; RV64NOZBB-NEXT: srli a2, a2, 56 +; RV64NOZBB-NEXT: or a0, a0, a2 ; RV64NOZBB-NEXT: not a0, a0 -; RV64NOZBB-NEXT: srli a1, a0, 1 -; RV64NOZBB-NEXT: lui a2, 5 -; RV64NOZBB-NEXT: addiw a2, a2, 1365 -; RV64NOZBB-NEXT: and a1, a1, a2 +; RV64NOZBB-NEXT: srli a2, a0, 1 +; RV64NOZBB-NEXT: and a1, a2, a1 +; RV64NOZBB-NEXT: lui a2, 3 +; RV64NOZBB-NEXT: addiw a2, a2, 819 ; RV64NOZBB-NEXT: sub a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 3 -; RV64NOZBB-NEXT: addiw a1, a1, 819 -; RV64NOZBB-NEXT: and a2, a0, a1 +; RV64NOZBB-NEXT: and a1, a0, a2 ; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: add a0, a2, a0 +; RV64NOZBB-NEXT: and a0, a0, a2 +; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 ; RV64NOZBB-NEXT: add a0, a0, a1 ; RV64NOZBB-NEXT: andi a1, a0, 15 @@ -1161,31 +1161,31 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; RV32I-NEXT: beqz a0, .LBB10_2 ; RV32I-NEXT: # %bb.1: # %cond.false ; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: or 
a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: addi a1, a2, 1365 +; RV32I-NEXT: srli a2, a0, 2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 16 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: and a1, a0, a2 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: addi a1, a2, -241 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: slli a1, a0, 8 ; RV32I-NEXT: add a0, a0, a1 @@ -1203,31 +1203,31 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; RV64I-NEXT: beqz a1, .LBB10_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -1244,33 +1244,33 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; RV32M-NEXT: beqz a0, .LBB10_2 ; RV32M-NEXT: # %bb.1: # %cond.false ; RV32M-NEXT: srli a1, a0, 1 +; RV32M-NEXT: lui a2, 349525 ; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 2 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 8 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 16 -; RV32M-NEXT: or a0, a0, a1 +; RV32M-NEXT: addi a1, a2, 1365 +; RV32M-NEXT: srli a2, a0, 2 +; RV32M-NEXT: or a0, a0, a2 +; RV32M-NEXT: srli a2, a0, 4 +; RV32M-NEXT: or a0, a0, a2 +; RV32M-NEXT: srli a2, a0, 8 +; RV32M-NEXT: or a0, a0, a2 
+; RV32M-NEXT: srli a2, a0, 16 +; RV32M-NEXT: or a0, a0, a2 ; RV32M-NEXT: not a0, a0 -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: lui a2, 349525 -; RV32M-NEXT: addi a2, a2, 1365 -; RV32M-NEXT: and a1, a1, a2 +; RV32M-NEXT: srli a2, a0, 1 +; RV32M-NEXT: and a1, a2, a1 +; RV32M-NEXT: lui a2, 209715 +; RV32M-NEXT: addi a2, a2, 819 ; RV32M-NEXT: sub a0, a0, a1 -; RV32M-NEXT: lui a1, 209715 -; RV32M-NEXT: addi a1, a1, 819 -; RV32M-NEXT: and a2, a0, a1 +; RV32M-NEXT: and a1, a0, a2 ; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a0, a0, a1 -; RV32M-NEXT: add a0, a2, a0 +; RV32M-NEXT: and a0, a0, a2 +; RV32M-NEXT: lui a2, 61681 +; RV32M-NEXT: add a0, a1, a0 ; RV32M-NEXT: srli a1, a0, 4 ; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: lui a1, 61681 -; RV32M-NEXT: addi a1, a1, -241 -; RV32M-NEXT: and a0, a0, a1 ; RV32M-NEXT: lui a1, 4112 +; RV32M-NEXT: addi a2, a2, -241 +; RV32M-NEXT: and a0, a0, a2 ; RV32M-NEXT: addi a1, a1, 257 ; RV32M-NEXT: mul a0, a0, a1 ; RV32M-NEXT: srli a0, a0, 24 @@ -1285,33 +1285,33 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; RV64M-NEXT: beqz a1, .LBB10_2 ; RV64M-NEXT: # %bb.1: # %cond.false ; RV64M-NEXT: srliw a1, a0, 1 +; RV64M-NEXT: lui a2, 349525 ; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srliw a1, a0, 2 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srliw a1, a0, 4 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srliw a1, a0, 8 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srliw a1, a0, 16 -; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: addiw a1, a2, 1365 +; RV64M-NEXT: srliw a2, a0, 2 +; RV64M-NEXT: or a0, a0, a2 +; RV64M-NEXT: srliw a2, a0, 4 +; RV64M-NEXT: or a0, a0, a2 +; RV64M-NEXT: srliw a2, a0, 8 +; RV64M-NEXT: or a0, a0, a2 +; RV64M-NEXT: srliw a2, a0, 16 +; RV64M-NEXT: or a0, a0, a2 ; RV64M-NEXT: not a0, a0 -; RV64M-NEXT: srli a1, a0, 1 -; RV64M-NEXT: lui a2, 349525 -; RV64M-NEXT: addiw a2, a2, 1365 -; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: srli a2, a0, 1 +; RV64M-NEXT: and a1, a2, a1 +; RV64M-NEXT: lui a2, 209715 +; RV64M-NEXT: addiw a2, a2, 819 ; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: and a2, a0, a1 +; RV64M-NEXT: and a1, a0, a2 ; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: add a0, a2, a0 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: lui a2, 61681 +; RV64M-NEXT: add a0, a1, a0 ; RV64M-NEXT: srli a1, a0, 4 ; RV64M-NEXT: add a0, a0, a1 -; RV64M-NEXT: lui a1, 61681 -; RV64M-NEXT: addi a1, a1, -241 -; RV64M-NEXT: and a0, a0, a1 ; RV64M-NEXT: lui a1, 4112 +; RV64M-NEXT: addi a2, a2, -241 +; RV64M-NEXT: and a0, a0, a2 ; RV64M-NEXT: addi a1, a1, 257 ; RV64M-NEXT: mul a0, a0, a1 ; RV64M-NEXT: srliw a0, a0, 24 @@ -1349,11 +1349,11 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV32I-LABEL: test_ctlz_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: lui a5, 61681 ; RV32I-NEXT: addi a4, a2, 1365 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a3, a2, 819 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: addi a3, a3, 819 +; RV32I-NEXT: addi a2, a5, -241 ; RV32I-NEXT: bnez a1, .LBB11_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: srli a1, a0, 1 @@ -1420,40 +1420,40 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: beqz a0, .LBB11_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; 
RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: addiw a2, a3, 819 +; RV64I-NEXT: srli a3, a0, 2 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: slli a3, a1, 32 +; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: slli a3, a2, 32 ; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a3, a0, 4 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 8 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 16 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 32 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: addiw a3, a3, -241 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: slli a2, a3, 32 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a2, a3, a2 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 16 @@ -1469,13 +1469,13 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV32M-LABEL: test_ctlz_i64: ; RV32M: # %bb.0: ; RV32M-NEXT: lui a2, 349525 +; RV32M-NEXT: lui a3, 209715 +; RV32M-NEXT: lui a6, 61681 +; RV32M-NEXT: lui a7, 4112 ; RV32M-NEXT: addi a5, a2, 1365 -; RV32M-NEXT: lui a2, 209715 -; RV32M-NEXT: addi a4, a2, 819 -; RV32M-NEXT: lui a2, 61681 -; RV32M-NEXT: addi a2, a2, -241 -; RV32M-NEXT: lui a3, 4112 -; RV32M-NEXT: addi a3, a3, 257 +; RV32M-NEXT: addi a4, a3, 819 +; RV32M-NEXT: addi a3, a6, -241 +; RV32M-NEXT: addi a2, a7, 257 ; RV32M-NEXT: bnez a1, .LBB11_2 ; RV32M-NEXT: # %bb.1: ; RV32M-NEXT: srli a1, a0, 1 @@ -1498,8 +1498,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV32M-NEXT: add a0, a1, a0 ; RV32M-NEXT: srli a1, a0, 4 ; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: and a0, a0, a2 -; RV32M-NEXT: mul a0, a0, a3 +; RV32M-NEXT: and a0, a0, a3 +; RV32M-NEXT: mul a0, a0, a2 ; RV32M-NEXT: srli a0, a0, 24 ; RV32M-NEXT: addi a0, a0, 32 ; RV32M-NEXT: li a1, 0 @@ -1525,8 +1525,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV32M-NEXT: add a0, a1, a0 ; RV32M-NEXT: srli a1, a0, 4 ; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: and a0, a0, a2 -; RV32M-NEXT: mul a0, a0, a3 +; RV32M-NEXT: and a0, a0, a3 +; RV32M-NEXT: mul a0, a0, a2 ; RV32M-NEXT: srli a0, a0, 24 ; RV32M-NEXT: li a1, 0 ; RV32M-NEXT: ret @@ -1536,44 +1536,44 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV64M-NEXT: beqz a0, .LBB11_2 ; RV64M-NEXT: # %bb.1: # %cond.false ; RV64M-NEXT: srli a1, a0, 1 +; RV64M-NEXT: lui a2, 349525 +; RV64M-NEXT: lui a3, 209715 +; RV64M-NEXT: lui a4, 61681 ; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srli a1, a0, 2 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srli a1, a0, 4 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srli a1, a0, 8 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srli a1, a0, 16 -; RV64M-NEXT: or 
a0, a0, a1 -; RV64M-NEXT: srli a1, a0, 32 -; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: addiw a1, a2, 1365 +; RV64M-NEXT: addiw a2, a3, 819 +; RV64M-NEXT: addiw a3, a4, -241 +; RV64M-NEXT: srli a4, a0, 2 +; RV64M-NEXT: or a0, a0, a4 +; RV64M-NEXT: slli a4, a1, 32 +; RV64M-NEXT: add a1, a1, a4 +; RV64M-NEXT: slli a4, a2, 32 +; RV64M-NEXT: add a2, a2, a4 +; RV64M-NEXT: slli a4, a3, 32 +; RV64M-NEXT: add a3, a3, a4 +; RV64M-NEXT: srli a4, a0, 4 +; RV64M-NEXT: or a0, a0, a4 +; RV64M-NEXT: srli a4, a0, 8 +; RV64M-NEXT: or a0, a0, a4 +; RV64M-NEXT: srli a4, a0, 16 +; RV64M-NEXT: or a0, a0, a4 +; RV64M-NEXT: srli a4, a0, 32 +; RV64M-NEXT: or a0, a0, a4 ; RV64M-NEXT: not a0, a0 -; RV64M-NEXT: srli a1, a0, 1 -; RV64M-NEXT: lui a2, 349525 -; RV64M-NEXT: addiw a2, a2, 1365 -; RV64M-NEXT: slli a3, a2, 32 -; RV64M-NEXT: add a2, a2, a3 -; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: srli a4, a0, 1 +; RV64M-NEXT: and a1, a4, a1 ; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 -; RV64M-NEXT: and a2, a0, a1 +; RV64M-NEXT: and a1, a0, a2 ; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: add a0, a2, a0 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: lui a2, 4112 +; RV64M-NEXT: addiw a2, a2, 257 +; RV64M-NEXT: add a0, a1, a0 ; RV64M-NEXT: srli a1, a0, 4 ; RV64M-NEXT: add a0, a0, a1 -; RV64M-NEXT: lui a1, 61681 -; RV64M-NEXT: addiw a1, a1, -241 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: lui a1, 4112 -; RV64M-NEXT: addiw a1, a1, 257 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a1, a2, 32 +; RV64M-NEXT: and a0, a0, a3 +; RV64M-NEXT: add a1, a2, a1 ; RV64M-NEXT: mul a0, a0, a1 ; RV64M-NEXT: srli a0, a0, 56 ; RV64M-NEXT: ret @@ -1700,7 +1700,9 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { ; RV32_NOZBB-LABEL: test_ctlz_i16_zero_undef: ; RV32_NOZBB: # %bb.0: ; RV32_NOZBB-NEXT: slli a1, a0, 16 +; RV32_NOZBB-NEXT: lui a2, 5 ; RV32_NOZBB-NEXT: srli a1, a1, 17 +; RV32_NOZBB-NEXT: addi a2, a2, 1365 ; RV32_NOZBB-NEXT: or a0, a0, a1 ; RV32_NOZBB-NEXT: slli a1, a0, 16 ; RV32_NOZBB-NEXT: srli a1, a1, 18 @@ -1713,16 +1715,14 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { ; RV32_NOZBB-NEXT: or a0, a0, a1 ; RV32_NOZBB-NEXT: not a0, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 1 -; RV32_NOZBB-NEXT: lui a2, 5 -; RV32_NOZBB-NEXT: addi a2, a2, 1365 ; RV32_NOZBB-NEXT: and a1, a1, a2 +; RV32_NOZBB-NEXT: lui a2, 3 +; RV32_NOZBB-NEXT: addi a2, a2, 819 ; RV32_NOZBB-NEXT: sub a0, a0, a1 -; RV32_NOZBB-NEXT: lui a1, 3 -; RV32_NOZBB-NEXT: addi a1, a1, 819 -; RV32_NOZBB-NEXT: and a2, a0, a1 +; RV32_NOZBB-NEXT: and a1, a0, a2 ; RV32_NOZBB-NEXT: srli a0, a0, 2 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: add a0, a2, a0 +; RV32_NOZBB-NEXT: and a0, a0, a2 +; RV32_NOZBB-NEXT: add a0, a1, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 ; RV32_NOZBB-NEXT: andi a1, a0, 15 @@ -1734,7 +1734,9 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { ; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef: ; RV64NOZBB: # %bb.0: ; RV64NOZBB-NEXT: slli a1, a0, 48 +; RV64NOZBB-NEXT: lui a2, 5 ; RV64NOZBB-NEXT: srli a1, a1, 49 +; RV64NOZBB-NEXT: addiw a2, a2, 1365 ; RV64NOZBB-NEXT: or a0, a0, a1 ; RV64NOZBB-NEXT: slli a1, a0, 48 ; RV64NOZBB-NEXT: srli a1, a1, 50 @@ -1747,16 +1749,14 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { ; RV64NOZBB-NEXT: or a0, a0, a1 ; RV64NOZBB-NEXT: not a0, a0 ; 
RV64NOZBB-NEXT: srli a1, a0, 1 -; RV64NOZBB-NEXT: lui a2, 5 -; RV64NOZBB-NEXT: addiw a2, a2, 1365 ; RV64NOZBB-NEXT: and a1, a1, a2 +; RV64NOZBB-NEXT: lui a2, 3 +; RV64NOZBB-NEXT: addiw a2, a2, 819 ; RV64NOZBB-NEXT: sub a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 3 -; RV64NOZBB-NEXT: addiw a1, a1, 819 -; RV64NOZBB-NEXT: and a2, a0, a1 +; RV64NOZBB-NEXT: and a1, a0, a2 ; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: add a0, a2, a0 +; RV64NOZBB-NEXT: and a0, a0, a2 +; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 ; RV64NOZBB-NEXT: add a0, a0, a1 ; RV64NOZBB-NEXT: andi a1, a0, 15 @@ -1796,31 +1796,31 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV32I-LABEL: test_ctlz_i32_zero_undef: ; RV32I: # %bb.0: ; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: addi a1, a2, 1365 +; RV32I-NEXT: srli a2, a0, 2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 16 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: and a1, a0, a2 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: addi a1, a2, -241 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: slli a1, a0, 8 ; RV32I-NEXT: add a0, a0, a1 @@ -1832,31 +1832,31 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV64I-LABEL: test_ctlz_i32_zero_undef: ; RV64I: # %bb.0: ; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; 
RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -1868,33 +1868,33 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV32M-LABEL: test_ctlz_i32_zero_undef: ; RV32M: # %bb.0: ; RV32M-NEXT: srli a1, a0, 1 +; RV32M-NEXT: lui a2, 349525 ; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 2 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 4 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 8 -; RV32M-NEXT: or a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 16 -; RV32M-NEXT: or a0, a0, a1 +; RV32M-NEXT: addi a1, a2, 1365 +; RV32M-NEXT: srli a2, a0, 2 +; RV32M-NEXT: or a0, a0, a2 +; RV32M-NEXT: srli a2, a0, 4 +; RV32M-NEXT: or a0, a0, a2 +; RV32M-NEXT: srli a2, a0, 8 +; RV32M-NEXT: or a0, a0, a2 +; RV32M-NEXT: srli a2, a0, 16 +; RV32M-NEXT: or a0, a0, a2 ; RV32M-NEXT: not a0, a0 -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: lui a2, 349525 -; RV32M-NEXT: addi a2, a2, 1365 -; RV32M-NEXT: and a1, a1, a2 +; RV32M-NEXT: srli a2, a0, 1 +; RV32M-NEXT: and a1, a2, a1 +; RV32M-NEXT: lui a2, 209715 +; RV32M-NEXT: addi a2, a2, 819 ; RV32M-NEXT: sub a0, a0, a1 -; RV32M-NEXT: lui a1, 209715 -; RV32M-NEXT: addi a1, a1, 819 -; RV32M-NEXT: and a2, a0, a1 +; RV32M-NEXT: and a1, a0, a2 ; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a0, a0, a1 -; RV32M-NEXT: add a0, a2, a0 +; RV32M-NEXT: and a0, a0, a2 +; RV32M-NEXT: lui a2, 61681 +; RV32M-NEXT: add a0, a1, a0 ; RV32M-NEXT: srli a1, a0, 4 ; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: lui a1, 61681 -; RV32M-NEXT: addi a1, a1, -241 -; RV32M-NEXT: and a0, a0, a1 ; RV32M-NEXT: lui a1, 4112 +; RV32M-NEXT: addi a2, a2, -241 +; RV32M-NEXT: and a0, a0, a2 ; RV32M-NEXT: addi a1, a1, 257 ; RV32M-NEXT: mul a0, a0, a1 ; RV32M-NEXT: srli a0, a0, 24 @@ -1903,33 +1903,33 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV64M-LABEL: test_ctlz_i32_zero_undef: ; RV64M: # %bb.0: ; RV64M-NEXT: srliw a1, a0, 1 +; RV64M-NEXT: lui a2, 349525 ; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srliw a1, a0, 2 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srliw a1, a0, 4 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srliw a1, a0, 8 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srliw a1, a0, 16 -; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: addiw a1, a2, 1365 +; RV64M-NEXT: srliw a2, a0, 2 +; RV64M-NEXT: or a0, a0, a2 +; RV64M-NEXT: srliw a2, a0, 4 +; RV64M-NEXT: or a0, a0, a2 +; RV64M-NEXT: srliw a2, a0, 8 +; RV64M-NEXT: or a0, a0, a2 +; RV64M-NEXT: srliw a2, a0, 16 +; RV64M-NEXT: or a0, a0, a2 ; RV64M-NEXT: not a0, a0 -; RV64M-NEXT: srli a1, a0, 1 -; RV64M-NEXT: lui a2, 349525 -; RV64M-NEXT: addiw a2, a2, 1365 -; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: srli a2, a0, 1 +; RV64M-NEXT: and a1, a2, a1 +; RV64M-NEXT: lui a2, 209715 +; RV64M-NEXT: addiw a2, a2, 819 ; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: and a2, a0, a1 +; RV64M-NEXT: and a1, a0, a2 ; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: add a0, a2, a0 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: lui a2, 61681 +; RV64M-NEXT: add a0, a1, a0 ; RV64M-NEXT: srli a1, a0, 4 ; RV64M-NEXT: add a0, a0, a1 -; RV64M-NEXT: lui a1, 61681 -; RV64M-NEXT: addi a1, a1, -241 -; RV64M-NEXT: and a0, a0, a1 ; RV64M-NEXT: lui a1, 4112 +; RV64M-NEXT: addi a2, a2, -241 +; RV64M-NEXT: and a0, a0, a2 ; RV64M-NEXT: addi a1, a1, 257 ; RV64M-NEXT: mul a0, a0, a1 ; 
RV64M-NEXT: srliw a0, a0, 24 @@ -1964,11 +1964,11 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32I-LABEL: test_ctlz_i64_zero_undef: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: lui a5, 61681 ; RV32I-NEXT: addi a4, a2, 1365 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a3, a2, 819 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: addi a3, a3, 819 +; RV32I-NEXT: addi a2, a5, -241 ; RV32I-NEXT: bnez a1, .LBB15_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: srli a1, a0, 1 @@ -2033,40 +2033,40 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV64I-LABEL: test_ctlz_i64_zero_undef: ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: addiw a2, a3, 819 +; RV64I-NEXT: srli a3, a0, 2 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: slli a3, a1, 32 +; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: slli a3, a2, 32 ; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a3, a0, 4 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 8 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 16 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 32 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: addiw a3, a3, -241 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: slli a2, a3, 32 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a2, a3, a2 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 16 @@ -2079,13 +2079,13 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32M-LABEL: test_ctlz_i64_zero_undef: ; RV32M: # %bb.0: ; RV32M-NEXT: lui a2, 349525 +; RV32M-NEXT: lui a3, 209715 +; RV32M-NEXT: lui a6, 61681 +; RV32M-NEXT: lui a7, 4112 ; RV32M-NEXT: addi a5, a2, 1365 -; RV32M-NEXT: lui a2, 209715 -; RV32M-NEXT: addi a4, a2, 819 -; RV32M-NEXT: lui a2, 61681 -; RV32M-NEXT: addi a2, a2, -241 -; RV32M-NEXT: lui a3, 4112 -; RV32M-NEXT: addi a3, a3, 257 +; RV32M-NEXT: addi a4, a3, 819 +; RV32M-NEXT: addi a3, a6, -241 +; RV32M-NEXT: addi a2, a7, 257 ; RV32M-NEXT: bnez a1, .LBB15_2 ; RV32M-NEXT: # %bb.1: ; RV32M-NEXT: srli a1, a0, 1 @@ -2108,8 +2108,8 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32M-NEXT: add a0, a1, a0 ; RV32M-NEXT: srli a1, a0, 4 ; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: and a0, a0, a2 -; RV32M-NEXT: mul a0, a0, a3 +; RV32M-NEXT: and a0, a0, 
a3 +; RV32M-NEXT: mul a0, a0, a2 ; RV32M-NEXT: srli a0, a0, 24 ; RV32M-NEXT: addi a0, a0, 32 ; RV32M-NEXT: li a1, 0 @@ -2135,8 +2135,8 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32M-NEXT: add a0, a1, a0 ; RV32M-NEXT: srli a1, a0, 4 ; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: and a0, a0, a2 -; RV32M-NEXT: mul a0, a0, a3 +; RV32M-NEXT: and a0, a0, a3 +; RV32M-NEXT: mul a0, a0, a2 ; RV32M-NEXT: srli a0, a0, 24 ; RV32M-NEXT: li a1, 0 ; RV32M-NEXT: ret @@ -2144,44 +2144,44 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV64M-LABEL: test_ctlz_i64_zero_undef: ; RV64M: # %bb.0: ; RV64M-NEXT: srli a1, a0, 1 +; RV64M-NEXT: lui a2, 349525 +; RV64M-NEXT: lui a3, 209715 +; RV64M-NEXT: lui a4, 61681 ; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srli a1, a0, 2 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srli a1, a0, 4 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srli a1, a0, 8 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srli a1, a0, 16 -; RV64M-NEXT: or a0, a0, a1 -; RV64M-NEXT: srli a1, a0, 32 -; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: addiw a1, a2, 1365 +; RV64M-NEXT: addiw a2, a3, 819 +; RV64M-NEXT: addiw a3, a4, -241 +; RV64M-NEXT: srli a4, a0, 2 +; RV64M-NEXT: or a0, a0, a4 +; RV64M-NEXT: slli a4, a1, 32 +; RV64M-NEXT: add a1, a1, a4 +; RV64M-NEXT: slli a4, a2, 32 +; RV64M-NEXT: add a2, a2, a4 +; RV64M-NEXT: slli a4, a3, 32 +; RV64M-NEXT: add a3, a3, a4 +; RV64M-NEXT: srli a4, a0, 4 +; RV64M-NEXT: or a0, a0, a4 +; RV64M-NEXT: srli a4, a0, 8 +; RV64M-NEXT: or a0, a0, a4 +; RV64M-NEXT: srli a4, a0, 16 +; RV64M-NEXT: or a0, a0, a4 +; RV64M-NEXT: srli a4, a0, 32 +; RV64M-NEXT: or a0, a0, a4 ; RV64M-NEXT: not a0, a0 -; RV64M-NEXT: srli a1, a0, 1 -; RV64M-NEXT: lui a2, 349525 -; RV64M-NEXT: addiw a2, a2, 1365 -; RV64M-NEXT: slli a3, a2, 32 -; RV64M-NEXT: add a2, a2, a3 -; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: srli a4, a0, 1 +; RV64M-NEXT: and a1, a4, a1 ; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 -; RV64M-NEXT: and a2, a0, a1 +; RV64M-NEXT: and a1, a0, a2 ; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: add a0, a2, a0 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: lui a2, 4112 +; RV64M-NEXT: addiw a2, a2, 257 +; RV64M-NEXT: add a0, a1, a0 ; RV64M-NEXT: srli a1, a0, 4 ; RV64M-NEXT: add a0, a0, a1 -; RV64M-NEXT: lui a1, 61681 -; RV64M-NEXT: addiw a1, a1, -241 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: lui a1, 4112 -; RV64M-NEXT: addiw a1, a1, 257 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a1, a2, 32 +; RV64M-NEXT: and a0, a0, a3 +; RV64M-NEXT: add a1, a2, a1 ; RV64M-NEXT: mul a0, a0, a1 ; RV64M-NEXT: srli a0, a0, 56 ; RV64M-NEXT: ret @@ -2304,13 +2304,13 @@ define i16 @test_ctpop_i16(i16 %a) nounwind { ; RV32_NOZBB-NEXT: lui a2, 5 ; RV32_NOZBB-NEXT: addi a2, a2, 1365 ; RV32_NOZBB-NEXT: and a1, a1, a2 +; RV32_NOZBB-NEXT: lui a2, 3 +; RV32_NOZBB-NEXT: addi a2, a2, 819 ; RV32_NOZBB-NEXT: sub a0, a0, a1 -; RV32_NOZBB-NEXT: lui a1, 3 -; RV32_NOZBB-NEXT: addi a1, a1, 819 -; RV32_NOZBB-NEXT: and a2, a0, a1 +; RV32_NOZBB-NEXT: and a1, a0, a2 ; RV32_NOZBB-NEXT: srli a0, a0, 2 -; RV32_NOZBB-NEXT: and a0, a0, a1 -; RV32_NOZBB-NEXT: add a0, a2, a0 +; RV32_NOZBB-NEXT: and a0, a0, a2 +; RV32_NOZBB-NEXT: add a0, a1, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 ; RV32_NOZBB-NEXT: andi a1, a0, 15 @@ -2325,13 +2325,13 @@ define i16 
@test_ctpop_i16(i16 %a) nounwind { ; RV64NOZBB-NEXT: lui a2, 5 ; RV64NOZBB-NEXT: addiw a2, a2, 1365 ; RV64NOZBB-NEXT: and a1, a1, a2 +; RV64NOZBB-NEXT: lui a2, 3 +; RV64NOZBB-NEXT: addiw a2, a2, 819 ; RV64NOZBB-NEXT: sub a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 3 -; RV64NOZBB-NEXT: addiw a1, a1, 819 -; RV64NOZBB-NEXT: and a2, a0, a1 +; RV64NOZBB-NEXT: and a1, a0, a2 ; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: add a0, a2, a0 +; RV64NOZBB-NEXT: and a0, a0, a2 +; RV64NOZBB-NEXT: add a0, a1, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 ; RV64NOZBB-NEXT: add a0, a0, a1 ; RV64NOZBB-NEXT: andi a1, a0, 15 @@ -2358,13 +2358,13 @@ define i16 @test_ctpop_i16(i16 %a) nounwind { ; RV32XTHEADBB-NEXT: lui a2, 5 ; RV32XTHEADBB-NEXT: addi a2, a2, 1365 ; RV32XTHEADBB-NEXT: and a1, a1, a2 +; RV32XTHEADBB-NEXT: lui a2, 3 +; RV32XTHEADBB-NEXT: addi a2, a2, 819 ; RV32XTHEADBB-NEXT: sub a0, a0, a1 -; RV32XTHEADBB-NEXT: lui a1, 3 -; RV32XTHEADBB-NEXT: addi a1, a1, 819 -; RV32XTHEADBB-NEXT: and a2, a0, a1 +; RV32XTHEADBB-NEXT: and a1, a0, a2 ; RV32XTHEADBB-NEXT: srli a0, a0, 2 -; RV32XTHEADBB-NEXT: and a0, a0, a1 -; RV32XTHEADBB-NEXT: add a0, a2, a0 +; RV32XTHEADBB-NEXT: and a0, a0, a2 +; RV32XTHEADBB-NEXT: add a0, a1, a0 ; RV32XTHEADBB-NEXT: srli a1, a0, 4 ; RV32XTHEADBB-NEXT: add a0, a0, a1 ; RV32XTHEADBB-NEXT: th.extu a1, a0, 11, 8 @@ -2378,13 +2378,13 @@ define i16 @test_ctpop_i16(i16 %a) nounwind { ; RV64XTHEADBB-NEXT: lui a2, 5 ; RV64XTHEADBB-NEXT: addiw a2, a2, 1365 ; RV64XTHEADBB-NEXT: and a1, a1, a2 +; RV64XTHEADBB-NEXT: lui a2, 3 +; RV64XTHEADBB-NEXT: addiw a2, a2, 819 ; RV64XTHEADBB-NEXT: sub a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 3 -; RV64XTHEADBB-NEXT: addiw a1, a1, 819 -; RV64XTHEADBB-NEXT: and a2, a0, a1 +; RV64XTHEADBB-NEXT: and a1, a0, a2 ; RV64XTHEADBB-NEXT: srli a0, a0, 2 -; RV64XTHEADBB-NEXT: and a0, a0, a1 -; RV64XTHEADBB-NEXT: add a0, a2, a0 +; RV64XTHEADBB-NEXT: and a0, a0, a2 +; RV64XTHEADBB-NEXT: add a0, a1, a0 ; RV64XTHEADBB-NEXT: srli a1, a0, 4 ; RV64XTHEADBB-NEXT: add a0, a0, a1 ; RV64XTHEADBB-NEXT: th.extu a1, a0, 11, 8 @@ -2402,17 +2402,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: addi a2, a2, 1365 ; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: and a1, a0, a2 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: addi a1, a2, -241 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: slli a1, a0, 8 ; RV32I-NEXT: add a0, a0, a1 @@ -2427,17 +2427,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, 
-241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -2452,19 +2452,19 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV32M-NEXT: lui a2, 349525 ; RV32M-NEXT: addi a2, a2, 1365 ; RV32M-NEXT: and a1, a1, a2 +; RV32M-NEXT: lui a2, 209715 +; RV32M-NEXT: addi a2, a2, 819 ; RV32M-NEXT: sub a0, a0, a1 -; RV32M-NEXT: lui a1, 209715 -; RV32M-NEXT: addi a1, a1, 819 -; RV32M-NEXT: and a2, a0, a1 +; RV32M-NEXT: and a1, a0, a2 ; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a0, a0, a1 -; RV32M-NEXT: add a0, a2, a0 +; RV32M-NEXT: and a0, a0, a2 +; RV32M-NEXT: lui a2, 61681 +; RV32M-NEXT: add a0, a1, a0 ; RV32M-NEXT: srli a1, a0, 4 ; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: lui a1, 61681 -; RV32M-NEXT: addi a1, a1, -241 -; RV32M-NEXT: and a0, a0, a1 ; RV32M-NEXT: lui a1, 4112 +; RV32M-NEXT: addi a2, a2, -241 +; RV32M-NEXT: and a0, a0, a2 ; RV32M-NEXT: addi a1, a1, 257 ; RV32M-NEXT: mul a0, a0, a1 ; RV32M-NEXT: srli a0, a0, 24 @@ -2476,19 +2476,19 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV64M-NEXT: lui a2, 349525 ; RV64M-NEXT: addiw a2, a2, 1365 ; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: lui a2, 209715 +; RV64M-NEXT: addiw a2, a2, 819 ; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: and a2, a0, a1 +; RV64M-NEXT: and a1, a0, a2 ; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: add a0, a2, a0 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: lui a2, 61681 +; RV64M-NEXT: add a0, a1, a0 ; RV64M-NEXT: srli a1, a0, 4 ; RV64M-NEXT: add a0, a0, a1 -; RV64M-NEXT: lui a1, 61681 -; RV64M-NEXT: addi a1, a1, -241 -; RV64M-NEXT: and a0, a0, a1 ; RV64M-NEXT: lui a1, 4112 +; RV64M-NEXT: addi a2, a2, -241 +; RV64M-NEXT: and a0, a0, a2 ; RV64M-NEXT: addi a1, a1, 257 ; RV64M-NEXT: mul a0, a0, a1 ; RV64M-NEXT: srliw a0, a0, 24 @@ -2510,17 +2510,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV32XTHEADBB-NEXT: lui a2, 349525 ; RV32XTHEADBB-NEXT: addi a2, a2, 1365 ; RV32XTHEADBB-NEXT: and a1, a1, a2 +; RV32XTHEADBB-NEXT: lui a2, 209715 +; RV32XTHEADBB-NEXT: addi a2, a2, 819 ; RV32XTHEADBB-NEXT: sub a0, a0, a1 -; RV32XTHEADBB-NEXT: lui a1, 209715 -; RV32XTHEADBB-NEXT: addi a1, a1, 819 -; RV32XTHEADBB-NEXT: and a2, a0, a1 +; RV32XTHEADBB-NEXT: and a1, a0, a2 ; RV32XTHEADBB-NEXT: srli a0, a0, 2 -; RV32XTHEADBB-NEXT: and a0, a0, a1 -; RV32XTHEADBB-NEXT: add a0, a2, a0 +; RV32XTHEADBB-NEXT: and a0, a0, a2 +; RV32XTHEADBB-NEXT: lui a2, 61681 +; RV32XTHEADBB-NEXT: add a0, a1, a0 ; RV32XTHEADBB-NEXT: srli a1, a0, 4 ; RV32XTHEADBB-NEXT: add a0, a0, a1 -; RV32XTHEADBB-NEXT: lui a1, 61681 -; RV32XTHEADBB-NEXT: addi a1, a1, -241 +; RV32XTHEADBB-NEXT: addi a1, a2, -241 ; RV32XTHEADBB-NEXT: and a0, a0, a1 ; RV32XTHEADBB-NEXT: slli a1, a0, 8 ; RV32XTHEADBB-NEXT: add a0, a0, a1 @@ -2535,17 +2535,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV64XTHEADBB-NEXT: lui a2, 349525 ; RV64XTHEADBB-NEXT: addiw a2, a2, 1365 ; RV64XTHEADBB-NEXT: and a1, a1, a2 +; RV64XTHEADBB-NEXT: lui a2, 209715 +; RV64XTHEADBB-NEXT: addiw a2, a2, 819 ; RV64XTHEADBB-NEXT: sub a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 209715 -; RV64XTHEADBB-NEXT: addiw a1, a1, 819 -; RV64XTHEADBB-NEXT: and a2, a0, a1 +; RV64XTHEADBB-NEXT: and a1, a0, a2 ; RV64XTHEADBB-NEXT: srli a0, a0, 2 -; RV64XTHEADBB-NEXT: and a0, a0, a1 -; RV64XTHEADBB-NEXT: add a0, a2, a0 +; RV64XTHEADBB-NEXT: and a0, a0, a2 +; RV64XTHEADBB-NEXT: lui a2, 61681 +; RV64XTHEADBB-NEXT: add a0, a1, a0 ; RV64XTHEADBB-NEXT: srli a1, a0, 4 ; 
RV64XTHEADBB-NEXT: add a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 61681 -; RV64XTHEADBB-NEXT: addi a1, a1, -241 +; RV64XTHEADBB-NEXT: addi a1, a2, -241 ; RV64XTHEADBB-NEXT: and a0, a0, a1 ; RV64XTHEADBB-NEXT: slli a1, a0, 8 ; RV64XTHEADBB-NEXT: add a0, a0, a1 @@ -2562,39 +2562,39 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a1, 1 ; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: srli a5, a0, 1 ; RV32I-NEXT: addi a3, a3, 1365 ; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a4, a1, a2 -; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: add a1, a4, a1 -; RV32I-NEXT: srli a4, a1, 4 -; RV32I-NEXT: add a1, a1, a4 -; RV32I-NEXT: lui a4, 61681 -; RV32I-NEXT: addi a4, a4, -241 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: slli a5, a1, 8 -; RV32I-NEXT: add a1, a1, a5 -; RV32I-NEXT: slli a5, a1, 16 -; RV32I-NEXT: add a1, a1, a5 -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: srli a5, a0, 1 ; RV32I-NEXT: and a3, a5, a3 +; RV32I-NEXT: lui a5, 61681 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: addi a5, a5, -241 +; RV32I-NEXT: sub a1, a1, a2 ; RV32I-NEXT: sub a0, a0, a3 -; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: and a2, a1, a4 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: and a3, a0, a4 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: add a0, a3, a0 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: slli a2, a0, 8 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: slli a2, a0, 16 -; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: srli a2, a1, 4 +; RV32I-NEXT: srli a3, a0, 4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: slli a2, a1, 8 +; RV32I-NEXT: slli a3, a0, 8 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: slli a2, a1, 16 +; RV32I-NEXT: slli a3, a0, 16 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: li a1, 0 @@ -2602,28 +2602,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; ; RV64I-LABEL: test_ctpop_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: slli a3, a1, 32 +; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: slli a3, a2, 32 ; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: addiw a3, a3, -241 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: slli a2, a3, 32 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a2, a3, a2 +; 
RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 16 @@ -2637,35 +2637,35 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; RV32M: # %bb.0: ; RV32M-NEXT: srli a2, a1, 1 ; RV32M-NEXT: lui a3, 349525 +; RV32M-NEXT: lui a4, 209715 +; RV32M-NEXT: lui a5, 61681 +; RV32M-NEXT: srli a6, a0, 1 ; RV32M-NEXT: addi a3, a3, 1365 ; RV32M-NEXT: and a2, a2, a3 -; RV32M-NEXT: sub a1, a1, a2 -; RV32M-NEXT: lui a2, 209715 -; RV32M-NEXT: addi a2, a2, 819 -; RV32M-NEXT: and a4, a1, a2 -; RV32M-NEXT: srli a1, a1, 2 -; RV32M-NEXT: and a1, a1, a2 -; RV32M-NEXT: add a1, a4, a1 -; RV32M-NEXT: srli a4, a1, 4 -; RV32M-NEXT: add a1, a1, a4 -; RV32M-NEXT: lui a4, 61681 -; RV32M-NEXT: addi a4, a4, -241 -; RV32M-NEXT: and a1, a1, a4 -; RV32M-NEXT: lui a5, 4112 -; RV32M-NEXT: addi a5, a5, 257 -; RV32M-NEXT: mul a1, a1, a5 -; RV32M-NEXT: srli a1, a1, 24 -; RV32M-NEXT: srli a6, a0, 1 ; RV32M-NEXT: and a3, a6, a3 +; RV32M-NEXT: lui a6, 4112 +; RV32M-NEXT: addi a4, a4, 819 +; RV32M-NEXT: addi a5, a5, -241 +; RV32M-NEXT: addi a6, a6, 257 +; RV32M-NEXT: sub a1, a1, a2 ; RV32M-NEXT: sub a0, a0, a3 -; RV32M-NEXT: and a3, a0, a2 +; RV32M-NEXT: and a2, a1, a4 +; RV32M-NEXT: srli a1, a1, 2 +; RV32M-NEXT: and a3, a0, a4 ; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: and a0, a0, a2 -; RV32M-NEXT: add a0, a3, a0 -; RV32M-NEXT: srli a2, a0, 4 -; RV32M-NEXT: add a0, a0, a2 +; RV32M-NEXT: and a1, a1, a4 ; RV32M-NEXT: and a0, a0, a4 -; RV32M-NEXT: mul a0, a0, a5 +; RV32M-NEXT: add a1, a2, a1 +; RV32M-NEXT: add a0, a3, a0 +; RV32M-NEXT: srli a2, a1, 4 +; RV32M-NEXT: srli a3, a0, 4 +; RV32M-NEXT: add a1, a1, a2 +; RV32M-NEXT: add a0, a0, a3 +; RV32M-NEXT: and a1, a1, a5 +; RV32M-NEXT: and a0, a0, a5 +; RV32M-NEXT: mul a1, a1, a6 +; RV32M-NEXT: mul a0, a0, a6 +; RV32M-NEXT: srli a1, a1, 24 ; RV32M-NEXT: srli a0, a0, 24 ; RV32M-NEXT: add a0, a0, a1 ; RV32M-NEXT: li a1, 0 @@ -2673,32 +2673,32 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; ; RV64M-LABEL: test_ctpop_i64: ; RV64M: # %bb.0: -; RV64M-NEXT: srli a1, a0, 1 -; RV64M-NEXT: lui a2, 349525 -; RV64M-NEXT: addiw a2, a2, 1365 -; RV64M-NEXT: slli a3, a2, 32 -; RV64M-NEXT: add a2, a2, a3 -; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: lui a1, 349525 +; RV64M-NEXT: lui a2, 209715 +; RV64M-NEXT: lui a3, 61681 +; RV64M-NEXT: addiw a1, a1, 1365 +; RV64M-NEXT: addiw a2, a2, 819 +; RV64M-NEXT: addiw a3, a3, -241 +; RV64M-NEXT: slli a4, a1, 32 +; RV64M-NEXT: add a1, a1, a4 +; RV64M-NEXT: slli a4, a2, 32 +; RV64M-NEXT: add a2, a2, a4 +; RV64M-NEXT: slli a4, a3, 32 +; RV64M-NEXT: add a3, a3, a4 +; RV64M-NEXT: srli a4, a0, 1 +; RV64M-NEXT: and a1, a4, a1 ; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 -; RV64M-NEXT: and a2, a0, a1 +; RV64M-NEXT: and a1, a0, a2 ; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: add a0, a2, a0 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: lui a2, 4112 +; RV64M-NEXT: addiw a2, a2, 257 +; RV64M-NEXT: add a0, a1, a0 ; RV64M-NEXT: srli a1, a0, 4 ; RV64M-NEXT: add a0, a0, a1 -; RV64M-NEXT: lui a1, 61681 -; RV64M-NEXT: addiw a1, a1, -241 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: lui a1, 4112 -; RV64M-NEXT: addiw a1, a1, 257 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a1, a2, 32 +; RV64M-NEXT: and a0, a0, a3 +; RV64M-NEXT: add a1, a2, a1 ; RV64M-NEXT: mul a0, a0, a1 ; RV64M-NEXT: srli a0, a0, 
56 ; RV64M-NEXT: ret @@ -2720,39 +2720,39 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: srli a2, a1, 1 ; RV32XTHEADBB-NEXT: lui a3, 349525 +; RV32XTHEADBB-NEXT: lui a4, 209715 +; RV32XTHEADBB-NEXT: srli a5, a0, 1 ; RV32XTHEADBB-NEXT: addi a3, a3, 1365 ; RV32XTHEADBB-NEXT: and a2, a2, a3 -; RV32XTHEADBB-NEXT: sub a1, a1, a2 -; RV32XTHEADBB-NEXT: lui a2, 209715 -; RV32XTHEADBB-NEXT: addi a2, a2, 819 -; RV32XTHEADBB-NEXT: and a4, a1, a2 -; RV32XTHEADBB-NEXT: srli a1, a1, 2 -; RV32XTHEADBB-NEXT: and a1, a1, a2 -; RV32XTHEADBB-NEXT: add a1, a4, a1 -; RV32XTHEADBB-NEXT: srli a4, a1, 4 -; RV32XTHEADBB-NEXT: add a1, a1, a4 -; RV32XTHEADBB-NEXT: lui a4, 61681 -; RV32XTHEADBB-NEXT: addi a4, a4, -241 -; RV32XTHEADBB-NEXT: and a1, a1, a4 -; RV32XTHEADBB-NEXT: slli a5, a1, 8 -; RV32XTHEADBB-NEXT: add a1, a1, a5 -; RV32XTHEADBB-NEXT: slli a5, a1, 16 -; RV32XTHEADBB-NEXT: add a1, a1, a5 -; RV32XTHEADBB-NEXT: srli a1, a1, 24 -; RV32XTHEADBB-NEXT: srli a5, a0, 1 ; RV32XTHEADBB-NEXT: and a3, a5, a3 +; RV32XTHEADBB-NEXT: lui a5, 61681 +; RV32XTHEADBB-NEXT: addi a4, a4, 819 +; RV32XTHEADBB-NEXT: addi a5, a5, -241 +; RV32XTHEADBB-NEXT: sub a1, a1, a2 ; RV32XTHEADBB-NEXT: sub a0, a0, a3 -; RV32XTHEADBB-NEXT: and a3, a0, a2 +; RV32XTHEADBB-NEXT: and a2, a1, a4 +; RV32XTHEADBB-NEXT: srli a1, a1, 2 +; RV32XTHEADBB-NEXT: and a3, a0, a4 ; RV32XTHEADBB-NEXT: srli a0, a0, 2 -; RV32XTHEADBB-NEXT: and a0, a0, a2 -; RV32XTHEADBB-NEXT: add a0, a3, a0 -; RV32XTHEADBB-NEXT: srli a2, a0, 4 -; RV32XTHEADBB-NEXT: add a0, a0, a2 +; RV32XTHEADBB-NEXT: and a1, a1, a4 ; RV32XTHEADBB-NEXT: and a0, a0, a4 -; RV32XTHEADBB-NEXT: slli a2, a0, 8 -; RV32XTHEADBB-NEXT: add a0, a0, a2 -; RV32XTHEADBB-NEXT: slli a2, a0, 16 -; RV32XTHEADBB-NEXT: add a0, a0, a2 +; RV32XTHEADBB-NEXT: add a1, a2, a1 +; RV32XTHEADBB-NEXT: add a0, a3, a0 +; RV32XTHEADBB-NEXT: srli a2, a1, 4 +; RV32XTHEADBB-NEXT: srli a3, a0, 4 +; RV32XTHEADBB-NEXT: add a1, a1, a2 +; RV32XTHEADBB-NEXT: add a0, a0, a3 +; RV32XTHEADBB-NEXT: and a1, a1, a5 +; RV32XTHEADBB-NEXT: and a0, a0, a5 +; RV32XTHEADBB-NEXT: slli a2, a1, 8 +; RV32XTHEADBB-NEXT: slli a3, a0, 8 +; RV32XTHEADBB-NEXT: add a1, a1, a2 +; RV32XTHEADBB-NEXT: add a0, a0, a3 +; RV32XTHEADBB-NEXT: slli a2, a1, 16 +; RV32XTHEADBB-NEXT: slli a3, a0, 16 +; RV32XTHEADBB-NEXT: add a1, a1, a2 +; RV32XTHEADBB-NEXT: add a0, a0, a3 +; RV32XTHEADBB-NEXT: srli a1, a1, 24 ; RV32XTHEADBB-NEXT: srli a0, a0, 24 ; RV32XTHEADBB-NEXT: add a0, a0, a1 ; RV32XTHEADBB-NEXT: li a1, 0 @@ -2760,28 +2760,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; ; RV64XTHEADBB-LABEL: test_ctpop_i64: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: srli a1, a0, 1 -; RV64XTHEADBB-NEXT: lui a2, 349525 -; RV64XTHEADBB-NEXT: addiw a2, a2, 1365 +; RV64XTHEADBB-NEXT: lui a1, 349525 +; RV64XTHEADBB-NEXT: lui a2, 209715 +; RV64XTHEADBB-NEXT: addiw a1, a1, 1365 +; RV64XTHEADBB-NEXT: addiw a2, a2, 819 +; RV64XTHEADBB-NEXT: slli a3, a1, 32 +; RV64XTHEADBB-NEXT: add a1, a1, a3 ; RV64XTHEADBB-NEXT: slli a3, a2, 32 ; RV64XTHEADBB-NEXT: add a2, a2, a3 -; RV64XTHEADBB-NEXT: and a1, a1, a2 +; RV64XTHEADBB-NEXT: srli a3, a0, 1 +; RV64XTHEADBB-NEXT: and a1, a3, a1 +; RV64XTHEADBB-NEXT: lui a3, 61681 +; RV64XTHEADBB-NEXT: addiw a3, a3, -241 ; RV64XTHEADBB-NEXT: sub a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 209715 -; RV64XTHEADBB-NEXT: addiw a1, a1, 819 -; RV64XTHEADBB-NEXT: slli a2, a1, 32 -; RV64XTHEADBB-NEXT: add a1, a1, a2 -; RV64XTHEADBB-NEXT: and a2, a0, a1 +; RV64XTHEADBB-NEXT: and a1, a0, a2 ; RV64XTHEADBB-NEXT: srli a0, 
a0, 2 -; RV64XTHEADBB-NEXT: and a0, a0, a1 -; RV64XTHEADBB-NEXT: add a0, a2, a0 +; RV64XTHEADBB-NEXT: and a0, a0, a2 +; RV64XTHEADBB-NEXT: slli a2, a3, 32 +; RV64XTHEADBB-NEXT: add a0, a1, a0 ; RV64XTHEADBB-NEXT: srli a1, a0, 4 ; RV64XTHEADBB-NEXT: add a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 61681 -; RV64XTHEADBB-NEXT: addiw a1, a1, -241 -; RV64XTHEADBB-NEXT: slli a2, a1, 32 -; RV64XTHEADBB-NEXT: add a1, a1, a2 -; RV64XTHEADBB-NEXT: and a0, a0, a1 +; RV64XTHEADBB-NEXT: add a2, a3, a2 +; RV64XTHEADBB-NEXT: and a0, a0, a2 ; RV64XTHEADBB-NEXT: slli a1, a0, 8 ; RV64XTHEADBB-NEXT: add a0, a0, a1 ; RV64XTHEADBB-NEXT: slli a1, a0, 16 diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll index fe6e20d852d59..03a6a6b1c4b7d 100644 --- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll +++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll @@ -603,11 +603,11 @@ define signext i32 @ctlz(i64 %b) nounwind { ; RV32I-LABEL: ctlz: ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: lui a5, 61681 ; RV32I-NEXT: addi a4, a2, 1365 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a3, a2, 819 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: addi a3, a3, 819 +; RV32I-NEXT: addi a2, a5, -241 ; RV32I-NEXT: bnez a1, .LBB7_2 ; RV32I-NEXT: # %bb.1: # %entry ; RV32I-NEXT: srli a1, a0, 1 @@ -672,40 +672,40 @@ define signext i32 @ctlz(i64 %b) nounwind { ; RV64I-LABEL: ctlz: ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: addiw a2, a3, 819 +; RV64I-NEXT: srli a3, a0, 2 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: slli a3, a1, 32 +; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: slli a3, a2, 32 ; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a3, a0, 4 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 8 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 16 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 32 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: addiw a3, a3, -241 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: slli a2, a3, 32 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a2, a3, a2 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 16 diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll 
b/llvm/test/CodeGen/RISCV/div-by-constant.ll index 3d9fb91e3adf8..844fa0d1e6ad6 100644 --- a/llvm/test/CodeGen/RISCV/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll @@ -80,25 +80,25 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind { ; RV32-LABEL: udiv64_constant_no_add: ; RV32: # %bb.0: ; RV32-NEXT: add a2, a0, a1 -; RV32-NEXT: sltu a3, a2, a0 -; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: lui a3, 838861 -; RV32-NEXT: addi a4, a3, -819 -; RV32-NEXT: mulhu a5, a2, a4 -; RV32-NEXT: srli a6, a5, 2 -; RV32-NEXT: andi a5, a5, -4 -; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: sub a2, a2, a5 -; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: sltu a4, a2, a0 +; RV32-NEXT: addi a5, a3, -819 ; RV32-NEXT: addi a3, a3, -820 -; RV32-NEXT: mul a3, a5, a3 -; RV32-NEXT: mulhu a6, a5, a4 -; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: add a2, a2, a4 +; RV32-NEXT: mulhu a4, a2, a5 +; RV32-NEXT: srli a6, a4, 2 +; RV32-NEXT: andi a4, a4, -4 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: sub a2, a2, a4 +; RV32-NEXT: sub a4, a0, a2 ; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mul a2, a4, a3 +; RV32-NEXT: mulhu a3, a4, a5 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a1, a1, a4 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: mul a0, a5, a4 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: mul a1, a1, a5 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: mul a0, a4, a5 ; RV32-NEXT: ret ; ; RV64-LABEL: udiv64_constant_no_add: @@ -485,8 +485,8 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant_no_srai: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a0, a0, 24 -; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: li a1, 86 +; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: mul a0, a0, a1 ; RV32IM-NEXT: srli a1, a0, 31 ; RV32IM-NEXT: srli a0, a0, 8 @@ -506,8 +506,8 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind { ; RV64IM-LABEL: sdiv8_constant_no_srai: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 56 -; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: li a1, 86 +; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: mul a0, a0, a1 ; RV64IM-NEXT: srli a1, a0, 63 ; RV64IM-NEXT: srli a0, a0, 8 @@ -531,8 +531,8 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant_srai: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a0, a0, 24 -; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: li a1, 103 +; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: mul a0, a0, a1 ; RV32IM-NEXT: srli a1, a0, 31 ; RV32IM-NEXT: srai a0, a0, 9 @@ -552,8 +552,8 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV64IM-LABEL: sdiv8_constant_srai: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 56 -; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: li a1, 103 +; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: mul a0, a0, a1 ; RV64IM-NEXT: srli a1, a0, 63 ; RV64IM-NEXT: srai a0, a0, 9 @@ -577,8 +577,8 @@ define i8 @sdiv8_constant_add_srai(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant_add_srai: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a0, 24 -; RV32IM-NEXT: srai a1, a1, 24 ; RV32IM-NEXT: li a2, -109 +; RV32IM-NEXT: srai a1, a1, 24 ; RV32IM-NEXT: mul a1, a1, a2 ; RV32IM-NEXT: srli a1, a1, 8 ; RV32IM-NEXT: add a0, a1, a0 @@ -604,8 +604,8 @@ define i8 @sdiv8_constant_add_srai(i8 %a) nounwind { ; RV64IM-LABEL: sdiv8_constant_add_srai: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a0, 56 -; RV64IM-NEXT: srai a1, a1, 56 ; RV64IM-NEXT: li a2, -109 +; RV64IM-NEXT: srai a1, a1, 56 ; RV64IM-NEXT: mul a1, a1, a2 ; RV64IM-NEXT: srli a1, a1, 8 ; RV64IM-NEXT: add a0, a1, a0 @@ -635,8 +635,8 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind { ; RV32IM-LABEL: 
sdiv8_constant_sub_srai: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a0, 24 -; RV32IM-NEXT: srai a1, a1, 24 ; RV32IM-NEXT: li a2, 109 +; RV32IM-NEXT: srai a1, a1, 24 ; RV32IM-NEXT: mul a1, a1, a2 ; RV32IM-NEXT: srli a1, a1, 8 ; RV32IM-NEXT: sub a1, a1, a0 @@ -662,8 +662,8 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind { ; RV64IM-LABEL: sdiv8_constant_sub_srai: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a0, 56 -; RV64IM-NEXT: srai a1, a1, 56 ; RV64IM-NEXT: li a2, 109 +; RV64IM-NEXT: srai a1, a1, 56 ; RV64IM-NEXT: mul a1, a1, a2 ; RV64IM-NEXT: srli a1, a1, 8 ; RV64IM-NEXT: subw a1, a1, a0 @@ -693,8 +693,8 @@ define i16 @sdiv16_constant_no_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_no_srai: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a0, a0, 16 -; RV32IM-NEXT: srai a0, a0, 16 ; RV32IM-NEXT: lui a1, 5 +; RV32IM-NEXT: srai a0, a0, 16 ; RV32IM-NEXT: addi a1, a1, 1366 ; RV32IM-NEXT: mul a0, a0, a1 ; RV32IM-NEXT: srli a1, a0, 31 @@ -716,8 +716,8 @@ define i16 @sdiv16_constant_no_srai(i16 %a) nounwind { ; RV64IM-LABEL: sdiv16_constant_no_srai: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 48 -; RV64IM-NEXT: srai a0, a0, 48 ; RV64IM-NEXT: lui a1, 5 +; RV64IM-NEXT: srai a0, a0, 48 ; RV64IM-NEXT: addiw a1, a1, 1366 ; RV64IM-NEXT: mul a0, a0, a1 ; RV64IM-NEXT: srli a1, a0, 63 @@ -743,8 +743,8 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_srai: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a0, a0, 16 -; RV32IM-NEXT: srai a0, a0, 16 ; RV32IM-NEXT: lui a1, 6 +; RV32IM-NEXT: srai a0, a0, 16 ; RV32IM-NEXT: addi a1, a1, 1639 ; RV32IM-NEXT: mul a0, a0, a1 ; RV32IM-NEXT: srli a1, a0, 31 @@ -766,8 +766,8 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind { ; RV64IM-LABEL: sdiv16_constant_srai: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 48 -; RV64IM-NEXT: srai a0, a0, 48 ; RV64IM-NEXT: lui a1, 6 +; RV64IM-NEXT: srai a0, a0, 48 ; RV64IM-NEXT: addiw a1, a1, 1639 ; RV64IM-NEXT: mul a0, a0, a1 ; RV64IM-NEXT: srli a1, a0, 63 @@ -793,8 +793,8 @@ define i16 @sdiv16_constant_add_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_add_srai: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a0, 16 -; RV32IM-NEXT: srai a1, a1, 16 ; RV32IM-NEXT: lui a2, 1048569 +; RV32IM-NEXT: srai a1, a1, 16 ; RV32IM-NEXT: addi a2, a2, -1911 ; RV32IM-NEXT: mul a1, a1, a2 ; RV32IM-NEXT: srli a1, a1, 16 @@ -822,8 +822,8 @@ define i16 @sdiv16_constant_add_srai(i16 %a) nounwind { ; RV64IM-LABEL: sdiv16_constant_add_srai: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a0, 48 -; RV64IM-NEXT: srai a1, a1, 48 ; RV64IM-NEXT: lui a2, 1048569 +; RV64IM-NEXT: srai a1, a1, 48 ; RV64IM-NEXT: addiw a2, a2, -1911 ; RV64IM-NEXT: mul a1, a1, a2 ; RV64IM-NEXT: srli a1, a1, 16 @@ -855,8 +855,8 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_sub_srai: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a0, 16 -; RV32IM-NEXT: srai a1, a1, 16 ; RV32IM-NEXT: lui a2, 7 +; RV32IM-NEXT: srai a1, a1, 16 ; RV32IM-NEXT: addi a2, a2, 1911 ; RV32IM-NEXT: mul a1, a1, a2 ; RV32IM-NEXT: srli a1, a1, 16 @@ -884,8 +884,8 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind { ; RV64IM-LABEL: sdiv16_constant_sub_srai: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a0, 48 -; RV64IM-NEXT: srai a1, a1, 48 ; RV64IM-NEXT: lui a2, 7 +; RV64IM-NEXT: srai a1, a1, 48 ; RV64IM-NEXT: addiw a2, a2, 1911 ; RV64IM-NEXT: mul a1, a1, a2 ; RV64IM-NEXT: srli a1, a1, 16 diff --git a/llvm/test/CodeGen/RISCV/div-pow2.ll b/llvm/test/CodeGen/RISCV/div-pow2.ll index 254e675b4ed8b..6ea5a37ba2963 100644 --- 
a/llvm/test/CodeGen/RISCV/div-pow2.ll +++ b/llvm/test/CodeGen/RISCV/div-pow2.ll @@ -207,14 +207,14 @@ define i64 @sdiv64_pow2_negative_2(i64 %a) { ; RV32I-NEXT: add a2, a0, a2 ; RV32I-NEXT: srli a3, a2, 1 ; RV32I-NEXT: sltu a0, a2, a0 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: slli a0, a1, 31 -; RV32I-NEXT: or a3, a3, a0 -; RV32I-NEXT: neg a0, a3 -; RV32I-NEXT: snez a2, a3 -; RV32I-NEXT: srai a1, a1, 1 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: slli a1, a0, 31 +; RV32I-NEXT: srai a2, a0, 1 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: sdiv64_pow2_negative_2: @@ -263,14 +263,14 @@ define i64 @sdiv64_pow2_negative_2048(i64 %a) { ; RV32I-NEXT: add a2, a0, a2 ; RV32I-NEXT: srli a3, a2, 11 ; RV32I-NEXT: sltu a0, a2, a0 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: slli a0, a1, 21 -; RV32I-NEXT: or a3, a3, a0 -; RV32I-NEXT: neg a0, a3 -; RV32I-NEXT: snez a2, a3 -; RV32I-NEXT: srai a1, a1, 11 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: slli a1, a0, 21 +; RV32I-NEXT: srai a2, a0, 11 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: sdiv64_pow2_negative_2048: @@ -320,14 +320,14 @@ define i64 @sdiv64_pow2_negative_4096(i64 %a) { ; RV32I-NEXT: add a2, a0, a2 ; RV32I-NEXT: srli a3, a2, 12 ; RV32I-NEXT: sltu a0, a2, a0 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: slli a0, a1, 20 -; RV32I-NEXT: or a3, a3, a0 -; RV32I-NEXT: neg a0, a3 -; RV32I-NEXT: snez a2, a3 -; RV32I-NEXT: srai a1, a1, 12 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: slli a1, a0, 20 +; RV32I-NEXT: srai a2, a0, 12 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: sdiv64_pow2_negative_4096: @@ -377,14 +377,14 @@ define i64 @sdiv64_pow2_negative_65536(i64 %a) { ; RV32I-NEXT: add a2, a0, a2 ; RV32I-NEXT: srli a3, a2, 16 ; RV32I-NEXT: sltu a0, a2, a0 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: slli a0, a1, 16 -; RV32I-NEXT: or a3, a3, a0 -; RV32I-NEXT: neg a0, a3 -; RV32I-NEXT: snez a2, a3 -; RV32I-NEXT: srai a1, a1, 16 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a2, a0, 16 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: sdiv64_pow2_negative_65536: diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll index e94efbea8376d..bda6ff43a5e7c 100644 --- a/llvm/test/CodeGen/RISCV/div.ll +++ b/llvm/test/CodeGen/RISCV/div.ll @@ -23,8 +23,8 @@ define i32 @udiv(i32 %a, i32 %b) nounwind { ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: call __udivdi3 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -182,25 +182,25 @@ define i64 @udiv64_constant(i64 %a) nounwind { ; RV32IM-LABEL: udiv64_constant: ; RV32IM: # %bb.0: ; RV32IM-NEXT: add a2, a0, a1 -; RV32IM-NEXT: sltu a3, a2, a0 -; 
RV32IM-NEXT: add a2, a2, a3 ; RV32IM-NEXT: lui a3, 838861 -; RV32IM-NEXT: addi a4, a3, -819 -; RV32IM-NEXT: mulhu a5, a2, a4 -; RV32IM-NEXT: srli a6, a5, 2 -; RV32IM-NEXT: andi a5, a5, -4 -; RV32IM-NEXT: add a5, a5, a6 -; RV32IM-NEXT: sub a2, a2, a5 -; RV32IM-NEXT: sub a5, a0, a2 +; RV32IM-NEXT: sltu a4, a2, a0 +; RV32IM-NEXT: addi a5, a3, -819 ; RV32IM-NEXT: addi a3, a3, -820 -; RV32IM-NEXT: mul a3, a5, a3 -; RV32IM-NEXT: mulhu a6, a5, a4 -; RV32IM-NEXT: add a3, a6, a3 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: mulhu a4, a2, a5 +; RV32IM-NEXT: srli a6, a4, 2 +; RV32IM-NEXT: andi a4, a4, -4 +; RV32IM-NEXT: add a4, a4, a6 +; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: sub a4, a0, a2 ; RV32IM-NEXT: sltu a0, a0, a2 +; RV32IM-NEXT: mul a2, a4, a3 +; RV32IM-NEXT: mulhu a3, a4, a5 ; RV32IM-NEXT: sub a1, a1, a0 -; RV32IM-NEXT: mul a1, a1, a4 -; RV32IM-NEXT: add a1, a3, a1 -; RV32IM-NEXT: mul a0, a5, a4 +; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: mul a1, a1, a5 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: mul a0, a4, a5 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: udiv64_constant: @@ -919,8 +919,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: call __divsi3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -930,8 +930,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind { ; RV32IM-LABEL: sdiv8: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a1, 24 -; RV32IM-NEXT: srai a1, a1, 24 ; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a1, a1, 24 ; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: div a0, a0, a1 ; RV32IM-NEXT: ret @@ -941,8 +941,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: slli a0, a0, 56 -; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: call __divdi3 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -952,8 +952,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind { ; RV64IM-LABEL: sdiv8: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a1, 56 -; RV64IM-NEXT: srai a1, a1, 56 ; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a1, a1, 56 ; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: divw a0, a0, a1 ; RV64IM-NEXT: ret @@ -977,8 +977,8 @@ define i8 @sdiv8_constant(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a0, a0, 24 -; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: li a1, 103 +; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: mul a0, a0, a1 ; RV32IM-NEXT: srli a1, a0, 31 ; RV32IM-NEXT: srai a0, a0, 9 @@ -1000,8 +1000,8 @@ define i8 @sdiv8_constant(i8 %a) nounwind { ; RV64IM-LABEL: sdiv8_constant: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 56 -; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: li a1, 103 +; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: mul a0, a0, a1 ; RV64IM-NEXT: srli a1, a0, 63 ; RV64IM-NEXT: srai a0, a0, 9 @@ -1105,8 +1105,8 @@ define i16 @sdiv16(i16 %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: slli a0, a0, 16 -; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: call __divsi3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -1116,8 +1116,8 @@ define i16 @sdiv16(i16 %a, i16 %b) 
nounwind { ; RV32IM-LABEL: sdiv16: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a1, 16 -; RV32IM-NEXT: srai a1, a1, 16 ; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a1, a1, 16 ; RV32IM-NEXT: srai a0, a0, 16 ; RV32IM-NEXT: div a0, a0, a1 ; RV32IM-NEXT: ret @@ -1127,8 +1127,8 @@ define i16 @sdiv16(i16 %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: slli a0, a0, 48 -; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: call __divdi3 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -1138,8 +1138,8 @@ define i16 @sdiv16(i16 %a, i16 %b) nounwind { ; RV64IM-LABEL: sdiv16: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a1, 48 -; RV64IM-NEXT: srai a1, a1, 48 ; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a1, a1, 48 ; RV64IM-NEXT: srai a0, a0, 48 ; RV64IM-NEXT: divw a0, a0, a1 ; RV64IM-NEXT: ret @@ -1163,8 +1163,8 @@ define i16 @sdiv16_constant(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a0, a0, 16 -; RV32IM-NEXT: srai a0, a0, 16 ; RV32IM-NEXT: lui a1, 6 +; RV32IM-NEXT: srai a0, a0, 16 ; RV32IM-NEXT: addi a1, a1, 1639 ; RV32IM-NEXT: mul a0, a0, a1 ; RV32IM-NEXT: srli a1, a0, 31 @@ -1187,8 +1187,8 @@ define i16 @sdiv16_constant(i16 %a) nounwind { ; RV64IM-LABEL: sdiv16_constant: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 48 -; RV64IM-NEXT: srai a0, a0, 48 ; RV64IM-NEXT: lui a1, 6 +; RV64IM-NEXT: srai a0, a0, 48 ; RV64IM-NEXT: addiw a1, a1, 1639 ; RV64IM-NEXT: mul a0, a0, a1 ; RV64IM-NEXT: srli a1, a0, 63 diff --git a/llvm/test/CodeGen/RISCV/double-arith.ll b/llvm/test/CodeGen/RISCV/double-arith.ll index 5f06398daa8b9..44d7bc590a797 100644 --- a/llvm/test/CodeGen/RISCV/double-arith.ll +++ b/llvm/test/CodeGen/RISCV/double-arith.ll @@ -225,8 +225,8 @@ define double @fsgnj_d(double %a, double %b) nounwind { ; RV32I-LABEL: fsgnj_d: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: and a2, a3, a2 ; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: and a2, a3, a2 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: ret @@ -234,8 +234,8 @@ define double @fsgnj_d(double %a, double %b) nounwind { ; RV64I-LABEL: fsgnj_d: ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a1, 63 -; RV64I-NEXT: slli a1, a1, 63 ; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: slli a1, a1, 63 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -327,8 +327,8 @@ define double @fsgnjn_d(double %a, double %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: not a2, a3 ; RV32I-NEXT: lui a3, 524288 -; RV32I-NEXT: and a2, a2, a3 ; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: and a2, a2, a3 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: ret @@ -1524,8 +1524,8 @@ define double @fsgnjx_f64(double %x, double %y) nounwind { ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a0, a0, 63 -; RV64I-NEXT: slli a0, a0, 63 ; RV64I-NEXT: li a2, 1023 +; RV64I-NEXT: slli a0, a0, 63 ; RV64I-NEXT: slli a2, a2, 52 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: call __muldf3 diff --git a/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll index 01aa25c15c8d2..14193bf4cb169 100644 --- a/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll +++ b/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll @@ -112,8 +112,8 @@ define double @fcopysign_fneg(double %a, double %b) nounwind { ; RV32I: # %bb.0: ; 
RV32I-NEXT: not a2, a3 ; RV32I-NEXT: lui a3, 524288 -; RV32I-NEXT: and a2, a2, a3 ; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: and a2, a2, a3 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/double-calling-conv.ll b/llvm/test/CodeGen/RISCV/double-calling-conv.ll index b9e80dccd97b9..798eac64e9fc2 100644 --- a/llvm/test/CodeGen/RISCV/double-calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/double-calling-conv.ll @@ -42,11 +42,11 @@ define double @caller_double_inreg() nounwind { ; RV32IFD-NEXT: addi sp, sp, -16 ; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: lui a0, 262236 +; RV32IFD-NEXT: lui a2, 377487 +; RV32IFD-NEXT: lui a3, 262364 ; RV32IFD-NEXT: addi a1, a0, 655 -; RV32IFD-NEXT: lui a0, 377487 -; RV32IFD-NEXT: addi a0, a0, 1475 -; RV32IFD-NEXT: lui a2, 262364 -; RV32IFD-NEXT: addi a3, a2, 655 +; RV32IFD-NEXT: addi a0, a2, 1475 +; RV32IFD-NEXT: addi a3, a3, 655 ; RV32IFD-NEXT: mv a2, a0 ; RV32IFD-NEXT: call callee_double_inreg ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -58,11 +58,11 @@ define double @caller_double_inreg() nounwind { ; RV32IZFINXZDINX-NEXT: addi sp, sp, -16 ; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: lui a0, 262236 +; RV32IZFINXZDINX-NEXT: lui a2, 377487 +; RV32IZFINXZDINX-NEXT: lui a3, 262364 ; RV32IZFINXZDINX-NEXT: addi a1, a0, 655 -; RV32IZFINXZDINX-NEXT: lui a0, 377487 -; RV32IZFINXZDINX-NEXT: addi a0, a0, 1475 -; RV32IZFINXZDINX-NEXT: lui a2, 262364 -; RV32IZFINXZDINX-NEXT: addi a3, a2, 655 +; RV32IZFINXZDINX-NEXT: addi a0, a2, 1475 +; RV32IZFINXZDINX-NEXT: addi a3, a3, 655 ; RV32IZFINXZDINX-NEXT: mv a2, a0 ; RV32IZFINXZDINX-NEXT: call callee_double_inreg ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -107,14 +107,14 @@ define double @caller_double_split_reg_stack() nounwind { ; RV32IFD: # %bb.0: ; RV32IFD-NEXT: addi sp, sp, -16 ; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IFD-NEXT: lui a0, 262510 -; RV32IFD-NEXT: addi a2, a0, 327 -; RV32IFD-NEXT: lui a0, 262446 -; RV32IFD-NEXT: addi a6, a0, 327 -; RV32IFD-NEXT: lui a0, 713032 -; RV32IFD-NEXT: addi a5, a0, -1311 +; RV32IFD-NEXT: lui a2, 262510 +; RV32IFD-NEXT: lui a3, 262446 +; RV32IFD-NEXT: lui a4, 713032 ; RV32IFD-NEXT: li a0, 1 ; RV32IFD-NEXT: li a1, 2 +; RV32IFD-NEXT: addi a2, a2, 327 +; RV32IFD-NEXT: addi a6, a3, 327 +; RV32IFD-NEXT: addi a5, a4, -1311 ; RV32IFD-NEXT: li a3, 3 ; RV32IFD-NEXT: sw a2, 0(sp) ; RV32IFD-NEXT: li a2, 0 @@ -129,14 +129,14 @@ define double @caller_double_split_reg_stack() nounwind { ; RV32IZFINXZDINX: # %bb.0: ; RV32IZFINXZDINX-NEXT: addi sp, sp, -16 ; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: lui a0, 262510 -; RV32IZFINXZDINX-NEXT: addi a2, a0, 327 -; RV32IZFINXZDINX-NEXT: lui a0, 262446 -; RV32IZFINXZDINX-NEXT: addi a6, a0, 327 -; RV32IZFINXZDINX-NEXT: lui a0, 713032 -; RV32IZFINXZDINX-NEXT: addi a5, a0, -1311 +; RV32IZFINXZDINX-NEXT: lui a2, 262510 +; RV32IZFINXZDINX-NEXT: lui a3, 262446 +; RV32IZFINXZDINX-NEXT: lui a4, 713032 ; RV32IZFINXZDINX-NEXT: li a0, 1 ; RV32IZFINXZDINX-NEXT: li a1, 2 +; RV32IZFINXZDINX-NEXT: addi a2, a2, 327 +; RV32IZFINXZDINX-NEXT: addi a6, a3, 327 +; RV32IZFINXZDINX-NEXT: addi a5, a4, -1311 ; RV32IZFINXZDINX-NEXT: li a3, 3 ; RV32IZFINXZDINX-NEXT: sw a2, 0(sp) ; RV32IZFINXZDINX-NEXT: li a2, 0 @@ -180,16 +180,16 @@ define double @caller_double_stack() nounwind { ; RV32IFD: # %bb.0: ; RV32IFD-NEXT: addi sp, sp, -32 ; RV32IFD-NEXT: sw ra, 28(sp) # 4-byte 
Folded Spill -; RV32IFD-NEXT: lui a0, 262510 -; RV32IFD-NEXT: addi a1, a0, 327 -; RV32IFD-NEXT: lui a0, 713032 -; RV32IFD-NEXT: addi a3, a0, -1311 -; RV32IFD-NEXT: lui a0, 262574 -; RV32IFD-NEXT: addi a5, a0, 327 +; RV32IFD-NEXT: lui a1, 262510 +; RV32IFD-NEXT: lui a3, 713032 +; RV32IFD-NEXT: lui a5, 262574 ; RV32IFD-NEXT: li a0, 1 ; RV32IFD-NEXT: li a2, 2 ; RV32IFD-NEXT: li a4, 3 ; RV32IFD-NEXT: li a6, 4 +; RV32IFD-NEXT: addi a1, a1, 327 +; RV32IFD-NEXT: addi a3, a3, -1311 +; RV32IFD-NEXT: addi a5, a5, 327 ; RV32IFD-NEXT: sw a3, 0(sp) ; RV32IFD-NEXT: sw a1, 4(sp) ; RV32IFD-NEXT: sw a3, 8(sp) @@ -207,16 +207,16 @@ define double @caller_double_stack() nounwind { ; RV32IZFINXZDINX: # %bb.0: ; RV32IZFINXZDINX-NEXT: addi sp, sp, -32 ; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: lui a0, 262510 -; RV32IZFINXZDINX-NEXT: addi a1, a0, 327 -; RV32IZFINXZDINX-NEXT: lui a0, 713032 -; RV32IZFINXZDINX-NEXT: addi a3, a0, -1311 -; RV32IZFINXZDINX-NEXT: lui a0, 262574 -; RV32IZFINXZDINX-NEXT: addi a5, a0, 327 +; RV32IZFINXZDINX-NEXT: lui a1, 262510 +; RV32IZFINXZDINX-NEXT: lui a3, 713032 +; RV32IZFINXZDINX-NEXT: lui a5, 262574 ; RV32IZFINXZDINX-NEXT: li a0, 1 ; RV32IZFINXZDINX-NEXT: li a2, 2 ; RV32IZFINXZDINX-NEXT: li a4, 3 ; RV32IZFINXZDINX-NEXT: li a6, 4 +; RV32IZFINXZDINX-NEXT: addi a1, a1, 327 +; RV32IZFINXZDINX-NEXT: addi a3, a3, -1311 +; RV32IZFINXZDINX-NEXT: addi a5, a5, 327 ; RV32IZFINXZDINX-NEXT: sw a3, 0(sp) ; RV32IZFINXZDINX-NEXT: sw a1, 4(sp) ; RV32IZFINXZDINX-NEXT: sw a3, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll index a8b141618bbb3..c39085a80ddc1 100644 --- a/llvm/test/CodeGen/RISCV/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/double-convert.ll @@ -692,7 +692,7 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB12_2 ; RV32IFD-NEXT: # %bb.1: # %start @@ -700,19 +700,19 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32IFD-NEXT: .LBB12_2: # %start ; RV32IFD-NEXT: lui a1, %hi(.LCPI12_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI12_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB12_4 +; RV32IFD-NEXT: flt.d a1, fa5, fs0 +; RV32IFD-NEXT: beqz a1, .LBB12_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB12_4: # %start -; RV32IFD-NEXT: feq.d a1, fs0, fs0 +; RV32IFD-NEXT: feq.d a3, fs0, fs0 ; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 +; RV32IFD-NEXT: neg a1, s0 +; RV32IFD-NEXT: neg a3, a3 +; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: or a0, a4, a0 ; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 -; RV32IFD-NEXT: and a0, a4, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -740,29 +740,29 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI12_0) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI12_0+4)(a2) ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI12_0)(a2) -; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 -; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: lui a3, 524288 -; RV32IZFINXZDINX-NEXT: beqz a2, .LBB12_2 +; 
RV32IZFINXZDINX-NEXT: fle.d a3, a2, s0 +; RV32IZFINXZDINX-NEXT: lui a4, 524288 +; RV32IZFINXZDINX-NEXT: lui a2, 524288 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB12_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: # %start -; RV32IZFINXZDINX-NEXT: mv a3, a1 +; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB12_2: # %start ; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI12_1) ; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI12_1)(a1) ; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI12_1+4)(a1) -; RV32IZFINXZDINX-NEXT: flt.d a4, a6, s0 -; RV32IZFINXZDINX-NEXT: beqz a4, .LBB12_4 +; RV32IZFINXZDINX-NEXT: flt.d a1, a6, s0 +; RV32IZFINXZDINX-NEXT: beqz a1, .LBB12_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a3, a5, -1 +; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB12_4: # %start -; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0 +; RV32IZFINXZDINX-NEXT: feq.d a4, s0, s0 +; RV32IZFINXZDINX-NEXT: neg a3, a3 ; RV32IZFINXZDINX-NEXT: neg a5, a1 -; RV32IZFINXZDINX-NEXT: and a1, a5, a3 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: neg a2, a4 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 -; RV32IZFINXZDINX-NEXT: and a0, a5, a0 +; RV32IZFINXZDINX-NEXT: neg a4, a4 +; RV32IZFINXZDINX-NEXT: and a0, a3, a0 +; RV32IZFINXZDINX-NEXT: and a1, a4, a2 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 +; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -800,14 +800,14 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32I-NEXT: mv a1, s0 ; RV32I-NEXT: li a2, 0 ; RV32I-NEXT: call __gedf2 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: mv a1, s0 ; RV32I-NEXT: call __fixdfdi -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: mv s5, a1 ; RV32I-NEXT: lui a0, 524288 -; RV32I-NEXT: bgez s3, .LBB12_2 +; RV32I-NEXT: bgez s4, .LBB12_2 ; RV32I-NEXT: # %bb.1: # %start ; RV32I-NEXT: lui s5, 524288 ; RV32I-NEXT: .LBB12_2: # %start @@ -821,14 +821,14 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: call __unorddf2 ; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: slti a1, s4, 0 +; RV32I-NEXT: sgtz a2, s2 ; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: addi a3, a1, -1 ; RV32I-NEXT: and a1, a0, s5 -; RV32I-NEXT: slti a2, s3, 0 -; RV32I-NEXT: addi a2, a2, -1 -; RV32I-NEXT: and a2, a2, s4 -; RV32I-NEXT: sgtz a3, s2 -; RV32I-NEXT: neg a3, a3 -; RV32I-NEXT: or a2, a3, a2 +; RV32I-NEXT: and a3, a3, s3 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: or a2, a2, a3 ; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -954,10 +954,10 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind { ; RV32IFD-NEXT: lui a2, %hi(.LCPI14_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI14_0)(a2) ; RV32IFD-NEXT: and a0, s0, a0 +; RV32IFD-NEXT: and a1, s0, a1 ; RV32IFD-NEXT: flt.d a2, fa5, fs0 ; RV32IFD-NEXT: neg a2, a2 ; RV32IFD-NEXT: or a0, a2, a0 -; RV32IFD-NEXT: and a1, s0, a1 ; RV32IFD-NEXT: or a1, a2, a1 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -985,16 +985,16 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind { ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero ; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI14_0) +; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI14_0+4)(a4) ; 
RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI14_0)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a3, a3 -; RV32IZFINXZDINX-NEXT: or a0, a3, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: or a1, a3, a1 +; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: or a1, a2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1037,8 +1037,8 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind { ; RV32I-NEXT: mv a1, s0 ; RV32I-NEXT: call __fixunsdfdi ; RV32I-NEXT: and a0, s3, a0 -; RV32I-NEXT: or a0, s2, a0 ; RV32I-NEXT: and a1, s3, a1 +; RV32I-NEXT: or a0, s2, a0 ; RV32I-NEXT: or a1, s2, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -1629,8 +1629,8 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind { ; RV32IFD-NEXT: lui a0, %hi(.LCPI26_1) ; RV32IFD-NEXT: fld fa4, %lo(.LCPI26_1)(a0) ; RV32IFD-NEXT: feq.d a0, fa0, fa0 -; RV32IFD-NEXT: neg a0, a0 ; RV32IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV32IFD-NEXT: neg a0, a0 ; RV32IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV32IFD-NEXT: fcvt.w.d a1, fa5, rtz ; RV32IFD-NEXT: and a0, a0, a1 @@ -1643,8 +1643,8 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind { ; RV64IFD-NEXT: lui a0, %hi(.LCPI26_1) ; RV64IFD-NEXT: fld fa4, %lo(.LCPI26_1)(a0) ; RV64IFD-NEXT: feq.d a0, fa0, fa0 -; RV64IFD-NEXT: neg a0, a0 ; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: neg a0, a0 ; RV64IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV64IFD-NEXT: fcvt.l.d a1, fa5, rtz ; RV64IFD-NEXT: and a0, a0, a1 @@ -1658,26 +1658,26 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind { ; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI26_1) ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI26_1+4)(a4) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI26_1)(a4) -; RV32IZFINXZDINX-NEXT: feq.d a6, a0, a0 -; RV32IZFINXZDINX-NEXT: neg a6, a6 -; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a2 -; RV32IZFINXZDINX-NEXT: fmin.d a0, a0, a4 -; RV32IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rtz -; RV32IZFINXZDINX-NEXT: and a0, a6, a0 +; RV32IZFINXZDINX-NEXT: fmax.d a2, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a0, a0 +; RV32IZFINXZDINX-NEXT: fmin.d a2, a2, a4 +; RV32IZFINXZDINX-NEXT: fcvt.w.d a1, a2, rtz +; RV32IZFINXZDINX-NEXT: and a0, a0, a1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i16: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 -; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: li a1, -505 ; RV64IZFINXZDINX-NEXT: lui a2, %hi(.LCPI26_0) +; RV64IZFINXZDINX-NEXT: slli a1, a1, 53 ; RV64IZFINXZDINX-NEXT: ld a2, %lo(.LCPI26_0)(a2) -; RV64IZFINXZDINX-NEXT: li a3, -505 -; RV64IZFINXZDINX-NEXT: slli a3, a3, 53 -; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a3 -; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a2 -; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rtz -; RV64IZFINXZDINX-NEXT: and a0, a1, a0 +; RV64IZFINXZDINX-NEXT: fmax.d a1, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a0, a0 +; RV64IZFINXZDINX-NEXT: fmin.d a1, a1, a2 +; RV64IZFINXZDINX-NEXT: fcvt.l.d a1, a1, rtz +; RV64IZFINXZDINX-NEXT: and a0, a0, a1 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_s_sat_i16: @@ -2004,8 +2004,8 @@ define 
signext i8 @fcvt_w_s_sat_i8(double %a) nounwind { ; RV32IFD-NEXT: lui a0, %hi(.LCPI30_1) ; RV32IFD-NEXT: fld fa4, %lo(.LCPI30_1)(a0) ; RV32IFD-NEXT: feq.d a0, fa0, fa0 -; RV32IFD-NEXT: neg a0, a0 ; RV32IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV32IFD-NEXT: neg a0, a0 ; RV32IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV32IFD-NEXT: fcvt.w.d a1, fa5, rtz ; RV32IFD-NEXT: and a0, a0, a1 @@ -2018,8 +2018,8 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind { ; RV64IFD-NEXT: lui a0, %hi(.LCPI30_1) ; RV64IFD-NEXT: fld fa4, %lo(.LCPI30_1)(a0) ; RV64IFD-NEXT: feq.d a0, fa0, fa0 -; RV64IFD-NEXT: neg a0, a0 ; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: neg a0, a0 ; RV64IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV64IFD-NEXT: fcvt.l.d a1, fa5, rtz ; RV64IFD-NEXT: and a0, a0, a1 @@ -2033,22 +2033,22 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind { ; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI30_1) ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI30_1+4)(a4) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI30_1)(a4) -; RV32IZFINXZDINX-NEXT: feq.d a6, a0, a0 -; RV32IZFINXZDINX-NEXT: neg a6, a6 -; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a2 -; RV32IZFINXZDINX-NEXT: fmin.d a0, a0, a4 -; RV32IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rtz -; RV32IZFINXZDINX-NEXT: and a0, a6, a0 +; RV32IZFINXZDINX-NEXT: fmax.d a2, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a0, a0 +; RV32IZFINXZDINX-NEXT: fmin.d a2, a2, a4 +; RV32IZFINXZDINX-NEXT: fcvt.w.d a1, a2, rtz +; RV32IZFINXZDINX-NEXT: and a0, a0, a1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i8: ; RV64IZFINXZDINX: # %bb.0: # %start ; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 -; RV64IZFINXZDINX-NEXT: neg a1, a1 ; RV64IZFINXZDINX-NEXT: li a2, -509 ; RV64IZFINXZDINX-NEXT: slli a2, a2, 53 ; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a2 ; RV64IZFINXZDINX-NEXT: lui a2, 65919 +; RV64IZFINXZDINX-NEXT: neg a1, a1 ; RV64IZFINXZDINX-NEXT: slli a2, a2, 34 ; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a2 ; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rtz diff --git a/llvm/test/CodeGen/RISCV/double-imm.ll b/llvm/test/CodeGen/RISCV/double-imm.ll index 2294171d95ab2..155827ad069cc 100644 --- a/llvm/test/CodeGen/RISCV/double-imm.ll +++ b/llvm/test/CodeGen/RISCV/double-imm.ll @@ -24,8 +24,8 @@ define double @double_imm() nounwind { ; CHECKRV32ZDINX-LABEL: double_imm: ; CHECKRV32ZDINX: # %bb.0: ; CHECKRV32ZDINX-NEXT: lui a0, 345155 -; CHECKRV32ZDINX-NEXT: addi a0, a0, -744 ; CHECKRV32ZDINX-NEXT: lui a1, 262290 +; CHECKRV32ZDINX-NEXT: addi a0, a0, -744 ; CHECKRV32ZDINX-NEXT: addi a1, a1, 507 ; CHECKRV32ZDINX-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/double-intrinsics.ll b/llvm/test/CodeGen/RISCV/double-intrinsics.ll index a65fd09613424..3ef128ed6d4cd 100644 --- a/llvm/test/CodeGen/RISCV/double-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/double-intrinsics.ll @@ -817,8 +817,8 @@ define double @copysign_f64(double %a, double %b) nounwind { ; RV32I-LABEL: copysign_f64: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: and a2, a3, a2 ; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: and a2, a3, a2 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: ret @@ -826,8 +826,8 @@ define double @copysign_f64(double %a, double %b) nounwind { ; RV64I-LABEL: copysign_f64: ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a1, 63 -; RV64I-NEXT: slli a1, a1, 63 ; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: slli a1, a1, 63 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -1535,8 +1535,8 @@ define i1 @isnan_d_fpclass(double %x) { ; RV64I-LABEL: 
isnan_d_fpclass: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 1 -; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: li a1, 2047 +; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: slli a1, a1, 52 ; RV64I-NEXT: slt a0, a1, a0 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/double-mem.ll b/llvm/test/CodeGen/RISCV/double-mem.ll index 38cb52b6f4b30..dba9489e7511d 100644 --- a/llvm/test/CodeGen/RISCV/double-mem.ll +++ b/llvm/test/CodeGen/RISCV/double-mem.ll @@ -93,17 +93,17 @@ define dso_local double @fld_fsd_global(double %a, double %b) nounwind { ; ; RV32IZFINXZDINX-LABEL: fld_fsd_global: ; RV32IZFINXZDINX: # %bb.0: +; RV32IZFINXZDINX-NEXT: lui a4, %hi(G) ; RV32IZFINXZDINX-NEXT: fadd.d a0, a0, a2 -; RV32IZFINXZDINX-NEXT: lui a2, %hi(G) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(G)(a2) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(G+4)(a2) -; RV32IZFINXZDINX-NEXT: addi a3, a2, %lo(G) -; RV32IZFINXZDINX-NEXT: sw a0, %lo(G)(a2) -; RV32IZFINXZDINX-NEXT: sw a1, %lo(G+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a4, 72(a3) -; RV32IZFINXZDINX-NEXT: lw a5, 76(a3) -; RV32IZFINXZDINX-NEXT: sw a0, 72(a3) -; RV32IZFINXZDINX-NEXT: sw a1, 76(a3) +; RV32IZFINXZDINX-NEXT: lw a2, %lo(G)(a4) +; RV32IZFINXZDINX-NEXT: lw a3, %lo(G+4)(a4) +; RV32IZFINXZDINX-NEXT: addi a2, a4, %lo(G) +; RV32IZFINXZDINX-NEXT: sw a0, %lo(G)(a4) +; RV32IZFINXZDINX-NEXT: sw a1, %lo(G+4)(a4) +; RV32IZFINXZDINX-NEXT: lw a4, 72(a2) +; RV32IZFINXZDINX-NEXT: lw a5, 76(a2) +; RV32IZFINXZDINX-NEXT: sw a0, 72(a2) +; RV32IZFINXZDINX-NEXT: sw a1, 76(a2) ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fld_fsd_global: diff --git a/llvm/test/CodeGen/RISCV/double-previous-failure.ll b/llvm/test/CodeGen/RISCV/double-previous-failure.ll index c169b1099b273..c5a7ee79364c6 100644 --- a/llvm/test/CodeGen/RISCV/double-previous-failure.ll +++ b/llvm/test/CodeGen/RISCV/double-previous-failure.ll @@ -28,8 +28,8 @@ define i32 @main() nounwind { ; RV32IFD-NEXT: call test ; RV32IFD-NEXT: sw a0, 0(sp) ; RV32IFD-NEXT: sw a1, 4(sp) -; RV32IFD-NEXT: fld fa5, 0(sp) ; RV32IFD-NEXT: lui a0, %hi(.LCPI1_0) +; RV32IFD-NEXT: fld fa5, 0(sp) ; RV32IFD-NEXT: fld fa4, %lo(.LCPI1_0)(a0) ; RV32IFD-NEXT: flt.d a0, fa5, fa4 ; RV32IFD-NEXT: bnez a0, .LBB1_3 diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll index 0839f61b2d793..cd87f2d2301d7 100644 --- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll @@ -53,7 +53,7 @@ define i64 @test_floor_si64(double %x) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB1_2 ; RV32IFD-NEXT: # %bb.1: @@ -61,19 +61,19 @@ define i64 @test_floor_si64(double %x) nounwind { ; RV32IFD-NEXT: .LBB1_2: ; RV32IFD-NEXT: lui a1, %hi(.LCPI1_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI1_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB1_4 +; RV32IFD-NEXT: flt.d a1, fa5, fs0 +; RV32IFD-NEXT: beqz a1, .LBB1_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB1_4: -; RV32IFD-NEXT: feq.d a1, fs0, fs0 +; RV32IFD-NEXT: feq.d a3, fs0, fs0 ; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 +; RV32IFD-NEXT: neg a1, s0 +; RV32IFD-NEXT: neg a3, a3 +; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: or a0, a4, a0 ; RV32IFD-NEXT: and a0, 
a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 -; RV32IFD-NEXT: and a0, a4, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -100,31 +100,31 @@ define i64 @test_floor_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI1_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI1_1) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI1_1+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI1_1)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a6 -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a4 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI1_1) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI1_0)(a2) +; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI1_0+4)(a2) +; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI1_1)(a3) +; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI1_1+4)(a3) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 +; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: lui a4, 524288 ; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: neg a5, a6 +; RV32IZFINXZDINX-NEXT: and a0, a5, a0 +; RV32IZFINXZDINX-NEXT: neg a5, a3 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 ; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: lui a3, 524288 ; RV32IZFINXZDINX-NEXT: beqz a6, .LBB1_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; RV32IZFINXZDINX-NEXT: mv a3, a1 +; RV32IZFINXZDINX-NEXT: mv a5, a1 ; RV32IZFINXZDINX-NEXT: .LBB1_2: ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a4, .LBB1_4 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB1_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a3, a5, -1 +; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB1_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a3 +; RV32IZFINXZDINX-NEXT: and a1, a2, a5 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -186,15 +186,15 @@ define i64 @test_floor_ui64(double %x) nounwind { ; RV32IFD-NEXT: call floor ; RV32IFD-NEXT: lui a0, %hi(.LCPI3_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI3_0)(a0) -; RV32IFD-NEXT: flt.d a0, fa5, fa0 -; RV32IFD-NEXT: neg s0, a0 -; RV32IFD-NEXT: fcvt.d.w fa5, zero -; RV32IFD-NEXT: fle.d a0, fa5, fa0 +; RV32IFD-NEXT: fcvt.d.w fa4, zero +; RV32IFD-NEXT: fle.d a0, fa4, fa0 +; RV32IFD-NEXT: flt.d a1, fa5, fa0 +; RV32IFD-NEXT: neg s0, a1 ; RV32IFD-NEXT: neg s1, a0 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 -; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: and a1, s1, a1 +; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: or a1, s0, a1 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -223,16 +223,16 @@ define i64 @test_floor_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero ; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI3_0) +; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI3_0+4)(a4) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI3_0)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: 
flt.d a3, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a3, a3 -; RV32IZFINXZDINX-NEXT: or a0, a3, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: or a1, a3, a1 +; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: or a1, a2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -297,7 +297,7 @@ define i64 @test_ceil_si64(double %x) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB5_2 ; RV32IFD-NEXT: # %bb.1: @@ -305,19 +305,19 @@ define i64 @test_ceil_si64(double %x) nounwind { ; RV32IFD-NEXT: .LBB5_2: ; RV32IFD-NEXT: lui a1, %hi(.LCPI5_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI5_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB5_4 +; RV32IFD-NEXT: flt.d a1, fa5, fs0 +; RV32IFD-NEXT: beqz a1, .LBB5_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB5_4: -; RV32IFD-NEXT: feq.d a1, fs0, fs0 +; RV32IFD-NEXT: feq.d a3, fs0, fs0 ; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 +; RV32IFD-NEXT: neg a1, s0 +; RV32IFD-NEXT: neg a3, a3 +; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: or a0, a4, a0 ; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 -; RV32IFD-NEXT: and a0, a4, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -344,31 +344,31 @@ define i64 @test_ceil_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI5_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI5_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI5_0)(a2) -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI5_1) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI5_1+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI5_1)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a6 -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a4 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI5_1) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI5_0)(a2) +; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI5_0+4)(a2) +; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI5_1)(a3) +; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI5_1+4)(a3) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 +; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: lui a4, 524288 ; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: neg a5, a6 +; RV32IZFINXZDINX-NEXT: and a0, a5, a0 +; RV32IZFINXZDINX-NEXT: neg a5, a3 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 ; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: lui a3, 524288 ; RV32IZFINXZDINX-NEXT: beqz a6, .LBB5_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; RV32IZFINXZDINX-NEXT: mv a3, a1 +; RV32IZFINXZDINX-NEXT: mv a5, a1 ; RV32IZFINXZDINX-NEXT: .LBB5_2: ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a4, .LBB5_4 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB5_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; 
RV32IZFINXZDINX-NEXT: addi a3, a5, -1 +; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB5_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a3 +; RV32IZFINXZDINX-NEXT: and a1, a2, a5 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -430,15 +430,15 @@ define i64 @test_ceil_ui64(double %x) nounwind { ; RV32IFD-NEXT: call ceil ; RV32IFD-NEXT: lui a0, %hi(.LCPI7_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI7_0)(a0) -; RV32IFD-NEXT: flt.d a0, fa5, fa0 -; RV32IFD-NEXT: neg s0, a0 -; RV32IFD-NEXT: fcvt.d.w fa5, zero -; RV32IFD-NEXT: fle.d a0, fa5, fa0 +; RV32IFD-NEXT: fcvt.d.w fa4, zero +; RV32IFD-NEXT: fle.d a0, fa4, fa0 +; RV32IFD-NEXT: flt.d a1, fa5, fa0 +; RV32IFD-NEXT: neg s0, a1 ; RV32IFD-NEXT: neg s1, a0 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 -; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: and a1, s1, a1 +; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: or a1, s0, a1 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -467,16 +467,16 @@ define i64 @test_ceil_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero ; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI7_0) +; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI7_0+4)(a4) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI7_0)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a3, a3 -; RV32IZFINXZDINX-NEXT: or a0, a3, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: or a1, a3, a1 +; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: or a1, a2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -541,7 +541,7 @@ define i64 @test_trunc_si64(double %x) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB9_2 ; RV32IFD-NEXT: # %bb.1: @@ -549,19 +549,19 @@ define i64 @test_trunc_si64(double %x) nounwind { ; RV32IFD-NEXT: .LBB9_2: ; RV32IFD-NEXT: lui a1, %hi(.LCPI9_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI9_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB9_4 +; RV32IFD-NEXT: flt.d a1, fa5, fs0 +; RV32IFD-NEXT: beqz a1, .LBB9_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB9_4: -; RV32IFD-NEXT: feq.d a1, fs0, fs0 +; RV32IFD-NEXT: feq.d a3, fs0, fs0 ; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 +; RV32IFD-NEXT: neg a1, s0 +; RV32IFD-NEXT: neg a3, a3 +; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: or a0, a4, a0 ; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 -; RV32IFD-NEXT: and a0, a4, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -588,31 +588,31 @@ define i64 @test_trunc_si64(double %x) nounwind { ; 
RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI9_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI9_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI9_0)(a2) -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI9_1) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI9_1+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI9_1)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a6 -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a4 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI9_1) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI9_0)(a2) +; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI9_0+4)(a2) +; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI9_1)(a3) +; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI9_1+4)(a3) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 +; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: lui a4, 524288 ; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: neg a5, a6 +; RV32IZFINXZDINX-NEXT: and a0, a5, a0 +; RV32IZFINXZDINX-NEXT: neg a5, a3 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 ; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: lui a3, 524288 ; RV32IZFINXZDINX-NEXT: beqz a6, .LBB9_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; RV32IZFINXZDINX-NEXT: mv a3, a1 +; RV32IZFINXZDINX-NEXT: mv a5, a1 ; RV32IZFINXZDINX-NEXT: .LBB9_2: ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a4, .LBB9_4 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB9_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a3, a5, -1 +; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB9_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a3 +; RV32IZFINXZDINX-NEXT: and a1, a2, a5 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -674,15 +674,15 @@ define i64 @test_trunc_ui64(double %x) nounwind { ; RV32IFD-NEXT: call trunc ; RV32IFD-NEXT: lui a0, %hi(.LCPI11_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; RV32IFD-NEXT: flt.d a0, fa5, fa0 -; RV32IFD-NEXT: neg s0, a0 -; RV32IFD-NEXT: fcvt.d.w fa5, zero -; RV32IFD-NEXT: fle.d a0, fa5, fa0 +; RV32IFD-NEXT: fcvt.d.w fa4, zero +; RV32IFD-NEXT: fle.d a0, fa4, fa0 +; RV32IFD-NEXT: flt.d a1, fa5, fa0 +; RV32IFD-NEXT: neg s0, a1 ; RV32IFD-NEXT: neg s1, a0 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 -; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: and a1, s1, a1 +; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: or a1, s0, a1 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -711,16 +711,16 @@ define i64 @test_trunc_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero ; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI11_0) +; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI11_0+4)(a4) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI11_0)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a3, a3 -; RV32IZFINXZDINX-NEXT: or a0, a3, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: or a1, a3, a1 +; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; 
RV32IZFINXZDINX-NEXT: or a1, a2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -785,7 +785,7 @@ define i64 @test_round_si64(double %x) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB13_2 ; RV32IFD-NEXT: # %bb.1: @@ -793,19 +793,19 @@ define i64 @test_round_si64(double %x) nounwind { ; RV32IFD-NEXT: .LBB13_2: ; RV32IFD-NEXT: lui a1, %hi(.LCPI13_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI13_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB13_4 +; RV32IFD-NEXT: flt.d a1, fa5, fs0 +; RV32IFD-NEXT: beqz a1, .LBB13_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB13_4: -; RV32IFD-NEXT: feq.d a1, fs0, fs0 +; RV32IFD-NEXT: feq.d a3, fs0, fs0 ; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 +; RV32IFD-NEXT: neg a1, s0 +; RV32IFD-NEXT: neg a3, a3 +; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: or a0, a4, a0 ; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 -; RV32IFD-NEXT: and a0, a4, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -832,31 +832,31 @@ define i64 @test_round_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI13_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI13_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI13_0)(a2) -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI13_1) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI13_1+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI13_1)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a6 -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a4 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI13_1) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI13_0)(a2) +; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI13_0+4)(a2) +; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI13_1)(a3) +; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI13_1+4)(a3) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 +; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: lui a4, 524288 ; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: neg a5, a6 +; RV32IZFINXZDINX-NEXT: and a0, a5, a0 +; RV32IZFINXZDINX-NEXT: neg a5, a3 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 ; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: lui a3, 524288 ; RV32IZFINXZDINX-NEXT: beqz a6, .LBB13_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; RV32IZFINXZDINX-NEXT: mv a3, a1 +; RV32IZFINXZDINX-NEXT: mv a5, a1 ; RV32IZFINXZDINX-NEXT: .LBB13_2: ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a4, .LBB13_4 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB13_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a3, a5, -1 +; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB13_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a3 +; RV32IZFINXZDINX-NEXT: and a1, a2, a5 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; 
RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -918,15 +918,15 @@ define i64 @test_round_ui64(double %x) nounwind { ; RV32IFD-NEXT: call round ; RV32IFD-NEXT: lui a0, %hi(.LCPI15_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI15_0)(a0) -; RV32IFD-NEXT: flt.d a0, fa5, fa0 -; RV32IFD-NEXT: neg s0, a0 -; RV32IFD-NEXT: fcvt.d.w fa5, zero -; RV32IFD-NEXT: fle.d a0, fa5, fa0 +; RV32IFD-NEXT: fcvt.d.w fa4, zero +; RV32IFD-NEXT: fle.d a0, fa4, fa0 +; RV32IFD-NEXT: flt.d a1, fa5, fa0 +; RV32IFD-NEXT: neg s0, a1 ; RV32IFD-NEXT: neg s1, a0 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 -; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: and a1, s1, a1 +; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: or a1, s0, a1 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -955,16 +955,16 @@ define i64 @test_round_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero ; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI15_0) +; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI15_0+4)(a4) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI15_0)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a3, a3 -; RV32IZFINXZDINX-NEXT: or a0, a3, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: or a1, a3, a1 +; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: or a1, a2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1029,7 +1029,7 @@ define i64 @test_roundeven_si64(double %x) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB17_2 ; RV32IFD-NEXT: # %bb.1: @@ -1037,19 +1037,19 @@ define i64 @test_roundeven_si64(double %x) nounwind { ; RV32IFD-NEXT: .LBB17_2: ; RV32IFD-NEXT: lui a1, %hi(.LCPI17_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI17_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB17_4 +; RV32IFD-NEXT: flt.d a1, fa5, fs0 +; RV32IFD-NEXT: beqz a1, .LBB17_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB17_4: -; RV32IFD-NEXT: feq.d a1, fs0, fs0 +; RV32IFD-NEXT: feq.d a3, fs0, fs0 ; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 +; RV32IFD-NEXT: neg a1, s0 +; RV32IFD-NEXT: neg a3, a3 +; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: or a0, a4, a0 ; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 -; RV32IFD-NEXT: and a0, a4, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -1076,31 +1076,31 @@ define i64 @test_roundeven_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI17_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI17_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI17_0)(a2) 
-; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI17_1) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI17_1+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI17_1)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a6 -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a4 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI17_1) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI17_0)(a2) +; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI17_0+4)(a2) +; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI17_1)(a3) +; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI17_1+4)(a3) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 +; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: lui a4, 524288 ; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: neg a5, a6 +; RV32IZFINXZDINX-NEXT: and a0, a5, a0 +; RV32IZFINXZDINX-NEXT: neg a5, a3 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 ; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: lui a3, 524288 ; RV32IZFINXZDINX-NEXT: beqz a6, .LBB17_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; RV32IZFINXZDINX-NEXT: mv a3, a1 +; RV32IZFINXZDINX-NEXT: mv a5, a1 ; RV32IZFINXZDINX-NEXT: .LBB17_2: ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a4, .LBB17_4 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB17_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a3, a5, -1 +; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB17_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a3 +; RV32IZFINXZDINX-NEXT: and a1, a2, a5 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1162,15 +1162,15 @@ define i64 @test_roundeven_ui64(double %x) nounwind { ; RV32IFD-NEXT: call roundeven ; RV32IFD-NEXT: lui a0, %hi(.LCPI19_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI19_0)(a0) -; RV32IFD-NEXT: flt.d a0, fa5, fa0 -; RV32IFD-NEXT: neg s0, a0 -; RV32IFD-NEXT: fcvt.d.w fa5, zero -; RV32IFD-NEXT: fle.d a0, fa5, fa0 +; RV32IFD-NEXT: fcvt.d.w fa4, zero +; RV32IFD-NEXT: fle.d a0, fa4, fa0 +; RV32IFD-NEXT: flt.d a1, fa5, fa0 +; RV32IFD-NEXT: neg s0, a1 ; RV32IFD-NEXT: neg s1, a0 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 -; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: and a1, s1, a1 +; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: or a1, s0, a1 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1199,16 +1199,16 @@ define i64 @test_roundeven_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero ; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI19_0) +; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI19_0+4)(a4) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI19_0)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a3, a3 -; RV32IZFINXZDINX-NEXT: or a0, a3, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: or a1, a3, a1 +; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: or a1, a2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 
4-byte Folded Reload @@ -1273,7 +1273,7 @@ define i64 @test_rint_si64(double %x) nounwind { ; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi -; RV32IFD-NEXT: lui a4, 524288 +; RV32IFD-NEXT: lui a3, 524288 ; RV32IFD-NEXT: lui a2, 524288 ; RV32IFD-NEXT: beqz s0, .LBB21_2 ; RV32IFD-NEXT: # %bb.1: @@ -1281,19 +1281,19 @@ define i64 @test_rint_si64(double %x) nounwind { ; RV32IFD-NEXT: .LBB21_2: ; RV32IFD-NEXT: lui a1, %hi(.LCPI21_1) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI21_1)(a1) -; RV32IFD-NEXT: flt.d a3, fa5, fs0 -; RV32IFD-NEXT: beqz a3, .LBB21_4 +; RV32IFD-NEXT: flt.d a1, fa5, fs0 +; RV32IFD-NEXT: beqz a1, .LBB21_4 ; RV32IFD-NEXT: # %bb.3: -; RV32IFD-NEXT: addi a2, a4, -1 +; RV32IFD-NEXT: addi a2, a3, -1 ; RV32IFD-NEXT: .LBB21_4: -; RV32IFD-NEXT: feq.d a1, fs0, fs0 +; RV32IFD-NEXT: feq.d a3, fs0, fs0 ; RV32IFD-NEXT: neg a4, a1 -; RV32IFD-NEXT: and a1, a4, a2 -; RV32IFD-NEXT: neg a2, a3 -; RV32IFD-NEXT: neg a3, s0 +; RV32IFD-NEXT: neg a1, s0 +; RV32IFD-NEXT: neg a3, a3 +; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: or a0, a4, a0 ; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: or a0, a2, a0 -; RV32IFD-NEXT: and a0, a4, a0 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -1320,31 +1320,31 @@ define i64 @test_rint_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI21_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI21_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI21_0)(a2) -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI21_1) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI21_1+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI21_1)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a6 -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a4 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI21_1) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI21_0)(a2) +; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI21_0+4)(a2) +; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI21_1)(a3) +; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI21_1+4)(a3) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 +; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: lui a4, 524288 ; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: neg a5, a6 +; RV32IZFINXZDINX-NEXT: and a0, a5, a0 +; RV32IZFINXZDINX-NEXT: neg a5, a3 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 ; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: lui a3, 524288 ; RV32IZFINXZDINX-NEXT: beqz a6, .LBB21_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; RV32IZFINXZDINX-NEXT: mv a3, a1 +; RV32IZFINXZDINX-NEXT: mv a5, a1 ; RV32IZFINXZDINX-NEXT: .LBB21_2: ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a4, .LBB21_4 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB21_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a3, a5, -1 +; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB21_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a3 +; RV32IZFINXZDINX-NEXT: and a1, a2, a5 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1406,15 +1406,15 @@ define i64 @test_rint_ui64(double %x) nounwind { ; RV32IFD-NEXT: call 
rint ; RV32IFD-NEXT: lui a0, %hi(.LCPI23_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI23_0)(a0) -; RV32IFD-NEXT: flt.d a0, fa5, fa0 -; RV32IFD-NEXT: neg s0, a0 -; RV32IFD-NEXT: fcvt.d.w fa5, zero -; RV32IFD-NEXT: fle.d a0, fa5, fa0 +; RV32IFD-NEXT: fcvt.d.w fa4, zero +; RV32IFD-NEXT: fle.d a0, fa4, fa0 +; RV32IFD-NEXT: flt.d a1, fa5, fa0 +; RV32IFD-NEXT: neg s0, a1 ; RV32IFD-NEXT: neg s1, a0 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 -; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: and a1, s1, a1 +; RV32IFD-NEXT: or a0, s0, a0 ; RV32IFD-NEXT: or a1, s0, a1 ; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1443,16 +1443,16 @@ define i64 @test_rint_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero ; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI23_0) +; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI23_0+4)(a4) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI23_0)(a4) -; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a3, a3 -; RV32IZFINXZDINX-NEXT: or a0, a3, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: or a1, a3, a1 +; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: or a0, a2, a0 +; RV32IZFINXZDINX-NEXT: or a1, a2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/double-select-fcmp.ll b/llvm/test/CodeGen/RISCV/double-select-fcmp.ll index 654a4609caa23..e7ff991413013 100644 --- a/llvm/test/CodeGen/RISCV/double-select-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/double-select-fcmp.ll @@ -577,9 +577,9 @@ define i32 @select_fcmp_oeq_1_2(double %a, double %b) { ; ; CHECKRV32ZDINX-LABEL: select_fcmp_oeq_1_2: ; CHECKRV32ZDINX: # %bb.0: +; CHECKRV32ZDINX-NEXT: li a4, 2 ; CHECKRV32ZDINX-NEXT: feq.d a0, a0, a2 -; CHECKRV32ZDINX-NEXT: li a1, 2 -; CHECKRV32ZDINX-NEXT: sub a0, a1, a0 +; CHECKRV32ZDINX-NEXT: sub a0, a4, a0 ; CHECKRV32ZDINX-NEXT: ret ; ; CHECKRV64ZDINX-LABEL: select_fcmp_oeq_1_2: diff --git a/llvm/test/CodeGen/RISCV/double_reduct.ll b/llvm/test/CodeGen/RISCV/double_reduct.ll index 25228b21ef055..cecdd77a079e4 100644 --- a/llvm/test/CodeGen/RISCV/double_reduct.ll +++ b/llvm/test/CodeGen/RISCV/double_reduct.ll @@ -25,14 +25,14 @@ define float @fmul_f32(<4 x float> %a, <4 x float> %b) { ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 2 ; CHECK-NEXT: vfmul.vv v8, v8, v10 +; CHECK-NEXT: vslidedown.vi v10, v9, 2 +; CHECK-NEXT: vfmul.vv v9, v9, v10 ; CHECK-NEXT: vrgather.vi v10, v8, 1 ; CHECK-NEXT: vfmul.vv v8, v8, v10 +; CHECK-NEXT: vrgather.vi v10, v9, 1 +; CHECK-NEXT: vfmul.vv v9, v9, v10 ; CHECK-NEXT: vfmv.f.s fa5, v8 -; CHECK-NEXT: vslidedown.vi v8, v9, 2 -; CHECK-NEXT: vfmul.vv v8, v9, v8 -; CHECK-NEXT: vrgather.vi v9, v8, 1 -; CHECK-NEXT: vfmul.vv v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa4, v8 +; CHECK-NEXT: vfmv.f.s fa4, v9 ; CHECK-NEXT: fmul.s fa0, fa5, fa4 ; CHECK-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) @@ -130,14 +130,14 @@ define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) { ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vslidedown.vi v10, v8, 2 ; RV32-NEXT: vmul.vv v8, v8, 
v10 +; RV32-NEXT: vslidedown.vi v10, v9, 2 +; RV32-NEXT: vmul.vv v9, v9, v10 ; RV32-NEXT: vrgather.vi v10, v8, 1 ; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: vrgather.vi v10, v9, 1 +; RV32-NEXT: vmul.vv v9, v9, v10 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: vslidedown.vi v8, v9, 2 -; RV32-NEXT: vmul.vv v8, v9, v8 -; RV32-NEXT: vrgather.vi v9, v8, 1 -; RV32-NEXT: vmul.vv v8, v8, v9 -; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: ret ; @@ -146,14 +146,14 @@ define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) { ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vslidedown.vi v10, v8, 2 ; RV64-NEXT: vmul.vv v8, v8, v10 +; RV64-NEXT: vslidedown.vi v10, v9, 2 +; RV64-NEXT: vmul.vv v9, v9, v10 ; RV64-NEXT: vrgather.vi v10, v8, 1 ; RV64-NEXT: vmul.vv v8, v8, v10 +; RV64-NEXT: vrgather.vi v10, v9, 1 +; RV64-NEXT: vmul.vv v9, v9, v10 ; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: vslidedown.vi v8, v9, 2 -; RV64-NEXT: vmul.vv v8, v9, v8 -; RV64-NEXT: vrgather.vi v9, v8, 1 -; RV64-NEXT: vmul.vv v8, v8, v9 -; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: vmv.x.s a1, v9 ; RV64-NEXT: mulw a0, a0, a1 ; RV64-NEXT: ret %r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a) diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll index 0a3b5d47e5650..26e86d41176e0 100644 --- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll +++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll @@ -49,10 +49,10 @@ define void @_Z3foov() { ; CHECK-NEXT: vs2r.v v12, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v14, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: #NO_APP ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_40) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_40) +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, 1048572 diff --git a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll index 7523119c4ff77..8a91c46bcdaff 100644 --- a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll +++ b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll @@ -280,17 +280,20 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX32-NEXT: sw s9, 68(sp) # 4-byte Folded Spill ; ZHINX32-NEXT: sw s10, 64(sp) # 4-byte Folded Spill ; ZHINX32-NEXT: sw s11, 60(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lh t0, 112(sp) -; ZHINX32-NEXT: sh t0, 58(sp) # 2-byte Folded Spill -; ZHINX32-NEXT: lh t0, 116(sp) -; ZHINX32-NEXT: sh t0, 56(sp) # 2-byte Folded Spill -; ZHINX32-NEXT: lh t0, 120(sp) -; ZHINX32-NEXT: sh t0, 54(sp) # 2-byte Folded Spill -; ZHINX32-NEXT: lh t0, 124(sp) -; ZHINX32-NEXT: sh t0, 52(sp) # 2-byte Folded Spill -; ZHINX32-NEXT: lh t6, 128(sp) -; ZHINX32-NEXT: lh t4, 132(sp) -; ZHINX32-NEXT: lh t5, 136(sp) +; ZHINX32-NEXT: sh a7, 58(sp) # 2-byte Folded Spill +; ZHINX32-NEXT: sh a6, 56(sp) # 2-byte Folded Spill +; ZHINX32-NEXT: sh a5, 54(sp) # 2-byte Folded Spill +; ZHINX32-NEXT: sh a4, 52(sp) # 2-byte Folded Spill +; ZHINX32-NEXT: mv a7, a3 +; ZHINX32-NEXT: mv a6, a2 +; ZHINX32-NEXT: mv a5, a1 +; ZHINX32-NEXT: lh t3, 112(sp) +; ZHINX32-NEXT: lh t4, 116(sp) +; ZHINX32-NEXT: lh t5, 120(sp) +; ZHINX32-NEXT: lh t6, 124(sp) +; ZHINX32-NEXT: lh t0, 128(sp) +; ZHINX32-NEXT: lh t1, 132(sp) +; ZHINX32-NEXT: lh t2, 136(sp) ; ZHINX32-NEXT: lh s0, 140(sp) ; ZHINX32-NEXT: 
lh s1, 144(sp) ; ZHINX32-NEXT: lh s2, 148(sp) @@ -304,14 +307,14 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX32-NEXT: lh s10, 180(sp) ; ZHINX32-NEXT: lh s11, 184(sp) ; ZHINX32-NEXT: lh ra, 188(sp) -; ZHINX32-NEXT: lh t0, 192(sp) -; ZHINX32-NEXT: lh t1, 196(sp) -; ZHINX32-NEXT: lh t2, 200(sp) -; ZHINX32-NEXT: lh t3, 204(sp) -; ZHINX32-NEXT: sh t0, 32(sp) -; ZHINX32-NEXT: sh t1, 34(sp) -; ZHINX32-NEXT: sh t2, 36(sp) -; ZHINX32-NEXT: sh t3, 38(sp) +; ZHINX32-NEXT: lh a1, 192(sp) +; ZHINX32-NEXT: lh a2, 196(sp) +; ZHINX32-NEXT: lh a3, 200(sp) +; ZHINX32-NEXT: lh a4, 204(sp) +; ZHINX32-NEXT: sh a1, 32(sp) +; ZHINX32-NEXT: sh a2, 34(sp) +; ZHINX32-NEXT: sh a3, 36(sp) +; ZHINX32-NEXT: sh a4, 38(sp) ; ZHINX32-NEXT: sh s9, 24(sp) ; ZHINX32-NEXT: sh s10, 26(sp) ; ZHINX32-NEXT: sh s11, 28(sp) @@ -324,14 +327,17 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX32-NEXT: sh s2, 10(sp) ; ZHINX32-NEXT: sh s3, 12(sp) ; ZHINX32-NEXT: sh s4, 14(sp) -; ZHINX32-NEXT: sh t6, 0(sp) -; ZHINX32-NEXT: sh t4, 2(sp) -; ZHINX32-NEXT: sh t5, 4(sp) +; ZHINX32-NEXT: sh t0, 0(sp) +; ZHINX32-NEXT: sh t1, 2(sp) +; ZHINX32-NEXT: sh t2, 4(sp) ; ZHINX32-NEXT: sh s0, 6(sp) -; ZHINX32-NEXT: lh t3, 58(sp) # 2-byte Folded Reload -; ZHINX32-NEXT: lh t4, 56(sp) # 2-byte Folded Reload -; ZHINX32-NEXT: lh t5, 54(sp) # 2-byte Folded Reload -; ZHINX32-NEXT: lh t6, 52(sp) # 2-byte Folded Reload +; ZHINX32-NEXT: mv a1, a5 +; ZHINX32-NEXT: mv a2, a6 +; ZHINX32-NEXT: mv a3, a7 +; ZHINX32-NEXT: lh a4, 52(sp) # 2-byte Folded Reload +; ZHINX32-NEXT: lh a5, 54(sp) # 2-byte Folded Reload +; ZHINX32-NEXT: lh a6, 56(sp) # 2-byte Folded Reload +; ZHINX32-NEXT: lh a7, 58(sp) # 2-byte Folded Reload ; ZHINX32-NEXT: call callee_half_32 ; ZHINX32-NEXT: lw ra, 108(sp) # 4-byte Folded Reload ; ZHINX32-NEXT: lw s0, 104(sp) # 4-byte Folded Reload @@ -365,17 +371,20 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX64-NEXT: sd s9, 72(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: sd s11, 56(sp) # 8-byte Folded Spill -; ZHINX64-NEXT: lh t0, 160(sp) -; ZHINX64-NEXT: sh t0, 54(sp) # 2-byte Folded Spill -; ZHINX64-NEXT: lh t0, 168(sp) -; ZHINX64-NEXT: sh t0, 52(sp) # 2-byte Folded Spill -; ZHINX64-NEXT: lh t0, 176(sp) -; ZHINX64-NEXT: sh t0, 50(sp) # 2-byte Folded Spill -; ZHINX64-NEXT: lh t0, 184(sp) -; ZHINX64-NEXT: sh t0, 48(sp) # 2-byte Folded Spill -; ZHINX64-NEXT: lh t6, 192(sp) -; ZHINX64-NEXT: lh t4, 200(sp) -; ZHINX64-NEXT: lh t5, 208(sp) +; ZHINX64-NEXT: sh a7, 54(sp) # 2-byte Folded Spill +; ZHINX64-NEXT: sh a6, 52(sp) # 2-byte Folded Spill +; ZHINX64-NEXT: sh a5, 50(sp) # 2-byte Folded Spill +; ZHINX64-NEXT: sh a4, 48(sp) # 2-byte Folded Spill +; ZHINX64-NEXT: mv a7, a3 +; ZHINX64-NEXT: mv a6, a2 +; ZHINX64-NEXT: mv a5, a1 +; ZHINX64-NEXT: lh t3, 160(sp) +; ZHINX64-NEXT: lh t4, 168(sp) +; ZHINX64-NEXT: lh t5, 176(sp) +; ZHINX64-NEXT: lh t6, 184(sp) +; ZHINX64-NEXT: lh t0, 192(sp) +; ZHINX64-NEXT: lh t1, 200(sp) +; ZHINX64-NEXT: lh t2, 208(sp) ; ZHINX64-NEXT: lh s0, 216(sp) ; ZHINX64-NEXT: lh s1, 224(sp) ; ZHINX64-NEXT: lh s2, 232(sp) @@ -389,14 +398,14 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX64-NEXT: lh s10, 296(sp) ; ZHINX64-NEXT: lh s11, 304(sp) ; ZHINX64-NEXT: lh ra, 312(sp) -; ZHINX64-NEXT: lh t0, 320(sp) -; ZHINX64-NEXT: lh t1, 328(sp) -; ZHINX64-NEXT: lh t2, 336(sp) -; ZHINX64-NEXT: lh t3, 344(sp) -; ZHINX64-NEXT: sh t0, 32(sp) -; ZHINX64-NEXT: sh t1, 34(sp) -; ZHINX64-NEXT: sh t2, 36(sp) -; ZHINX64-NEXT: sh t3, 38(sp) +; 
ZHINX64-NEXT: lh a1, 320(sp) +; ZHINX64-NEXT: lh a2, 328(sp) +; ZHINX64-NEXT: lh a3, 336(sp) +; ZHINX64-NEXT: lh a4, 344(sp) +; ZHINX64-NEXT: sh a1, 32(sp) +; ZHINX64-NEXT: sh a2, 34(sp) +; ZHINX64-NEXT: sh a3, 36(sp) +; ZHINX64-NEXT: sh a4, 38(sp) ; ZHINX64-NEXT: sh s9, 24(sp) ; ZHINX64-NEXT: sh s10, 26(sp) ; ZHINX64-NEXT: sh s11, 28(sp) @@ -409,14 +418,17 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX64-NEXT: sh s2, 10(sp) ; ZHINX64-NEXT: sh s3, 12(sp) ; ZHINX64-NEXT: sh s4, 14(sp) -; ZHINX64-NEXT: sh t6, 0(sp) -; ZHINX64-NEXT: sh t4, 2(sp) -; ZHINX64-NEXT: sh t5, 4(sp) +; ZHINX64-NEXT: sh t0, 0(sp) +; ZHINX64-NEXT: sh t1, 2(sp) +; ZHINX64-NEXT: sh t2, 4(sp) ; ZHINX64-NEXT: sh s0, 6(sp) -; ZHINX64-NEXT: lh t3, 54(sp) # 2-byte Folded Reload -; ZHINX64-NEXT: lh t4, 52(sp) # 2-byte Folded Reload -; ZHINX64-NEXT: lh t5, 50(sp) # 2-byte Folded Reload -; ZHINX64-NEXT: lh t6, 48(sp) # 2-byte Folded Reload +; ZHINX64-NEXT: mv a1, a5 +; ZHINX64-NEXT: mv a2, a6 +; ZHINX64-NEXT: mv a3, a7 +; ZHINX64-NEXT: lh a4, 48(sp) # 2-byte Folded Reload +; ZHINX64-NEXT: lh a5, 50(sp) # 2-byte Folded Reload +; ZHINX64-NEXT: lh a6, 52(sp) # 2-byte Folded Reload +; ZHINX64-NEXT: lh a7, 54(sp) # 2-byte Folded Reload ; ZHINX64-NEXT: call callee_half_32 ; ZHINX64-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; ZHINX64-NEXT: ld s0, 144(sp) # 8-byte Folded Reload @@ -874,17 +886,20 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill ; ZHINX32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill ; ZHINX32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t0, 160(sp) -; ZHINX32-NEXT: sw t0, 104(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t0, 164(sp) -; ZHINX32-NEXT: sw t0, 100(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t0, 168(sp) -; ZHINX32-NEXT: sw t0, 96(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t0, 172(sp) -; ZHINX32-NEXT: sw t0, 92(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t6, 176(sp) -; ZHINX32-NEXT: lw t4, 180(sp) -; ZHINX32-NEXT: lw t5, 184(sp) +; ZHINX32-NEXT: sw a7, 104(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw a6, 100(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw a5, 96(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw a4, 92(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: mv a7, a3 +; ZHINX32-NEXT: mv a6, a2 +; ZHINX32-NEXT: mv a5, a1 +; ZHINX32-NEXT: lw t3, 160(sp) +; ZHINX32-NEXT: lw t4, 164(sp) +; ZHINX32-NEXT: lw t5, 168(sp) +; ZHINX32-NEXT: lw t6, 172(sp) +; ZHINX32-NEXT: lw t0, 176(sp) +; ZHINX32-NEXT: lw t1, 180(sp) +; ZHINX32-NEXT: lw t2, 184(sp) ; ZHINX32-NEXT: lw s0, 188(sp) ; ZHINX32-NEXT: lw s1, 192(sp) ; ZHINX32-NEXT: lw s2, 196(sp) @@ -898,14 +913,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX32-NEXT: lw s10, 228(sp) ; ZHINX32-NEXT: lw s11, 232(sp) ; ZHINX32-NEXT: lw ra, 236(sp) -; ZHINX32-NEXT: lw t0, 240(sp) -; ZHINX32-NEXT: lw t1, 244(sp) -; ZHINX32-NEXT: lw t2, 248(sp) -; ZHINX32-NEXT: lw t3, 252(sp) -; ZHINX32-NEXT: sw t0, 64(sp) -; ZHINX32-NEXT: sw t1, 68(sp) -; ZHINX32-NEXT: sw t2, 72(sp) -; ZHINX32-NEXT: sw t3, 76(sp) +; ZHINX32-NEXT: lw a1, 240(sp) +; ZHINX32-NEXT: lw a2, 244(sp) +; ZHINX32-NEXT: lw a3, 248(sp) +; ZHINX32-NEXT: lw a4, 252(sp) +; ZHINX32-NEXT: sw a1, 64(sp) +; ZHINX32-NEXT: sw a2, 68(sp) +; ZHINX32-NEXT: sw a3, 72(sp) +; ZHINX32-NEXT: sw a4, 76(sp) ; ZHINX32-NEXT: sw s9, 48(sp) ; ZHINX32-NEXT: sw s10, 52(sp) ; ZHINX32-NEXT: sw s11, 56(sp) @@ -918,14 +933,17 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX32-NEXT: sw s2, 20(sp) ; 
ZHINX32-NEXT: sw s3, 24(sp) ; ZHINX32-NEXT: sw s4, 28(sp) -; ZHINX32-NEXT: sw t6, 0(sp) -; ZHINX32-NEXT: sw t4, 4(sp) -; ZHINX32-NEXT: sw t5, 8(sp) +; ZHINX32-NEXT: sw t0, 0(sp) +; ZHINX32-NEXT: sw t1, 4(sp) +; ZHINX32-NEXT: sw t2, 8(sp) ; ZHINX32-NEXT: sw s0, 12(sp) -; ZHINX32-NEXT: lw t3, 104(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t4, 100(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t5, 96(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t6, 92(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: mv a1, a5 +; ZHINX32-NEXT: mv a2, a6 +; ZHINX32-NEXT: mv a3, a7 +; ZHINX32-NEXT: lw a4, 92(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw a5, 96(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw a6, 100(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw a7, 104(sp) # 4-byte Folded Reload ; ZHINX32-NEXT: call callee_float_32 ; ZHINX32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload ; ZHINX32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload @@ -959,17 +977,20 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX64-NEXT: sd s9, 120(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: sd s10, 112(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: sd s11, 104(sp) # 8-byte Folded Spill -; ZHINX64-NEXT: lw t0, 208(sp) -; ZHINX64-NEXT: sw t0, 100(sp) # 4-byte Folded Spill -; ZHINX64-NEXT: lw t0, 216(sp) -; ZHINX64-NEXT: sw t0, 96(sp) # 4-byte Folded Spill -; ZHINX64-NEXT: lw t0, 224(sp) -; ZHINX64-NEXT: sw t0, 92(sp) # 4-byte Folded Spill -; ZHINX64-NEXT: lw t0, 232(sp) -; ZHINX64-NEXT: sw t0, 88(sp) # 4-byte Folded Spill -; ZHINX64-NEXT: lw t6, 240(sp) -; ZHINX64-NEXT: lw t4, 248(sp) -; ZHINX64-NEXT: lw t5, 256(sp) +; ZHINX64-NEXT: sw a7, 100(sp) # 4-byte Folded Spill +; ZHINX64-NEXT: sw a6, 96(sp) # 4-byte Folded Spill +; ZHINX64-NEXT: sw a5, 92(sp) # 4-byte Folded Spill +; ZHINX64-NEXT: sw a4, 88(sp) # 4-byte Folded Spill +; ZHINX64-NEXT: mv a7, a3 +; ZHINX64-NEXT: mv a6, a2 +; ZHINX64-NEXT: mv a5, a1 +; ZHINX64-NEXT: lw t3, 208(sp) +; ZHINX64-NEXT: lw t4, 216(sp) +; ZHINX64-NEXT: lw t5, 224(sp) +; ZHINX64-NEXT: lw t6, 232(sp) +; ZHINX64-NEXT: lw t0, 240(sp) +; ZHINX64-NEXT: lw t1, 248(sp) +; ZHINX64-NEXT: lw t2, 256(sp) ; ZHINX64-NEXT: lw s0, 264(sp) ; ZHINX64-NEXT: lw s1, 272(sp) ; ZHINX64-NEXT: lw s2, 280(sp) @@ -983,14 +1004,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX64-NEXT: lw s10, 344(sp) ; ZHINX64-NEXT: lw s11, 352(sp) ; ZHINX64-NEXT: lw ra, 360(sp) -; ZHINX64-NEXT: lw t0, 368(sp) -; ZHINX64-NEXT: lw t1, 376(sp) -; ZHINX64-NEXT: lw t2, 384(sp) -; ZHINX64-NEXT: lw t3, 392(sp) -; ZHINX64-NEXT: sw t0, 64(sp) -; ZHINX64-NEXT: sw t1, 68(sp) -; ZHINX64-NEXT: sw t2, 72(sp) -; ZHINX64-NEXT: sw t3, 76(sp) +; ZHINX64-NEXT: lw a1, 368(sp) +; ZHINX64-NEXT: lw a2, 376(sp) +; ZHINX64-NEXT: lw a3, 384(sp) +; ZHINX64-NEXT: lw a4, 392(sp) +; ZHINX64-NEXT: sw a1, 64(sp) +; ZHINX64-NEXT: sw a2, 68(sp) +; ZHINX64-NEXT: sw a3, 72(sp) +; ZHINX64-NEXT: sw a4, 76(sp) ; ZHINX64-NEXT: sw s9, 48(sp) ; ZHINX64-NEXT: sw s10, 52(sp) ; ZHINX64-NEXT: sw s11, 56(sp) @@ -1003,14 +1024,17 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX64-NEXT: sw s2, 20(sp) ; ZHINX64-NEXT: sw s3, 24(sp) ; ZHINX64-NEXT: sw s4, 28(sp) -; ZHINX64-NEXT: sw t6, 0(sp) -; ZHINX64-NEXT: sw t4, 4(sp) -; ZHINX64-NEXT: sw t5, 8(sp) +; ZHINX64-NEXT: sw t0, 0(sp) +; ZHINX64-NEXT: sw t1, 4(sp) +; ZHINX64-NEXT: sw t2, 8(sp) ; ZHINX64-NEXT: sw s0, 12(sp) -; ZHINX64-NEXT: lw t3, 100(sp) # 4-byte Folded Reload -; ZHINX64-NEXT: lw t4, 96(sp) # 4-byte Folded Reload -; ZHINX64-NEXT: lw t5, 92(sp) # 4-byte Folded Reload -; ZHINX64-NEXT: 
lw t6, 88(sp) # 4-byte Folded Reload +; ZHINX64-NEXT: mv a1, a5 +; ZHINX64-NEXT: mv a2, a6 +; ZHINX64-NEXT: mv a3, a7 +; ZHINX64-NEXT: lw a4, 88(sp) # 4-byte Folded Reload +; ZHINX64-NEXT: lw a5, 92(sp) # 4-byte Folded Reload +; ZHINX64-NEXT: lw a6, 96(sp) # 4-byte Folded Reload +; ZHINX64-NEXT: lw a7, 100(sp) # 4-byte Folded Reload ; ZHINX64-NEXT: call callee_float_32 ; ZHINX64-NEXT: ld ra, 200(sp) # 8-byte Folded Reload ; ZHINX64-NEXT: ld s0, 192(sp) # 8-byte Folded Reload @@ -1044,17 +1068,20 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill ; ZFINX32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill ; ZFINX32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t0, 160(sp) -; ZFINX32-NEXT: sw t0, 104(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t0, 164(sp) -; ZFINX32-NEXT: sw t0, 100(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t0, 168(sp) -; ZFINX32-NEXT: sw t0, 96(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t0, 172(sp) -; ZFINX32-NEXT: sw t0, 92(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t6, 176(sp) -; ZFINX32-NEXT: lw t4, 180(sp) -; ZFINX32-NEXT: lw t5, 184(sp) +; ZFINX32-NEXT: sw a7, 104(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw a6, 100(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw a5, 96(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw a4, 92(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: mv a7, a3 +; ZFINX32-NEXT: mv a6, a2 +; ZFINX32-NEXT: mv a5, a1 +; ZFINX32-NEXT: lw t3, 160(sp) +; ZFINX32-NEXT: lw t4, 164(sp) +; ZFINX32-NEXT: lw t5, 168(sp) +; ZFINX32-NEXT: lw t6, 172(sp) +; ZFINX32-NEXT: lw t0, 176(sp) +; ZFINX32-NEXT: lw t1, 180(sp) +; ZFINX32-NEXT: lw t2, 184(sp) ; ZFINX32-NEXT: lw s0, 188(sp) ; ZFINX32-NEXT: lw s1, 192(sp) ; ZFINX32-NEXT: lw s2, 196(sp) @@ -1068,14 +1095,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX32-NEXT: lw s10, 228(sp) ; ZFINX32-NEXT: lw s11, 232(sp) ; ZFINX32-NEXT: lw ra, 236(sp) -; ZFINX32-NEXT: lw t0, 240(sp) -; ZFINX32-NEXT: lw t1, 244(sp) -; ZFINX32-NEXT: lw t2, 248(sp) -; ZFINX32-NEXT: lw t3, 252(sp) -; ZFINX32-NEXT: sw t0, 64(sp) -; ZFINX32-NEXT: sw t1, 68(sp) -; ZFINX32-NEXT: sw t2, 72(sp) -; ZFINX32-NEXT: sw t3, 76(sp) +; ZFINX32-NEXT: lw a1, 240(sp) +; ZFINX32-NEXT: lw a2, 244(sp) +; ZFINX32-NEXT: lw a3, 248(sp) +; ZFINX32-NEXT: lw a4, 252(sp) +; ZFINX32-NEXT: sw a1, 64(sp) +; ZFINX32-NEXT: sw a2, 68(sp) +; ZFINX32-NEXT: sw a3, 72(sp) +; ZFINX32-NEXT: sw a4, 76(sp) ; ZFINX32-NEXT: sw s9, 48(sp) ; ZFINX32-NEXT: sw s10, 52(sp) ; ZFINX32-NEXT: sw s11, 56(sp) @@ -1088,14 +1115,17 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX32-NEXT: sw s2, 20(sp) ; ZFINX32-NEXT: sw s3, 24(sp) ; ZFINX32-NEXT: sw s4, 28(sp) -; ZFINX32-NEXT: sw t6, 0(sp) -; ZFINX32-NEXT: sw t4, 4(sp) -; ZFINX32-NEXT: sw t5, 8(sp) +; ZFINX32-NEXT: sw t0, 0(sp) +; ZFINX32-NEXT: sw t1, 4(sp) +; ZFINX32-NEXT: sw t2, 8(sp) ; ZFINX32-NEXT: sw s0, 12(sp) -; ZFINX32-NEXT: lw t3, 104(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw t4, 100(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw t5, 96(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw t6, 92(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: mv a1, a5 +; ZFINX32-NEXT: mv a2, a6 +; ZFINX32-NEXT: mv a3, a7 +; ZFINX32-NEXT: lw a4, 92(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw a5, 96(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw a6, 100(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw a7, 104(sp) # 4-byte Folded Reload ; ZFINX32-NEXT: call callee_float_32 ; ZFINX32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload ; ZFINX32-NEXT: lw 
s0, 152(sp) # 4-byte Folded Reload @@ -1129,17 +1159,20 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX64-NEXT: sd s9, 120(sp) # 8-byte Folded Spill ; ZFINX64-NEXT: sd s10, 112(sp) # 8-byte Folded Spill ; ZFINX64-NEXT: sd s11, 104(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: lw t0, 208(sp) -; ZFINX64-NEXT: sw t0, 100(sp) # 4-byte Folded Spill -; ZFINX64-NEXT: lw t0, 216(sp) -; ZFINX64-NEXT: sw t0, 96(sp) # 4-byte Folded Spill -; ZFINX64-NEXT: lw t0, 224(sp) -; ZFINX64-NEXT: sw t0, 92(sp) # 4-byte Folded Spill -; ZFINX64-NEXT: lw t0, 232(sp) -; ZFINX64-NEXT: sw t0, 88(sp) # 4-byte Folded Spill -; ZFINX64-NEXT: lw t6, 240(sp) -; ZFINX64-NEXT: lw t4, 248(sp) -; ZFINX64-NEXT: lw t5, 256(sp) +; ZFINX64-NEXT: sw a7, 100(sp) # 4-byte Folded Spill +; ZFINX64-NEXT: sw a6, 96(sp) # 4-byte Folded Spill +; ZFINX64-NEXT: sw a5, 92(sp) # 4-byte Folded Spill +; ZFINX64-NEXT: sw a4, 88(sp) # 4-byte Folded Spill +; ZFINX64-NEXT: mv a7, a3 +; ZFINX64-NEXT: mv a6, a2 +; ZFINX64-NEXT: mv a5, a1 +; ZFINX64-NEXT: lw t3, 208(sp) +; ZFINX64-NEXT: lw t4, 216(sp) +; ZFINX64-NEXT: lw t5, 224(sp) +; ZFINX64-NEXT: lw t6, 232(sp) +; ZFINX64-NEXT: lw t0, 240(sp) +; ZFINX64-NEXT: lw t1, 248(sp) +; ZFINX64-NEXT: lw t2, 256(sp) ; ZFINX64-NEXT: lw s0, 264(sp) ; ZFINX64-NEXT: lw s1, 272(sp) ; ZFINX64-NEXT: lw s2, 280(sp) @@ -1153,14 +1186,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX64-NEXT: lw s10, 344(sp) ; ZFINX64-NEXT: lw s11, 352(sp) ; ZFINX64-NEXT: lw ra, 360(sp) -; ZFINX64-NEXT: lw t0, 368(sp) -; ZFINX64-NEXT: lw t1, 376(sp) -; ZFINX64-NEXT: lw t2, 384(sp) -; ZFINX64-NEXT: lw t3, 392(sp) -; ZFINX64-NEXT: sw t0, 64(sp) -; ZFINX64-NEXT: sw t1, 68(sp) -; ZFINX64-NEXT: sw t2, 72(sp) -; ZFINX64-NEXT: sw t3, 76(sp) +; ZFINX64-NEXT: lw a1, 368(sp) +; ZFINX64-NEXT: lw a2, 376(sp) +; ZFINX64-NEXT: lw a3, 384(sp) +; ZFINX64-NEXT: lw a4, 392(sp) +; ZFINX64-NEXT: sw a1, 64(sp) +; ZFINX64-NEXT: sw a2, 68(sp) +; ZFINX64-NEXT: sw a3, 72(sp) +; ZFINX64-NEXT: sw a4, 76(sp) ; ZFINX64-NEXT: sw s9, 48(sp) ; ZFINX64-NEXT: sw s10, 52(sp) ; ZFINX64-NEXT: sw s11, 56(sp) @@ -1173,14 +1206,17 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX64-NEXT: sw s2, 20(sp) ; ZFINX64-NEXT: sw s3, 24(sp) ; ZFINX64-NEXT: sw s4, 28(sp) -; ZFINX64-NEXT: sw t6, 0(sp) -; ZFINX64-NEXT: sw t4, 4(sp) -; ZFINX64-NEXT: sw t5, 8(sp) +; ZFINX64-NEXT: sw t0, 0(sp) +; ZFINX64-NEXT: sw t1, 4(sp) +; ZFINX64-NEXT: sw t2, 8(sp) ; ZFINX64-NEXT: sw s0, 12(sp) -; ZFINX64-NEXT: lw t3, 100(sp) # 4-byte Folded Reload -; ZFINX64-NEXT: lw t4, 96(sp) # 4-byte Folded Reload -; ZFINX64-NEXT: lw t5, 92(sp) # 4-byte Folded Reload -; ZFINX64-NEXT: lw t6, 88(sp) # 4-byte Folded Reload +; ZFINX64-NEXT: mv a1, a5 +; ZFINX64-NEXT: mv a2, a6 +; ZFINX64-NEXT: mv a3, a7 +; ZFINX64-NEXT: lw a4, 88(sp) # 4-byte Folded Reload +; ZFINX64-NEXT: lw a5, 92(sp) # 4-byte Folded Reload +; ZFINX64-NEXT: lw a6, 96(sp) # 4-byte Folded Reload +; ZFINX64-NEXT: lw a7, 100(sp) # 4-byte Folded Reload ; ZFINX64-NEXT: call callee_float_32 ; ZFINX64-NEXT: ld ra, 200(sp) # 8-byte Folded Reload ; ZFINX64-NEXT: ld s0, 192(sp) # 8-byte Folded Reload @@ -1214,17 +1250,20 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill ; ZDINX32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill ; ZDINX32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t0, 160(sp) -; ZDINX32-NEXT: sw t0, 104(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t0, 164(sp) -; ZDINX32-NEXT: sw t0, 100(sp) # 4-byte Folded 
Spill -; ZDINX32-NEXT: lw t0, 168(sp) -; ZDINX32-NEXT: sw t0, 96(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t0, 172(sp) -; ZDINX32-NEXT: sw t0, 92(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t6, 176(sp) -; ZDINX32-NEXT: lw t4, 180(sp) -; ZDINX32-NEXT: lw t5, 184(sp) +; ZDINX32-NEXT: sw a7, 104(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw a6, 100(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw a5, 96(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw a4, 92(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: mv a7, a3 +; ZDINX32-NEXT: mv a6, a2 +; ZDINX32-NEXT: mv a5, a1 +; ZDINX32-NEXT: lw t3, 160(sp) +; ZDINX32-NEXT: lw t4, 164(sp) +; ZDINX32-NEXT: lw t5, 168(sp) +; ZDINX32-NEXT: lw t6, 172(sp) +; ZDINX32-NEXT: lw t0, 176(sp) +; ZDINX32-NEXT: lw t1, 180(sp) +; ZDINX32-NEXT: lw t2, 184(sp) ; ZDINX32-NEXT: lw s0, 188(sp) ; ZDINX32-NEXT: lw s1, 192(sp) ; ZDINX32-NEXT: lw s2, 196(sp) @@ -1238,14 +1277,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX32-NEXT: lw s10, 228(sp) ; ZDINX32-NEXT: lw s11, 232(sp) ; ZDINX32-NEXT: lw ra, 236(sp) -; ZDINX32-NEXT: lw t0, 240(sp) -; ZDINX32-NEXT: lw t1, 244(sp) -; ZDINX32-NEXT: lw t2, 248(sp) -; ZDINX32-NEXT: lw t3, 252(sp) -; ZDINX32-NEXT: sw t0, 64(sp) -; ZDINX32-NEXT: sw t1, 68(sp) -; ZDINX32-NEXT: sw t2, 72(sp) -; ZDINX32-NEXT: sw t3, 76(sp) +; ZDINX32-NEXT: lw a1, 240(sp) +; ZDINX32-NEXT: lw a2, 244(sp) +; ZDINX32-NEXT: lw a3, 248(sp) +; ZDINX32-NEXT: lw a4, 252(sp) +; ZDINX32-NEXT: sw a1, 64(sp) +; ZDINX32-NEXT: sw a2, 68(sp) +; ZDINX32-NEXT: sw a3, 72(sp) +; ZDINX32-NEXT: sw a4, 76(sp) ; ZDINX32-NEXT: sw s9, 48(sp) ; ZDINX32-NEXT: sw s10, 52(sp) ; ZDINX32-NEXT: sw s11, 56(sp) @@ -1258,14 +1297,17 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX32-NEXT: sw s2, 20(sp) ; ZDINX32-NEXT: sw s3, 24(sp) ; ZDINX32-NEXT: sw s4, 28(sp) -; ZDINX32-NEXT: sw t6, 0(sp) -; ZDINX32-NEXT: sw t4, 4(sp) -; ZDINX32-NEXT: sw t5, 8(sp) +; ZDINX32-NEXT: sw t0, 0(sp) +; ZDINX32-NEXT: sw t1, 4(sp) +; ZDINX32-NEXT: sw t2, 8(sp) ; ZDINX32-NEXT: sw s0, 12(sp) -; ZDINX32-NEXT: lw t3, 104(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw t4, 100(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw t5, 96(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw t6, 92(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: mv a1, a5 +; ZDINX32-NEXT: mv a2, a6 +; ZDINX32-NEXT: mv a3, a7 +; ZDINX32-NEXT: lw a4, 92(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw a5, 96(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw a6, 100(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw a7, 104(sp) # 4-byte Folded Reload ; ZDINX32-NEXT: call callee_float_32 ; ZDINX32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload ; ZDINX32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload @@ -1299,17 +1341,20 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX64-NEXT: sd s9, 120(sp) # 8-byte Folded Spill ; ZDINX64-NEXT: sd s10, 112(sp) # 8-byte Folded Spill ; ZDINX64-NEXT: sd s11, 104(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: lw t0, 208(sp) -; ZDINX64-NEXT: sw t0, 100(sp) # 4-byte Folded Spill -; ZDINX64-NEXT: lw t0, 216(sp) -; ZDINX64-NEXT: sw t0, 96(sp) # 4-byte Folded Spill -; ZDINX64-NEXT: lw t0, 224(sp) -; ZDINX64-NEXT: sw t0, 92(sp) # 4-byte Folded Spill -; ZDINX64-NEXT: lw t0, 232(sp) -; ZDINX64-NEXT: sw t0, 88(sp) # 4-byte Folded Spill -; ZDINX64-NEXT: lw t6, 240(sp) -; ZDINX64-NEXT: lw t4, 248(sp) -; ZDINX64-NEXT: lw t5, 256(sp) +; ZDINX64-NEXT: sw a7, 100(sp) # 4-byte Folded Spill +; ZDINX64-NEXT: sw a6, 96(sp) # 4-byte Folded Spill +; ZDINX64-NEXT: sw a5, 92(sp) # 4-byte Folded Spill +; 
ZDINX64-NEXT: sw a4, 88(sp) # 4-byte Folded Spill +; ZDINX64-NEXT: mv a7, a3 +; ZDINX64-NEXT: mv a6, a2 +; ZDINX64-NEXT: mv a5, a1 +; ZDINX64-NEXT: lw t3, 208(sp) +; ZDINX64-NEXT: lw t4, 216(sp) +; ZDINX64-NEXT: lw t5, 224(sp) +; ZDINX64-NEXT: lw t6, 232(sp) +; ZDINX64-NEXT: lw t0, 240(sp) +; ZDINX64-NEXT: lw t1, 248(sp) +; ZDINX64-NEXT: lw t2, 256(sp) ; ZDINX64-NEXT: lw s0, 264(sp) ; ZDINX64-NEXT: lw s1, 272(sp) ; ZDINX64-NEXT: lw s2, 280(sp) @@ -1323,14 +1368,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX64-NEXT: lw s10, 344(sp) ; ZDINX64-NEXT: lw s11, 352(sp) ; ZDINX64-NEXT: lw ra, 360(sp) -; ZDINX64-NEXT: lw t0, 368(sp) -; ZDINX64-NEXT: lw t1, 376(sp) -; ZDINX64-NEXT: lw t2, 384(sp) -; ZDINX64-NEXT: lw t3, 392(sp) -; ZDINX64-NEXT: sw t0, 64(sp) -; ZDINX64-NEXT: sw t1, 68(sp) -; ZDINX64-NEXT: sw t2, 72(sp) -; ZDINX64-NEXT: sw t3, 76(sp) +; ZDINX64-NEXT: lw a1, 368(sp) +; ZDINX64-NEXT: lw a2, 376(sp) +; ZDINX64-NEXT: lw a3, 384(sp) +; ZDINX64-NEXT: lw a4, 392(sp) +; ZDINX64-NEXT: sw a1, 64(sp) +; ZDINX64-NEXT: sw a2, 68(sp) +; ZDINX64-NEXT: sw a3, 72(sp) +; ZDINX64-NEXT: sw a4, 76(sp) ; ZDINX64-NEXT: sw s9, 48(sp) ; ZDINX64-NEXT: sw s10, 52(sp) ; ZDINX64-NEXT: sw s11, 56(sp) @@ -1343,14 +1388,17 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX64-NEXT: sw s2, 20(sp) ; ZDINX64-NEXT: sw s3, 24(sp) ; ZDINX64-NEXT: sw s4, 28(sp) -; ZDINX64-NEXT: sw t6, 0(sp) -; ZDINX64-NEXT: sw t4, 4(sp) -; ZDINX64-NEXT: sw t5, 8(sp) +; ZDINX64-NEXT: sw t0, 0(sp) +; ZDINX64-NEXT: sw t1, 4(sp) +; ZDINX64-NEXT: sw t2, 8(sp) ; ZDINX64-NEXT: sw s0, 12(sp) -; ZDINX64-NEXT: lw t3, 100(sp) # 4-byte Folded Reload -; ZDINX64-NEXT: lw t4, 96(sp) # 4-byte Folded Reload -; ZDINX64-NEXT: lw t5, 92(sp) # 4-byte Folded Reload -; ZDINX64-NEXT: lw t6, 88(sp) # 4-byte Folded Reload +; ZDINX64-NEXT: mv a1, a5 +; ZDINX64-NEXT: mv a2, a6 +; ZDINX64-NEXT: mv a3, a7 +; ZDINX64-NEXT: lw a4, 88(sp) # 4-byte Folded Reload +; ZDINX64-NEXT: lw a5, 92(sp) # 4-byte Folded Reload +; ZDINX64-NEXT: lw a6, 96(sp) # 4-byte Folded Reload +; ZDINX64-NEXT: lw a7, 100(sp) # 4-byte Folded Reload ; ZDINX64-NEXT: call callee_float_32 ; ZDINX64-NEXT: ld ra, 200(sp) # 8-byte Folded Reload ; ZDINX64-NEXT: ld s0, 192(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/float-arith.ll b/llvm/test/CodeGen/RISCV/float-arith.ll index bf500d1a2adb3..57b3423da69a6 100644 --- a/llvm/test/CodeGen/RISCV/float-arith.ll +++ b/llvm/test/CodeGen/RISCV/float-arith.ll @@ -195,8 +195,8 @@ define float @fsgnj_s(float %a, float %b) nounwind { ; RV32I-LABEL: fsgnj_s: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret @@ -204,8 +204,8 @@ define float @fsgnj_s(float %a, float %b) nounwind { ; RV64I-LABEL: fsgnj_s: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 524288 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 33 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -284,8 +284,8 @@ define float @fsgnjn_s(float %a, float %b) nounwind { ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: slli s0, s0, 1 +; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: srli s0, s0, 1 ; RV32I-NEXT: or a0, s0, a0 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -302,8 +302,8 @@ define float @fsgnjn_s(float %a, float %b) nounwind { ; RV64I-NEXT: call 
__addsf3 ; RV64I-NEXT: not a0, a0 ; RV64I-NEXT: lui a1, 524288 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli s0, s0, 33 +; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: srli s0, s0, 33 ; RV64I-NEXT: or a0, s0, a0 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll index 86f6f079243c2..aaeb1b7c0b1fb 100644 --- a/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll +++ b/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll @@ -107,8 +107,8 @@ define float @fcopysign_fneg(float %a, float %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: not a1, a1 ; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret @@ -131,8 +131,8 @@ define float @fcopysign_fneg(float %a, float %b) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: not a1, a1 ; RV64I-NEXT: lui a2, 524288 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 33 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll index 031976b4fa2b2..fc866d71a3a70 100644 --- a/llvm/test/CodeGen/RISCV/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/float-convert.ll @@ -629,7 +629,7 @@ define i64 @fcvt_l_s_sat(float %a) nounwind { ; RV32IF-NEXT: fmv.w.x fa5, a0 ; RV32IF-NEXT: fle.s s0, fa5, fa0 ; RV32IF-NEXT: call __fixsfdi -; RV32IF-NEXT: lui a4, 524288 +; RV32IF-NEXT: lui a3, 524288 ; RV32IF-NEXT: lui a2, 524288 ; RV32IF-NEXT: beqz s0, .LBB12_2 ; RV32IF-NEXT: # %bb.1: # %start @@ -637,19 +637,19 @@ define i64 @fcvt_l_s_sat(float %a) nounwind { ; RV32IF-NEXT: .LBB12_2: # %start ; RV32IF-NEXT: lui a1, %hi(.LCPI12_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI12_0)(a1) -; RV32IF-NEXT: flt.s a3, fa5, fs0 -; RV32IF-NEXT: beqz a3, .LBB12_4 +; RV32IF-NEXT: flt.s a1, fa5, fs0 +; RV32IF-NEXT: beqz a1, .LBB12_4 ; RV32IF-NEXT: # %bb.3: -; RV32IF-NEXT: addi a2, a4, -1 +; RV32IF-NEXT: addi a2, a3, -1 ; RV32IF-NEXT: .LBB12_4: # %start -; RV32IF-NEXT: feq.s a1, fs0, fs0 -; RV32IF-NEXT: neg a4, a1 -; RV32IF-NEXT: and a1, a4, a2 -; RV32IF-NEXT: neg a2, s0 -; RV32IF-NEXT: and a0, a2, a0 -; RV32IF-NEXT: neg a2, a3 -; RV32IF-NEXT: or a0, a2, a0 +; RV32IF-NEXT: feq.s a3, fs0, fs0 +; RV32IF-NEXT: neg a4, s0 +; RV32IF-NEXT: neg a5, a1 +; RV32IF-NEXT: neg a3, a3 ; RV32IF-NEXT: and a0, a4, a0 +; RV32IF-NEXT: and a1, a3, a2 +; RV32IF-NEXT: or a0, a5, a0 +; RV32IF-NEXT: and a0, a3, a0 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -676,7 +676,7 @@ define i64 @fcvt_l_s_sat(float %a) nounwind { ; RV32IZFINX-NEXT: fle.s s1, a0, s0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a4, 524288 +; RV32IZFINX-NEXT: lui a3, 524288 ; RV32IZFINX-NEXT: lui a2, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB12_2 ; RV32IZFINX-NEXT: # %bb.1: # %start @@ -684,19 +684,19 @@ define i64 @fcvt_l_s_sat(float %a) nounwind { ; RV32IZFINX-NEXT: .LBB12_2: # %start ; RV32IZFINX-NEXT: lui a1, 389120 ; RV32IZFINX-NEXT: addi a1, a1, -1 -; RV32IZFINX-NEXT: flt.s a3, a1, s0 -; RV32IZFINX-NEXT: beqz a3, .LBB12_4 +; RV32IZFINX-NEXT: flt.s a1, a1, s0 +; RV32IZFINX-NEXT: beqz a1, .LBB12_4 ; RV32IZFINX-NEXT: # %bb.3: -; RV32IZFINX-NEXT: addi a2, a4, -1 +; RV32IZFINX-NEXT: addi a2, a3, -1 ; 
RV32IZFINX-NEXT: .LBB12_4: # %start -; RV32IZFINX-NEXT: feq.s a1, s0, s0 -; RV32IZFINX-NEXT: neg a4, a1 -; RV32IZFINX-NEXT: and a1, a4, a2 -; RV32IZFINX-NEXT: neg a2, s1 -; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: neg a2, a3 -; RV32IZFINX-NEXT: or a0, a2, a0 +; RV32IZFINX-NEXT: feq.s a3, s0, s0 +; RV32IZFINX-NEXT: neg a4, s1 +; RV32IZFINX-NEXT: neg a5, a1 +; RV32IZFINX-NEXT: neg a3, a3 ; RV32IZFINX-NEXT: and a0, a4, a0 +; RV32IZFINX-NEXT: and a1, a3, a2 +; RV32IZFINX-NEXT: or a0, a5, a0 +; RV32IZFINX-NEXT: and a0, a3, a0 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -722,40 +722,40 @@ define i64 @fcvt_l_s_sat(float %a) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lui a1, 913408 ; RV32I-NEXT: call __gesf2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __fixsfdi ; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __fixsfdi +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: mv s3, a1 ; RV32I-NEXT: lui s5, 524288 -; RV32I-NEXT: bgez s1, .LBB12_2 +; RV32I-NEXT: bgez s2, .LBB12_2 ; RV32I-NEXT: # %bb.1: # %start ; RV32I-NEXT: lui s3, 524288 ; RV32I-NEXT: .LBB12_2: # %start ; RV32I-NEXT: lui a1, 389120 ; RV32I-NEXT: addi a1, a1, -1 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __gtsf2 ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: blez a0, .LBB12_4 ; RV32I-NEXT: # %bb.3: # %start ; RV32I-NEXT: addi s3, s5, -1 ; RV32I-NEXT: .LBB12_4: # %start -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: call __unordsf2 ; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: slti a1, s2, 0 +; RV32I-NEXT: sgtz a2, s4 ; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: addi a3, a1, -1 ; RV32I-NEXT: and a1, a0, s3 -; RV32I-NEXT: slti a2, s1, 0 -; RV32I-NEXT: addi a2, a2, -1 -; RV32I-NEXT: and a2, a2, s2 -; RV32I-NEXT: sgtz a3, s4 -; RV32I-NEXT: neg a3, a3 -; RV32I-NEXT: or a2, a3, a2 +; RV32I-NEXT: and a3, a3, s0 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: or a2, a2, a3 ; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -879,10 +879,10 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind { ; RV32IF-NEXT: lui a2, %hi(.LCPI14_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI14_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 -; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: or a1, a2, a1 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -912,11 +912,11 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind { ; RV32IZFINX-NEXT: call __fixunssfdi ; RV32IZFINX-NEXT: and a0, s1, a0 ; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 -; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: or a1, a2, a1 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -955,10 +955,10 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind { ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __gtsf2 ; RV32I-NEXT: sgtz a0, a0 -; RV32I-NEXT: neg 
a1, a0 -; RV32I-NEXT: or a0, a1, s3 -; RV32I-NEXT: and a2, s2, s1 -; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: and a1, s2, s1 +; RV32I-NEXT: neg a2, a0 +; RV32I-NEXT: or a0, a2, s3 +; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1417,12 +1417,12 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV32IF-LABEL: fcvt_w_s_sat_i16: ; RV32IF: # %bb.0: # %start ; RV32IF-NEXT: feq.s a0, fa0, fa0 -; RV32IF-NEXT: neg a0, a0 ; RV32IF-NEXT: lui a1, %hi(.LCPI24_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI24_0)(a1) ; RV32IF-NEXT: lui a1, 815104 ; RV32IF-NEXT: fmv.w.x fa4, a1 ; RV32IF-NEXT: fmax.s fa4, fa0, fa4 +; RV32IF-NEXT: neg a0, a0 ; RV32IF-NEXT: fmin.s fa5, fa4, fa5 ; RV32IF-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IF-NEXT: and a0, a0, a1 @@ -1431,12 +1431,12 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV64IF-LABEL: fcvt_w_s_sat_i16: ; RV64IF: # %bb.0: # %start ; RV64IF-NEXT: feq.s a0, fa0, fa0 -; RV64IF-NEXT: neg a0, a0 ; RV64IF-NEXT: lui a1, %hi(.LCPI24_0) ; RV64IF-NEXT: flw fa5, %lo(.LCPI24_0)(a1) ; RV64IF-NEXT: lui a1, 815104 ; RV64IF-NEXT: fmv.w.x fa4, a1 ; RV64IF-NEXT: fmax.s fa4, fa0, fa4 +; RV64IF-NEXT: neg a0, a0 ; RV64IF-NEXT: fmin.s fa5, fa4, fa5 ; RV64IF-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IF-NEXT: and a0, a0, a1 @@ -1445,10 +1445,10 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV32IZFINX-LABEL: fcvt_w_s_sat_i16: ; RV32IZFINX: # %bb.0: # %start ; RV32IZFINX-NEXT: feq.s a1, a0, a0 -; RV32IZFINX-NEXT: neg a1, a1 ; RV32IZFINX-NEXT: lui a2, 815104 ; RV32IZFINX-NEXT: fmax.s a0, a0, a2 ; RV32IZFINX-NEXT: lui a2, 290816 +; RV32IZFINX-NEXT: neg a1, a1 ; RV32IZFINX-NEXT: addi a2, a2, -512 ; RV32IZFINX-NEXT: fmin.s a0, a0, a2 ; RV32IZFINX-NEXT: fcvt.w.s a0, a0, rtz @@ -1458,10 +1458,10 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV64IZFINX-LABEL: fcvt_w_s_sat_i16: ; RV64IZFINX: # %bb.0: # %start ; RV64IZFINX-NEXT: feq.s a1, a0, a0 -; RV64IZFINX-NEXT: neg a1, a1 ; RV64IZFINX-NEXT: lui a2, 815104 ; RV64IZFINX-NEXT: fmax.s a0, a0, a2 ; RV64IZFINX-NEXT: lui a2, 290816 +; RV64IZFINX-NEXT: neg a1, a1 ; RV64IZFINX-NEXT: addiw a2, a2, -512 ; RV64IZFINX-NEXT: fmin.s a0, a0, a2 ; RV64IZFINX-NEXT: fcvt.l.s a0, a0, rtz @@ -1763,11 +1763,11 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind { ; RV32IF-LABEL: fcvt_w_s_sat_i8: ; RV32IF: # %bb.0: # %start ; RV32IF-NEXT: feq.s a0, fa0, fa0 -; RV32IF-NEXT: neg a0, a0 ; RV32IF-NEXT: lui a1, 798720 ; RV32IF-NEXT: fmv.w.x fa5, a1 -; RV32IF-NEXT: fmax.s fa5, fa0, fa5 ; RV32IF-NEXT: lui a1, 274400 +; RV32IF-NEXT: neg a0, a0 +; RV32IF-NEXT: fmax.s fa5, fa0, fa5 ; RV32IF-NEXT: fmv.w.x fa4, a1 ; RV32IF-NEXT: fmin.s fa5, fa5, fa4 ; RV32IF-NEXT: fcvt.w.s a1, fa5, rtz @@ -1777,11 +1777,11 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind { ; RV64IF-LABEL: fcvt_w_s_sat_i8: ; RV64IF: # %bb.0: # %start ; RV64IF-NEXT: feq.s a0, fa0, fa0 -; RV64IF-NEXT: neg a0, a0 ; RV64IF-NEXT: lui a1, 798720 ; RV64IF-NEXT: fmv.w.x fa5, a1 -; RV64IF-NEXT: fmax.s fa5, fa0, fa5 ; RV64IF-NEXT: lui a1, 274400 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fmax.s fa5, fa0, fa5 ; RV64IF-NEXT: fmv.w.x fa4, a1 ; RV64IF-NEXT: fmin.s fa5, fa5, fa4 ; RV64IF-NEXT: fcvt.l.s a1, fa5, rtz @@ -1791,8 +1791,8 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind { ; RV32IZFINX-LABEL: fcvt_w_s_sat_i8: ; RV32IZFINX: # %bb.0: # %start ; RV32IZFINX-NEXT: feq.s a1, a0, a0 -; RV32IZFINX-NEXT: neg a1, a1 ; 
RV32IZFINX-NEXT: lui a2, 798720 +; RV32IZFINX-NEXT: neg a1, a1 ; RV32IZFINX-NEXT: fmax.s a0, a0, a2 ; RV32IZFINX-NEXT: lui a2, 274400 ; RV32IZFINX-NEXT: fmin.s a0, a0, a2 @@ -1803,8 +1803,8 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind { ; RV64IZFINX-LABEL: fcvt_w_s_sat_i8: ; RV64IZFINX: # %bb.0: # %start ; RV64IZFINX-NEXT: feq.s a1, a0, a0 -; RV64IZFINX-NEXT: neg a1, a1 ; RV64IZFINX-NEXT: lui a2, 798720 +; RV64IZFINX-NEXT: neg a1, a1 ; RV64IZFINX-NEXT: fmax.s a0, a0, a2 ; RV64IZFINX-NEXT: lui a2, 274400 ; RV64IZFINX-NEXT: fmin.s a0, a0, a2 @@ -1943,8 +1943,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(float %a) nounwind { ; RV32IF-LABEL: fcvt_wu_s_sat_i8: ; RV32IF: # %bb.0: # %start ; RV32IF-NEXT: fmv.w.x fa5, zero -; RV32IF-NEXT: fmax.s fa5, fa0, fa5 ; RV32IF-NEXT: lui a0, 276464 +; RV32IF-NEXT: fmax.s fa5, fa0, fa5 ; RV32IF-NEXT: fmv.w.x fa4, a0 ; RV32IF-NEXT: fmin.s fa5, fa5, fa4 ; RV32IF-NEXT: fcvt.wu.s a0, fa5, rtz @@ -1953,8 +1953,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(float %a) nounwind { ; RV64IF-LABEL: fcvt_wu_s_sat_i8: ; RV64IF: # %bb.0: # %start ; RV64IF-NEXT: fmv.w.x fa5, zero -; RV64IF-NEXT: fmax.s fa5, fa0, fa5 ; RV64IF-NEXT: lui a0, 276464 +; RV64IF-NEXT: fmax.s fa5, fa0, fa5 ; RV64IF-NEXT: fmv.w.x fa4, a0 ; RV64IF-NEXT: fmin.s fa5, fa5, fa4 ; RV64IF-NEXT: fcvt.lu.s a0, fa5, rtz diff --git a/llvm/test/CodeGen/RISCV/float-intrinsics.ll b/llvm/test/CodeGen/RISCV/float-intrinsics.ll index e154f3361a121..37381aeeb2a0f 100644 --- a/llvm/test/CodeGen/RISCV/float-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/float-intrinsics.ll @@ -807,8 +807,8 @@ define float @copysign_f32(float %a, float %b) nounwind { ; RV32I-LABEL: copysign_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret @@ -816,8 +816,8 @@ define float @copysign_f32(float %a, float %b) nounwind { ; RV64I-LABEL: copysign_f32: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 524288 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 33 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -1603,54 +1603,54 @@ define i1 @fpclass(float %x) { ; RV32I-LABEL: fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 1 -; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: addi a2, a1, -1 -; RV32I-NEXT: lui a3, 2048 -; RV32I-NEXT: addi a3, a3, -1 -; RV32I-NEXT: sltu a2, a2, a3 +; RV32I-NEXT: lui a2, 2048 ; RV32I-NEXT: slti a0, a0, 0 -; RV32I-NEXT: and a2, a2, a0 -; RV32I-NEXT: seqz a3, a1 -; RV32I-NEXT: lui a4, 522240 -; RV32I-NEXT: xor a5, a1, a4 +; RV32I-NEXT: lui a3, 522240 +; RV32I-NEXT: lui a4, 1046528 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: addi a2, a2, -1 +; RV32I-NEXT: addi a5, a1, -1 +; RV32I-NEXT: sltu a2, a5, a2 +; RV32I-NEXT: xor a5, a1, a3 +; RV32I-NEXT: slt a3, a3, a1 +; RV32I-NEXT: add a4, a1, a4 +; RV32I-NEXT: seqz a1, a1 ; RV32I-NEXT: seqz a5, a5 -; RV32I-NEXT: or a3, a3, a5 -; RV32I-NEXT: or a2, a3, a2 -; RV32I-NEXT: slt a3, a4, a1 -; RV32I-NEXT: or a2, a2, a3 -; RV32I-NEXT: lui a3, 1046528 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: sltiu a1, a1, 127 -; RV32I-NEXT: and a0, a1, a0 -; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: srli a4, a4, 24 +; RV32I-NEXT: and a2, a2, a0 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: sltiu a4, a4, 127 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: and a0, a4, a0 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret ; ; 
RV64I-LABEL: fpclass: ; RV64I: # %bb.0: ; RV64I-NEXT: sext.w a1, a0 ; RV64I-NEXT: slli a0, a0, 33 +; RV64I-NEXT: lui a2, 2048 +; RV64I-NEXT: lui a3, 522240 +; RV64I-NEXT: lui a4, 1046528 ; RV64I-NEXT: srli a0, a0, 33 -; RV64I-NEXT: addi a2, a0, -1 -; RV64I-NEXT: lui a3, 2048 -; RV64I-NEXT: addiw a3, a3, -1 -; RV64I-NEXT: sltu a2, a2, a3 +; RV64I-NEXT: addiw a2, a2, -1 ; RV64I-NEXT: slti a1, a1, 0 -; RV64I-NEXT: and a2, a2, a1 -; RV64I-NEXT: seqz a3, a0 -; RV64I-NEXT: lui a4, 522240 -; RV64I-NEXT: xor a5, a0, a4 +; RV64I-NEXT: addi a5, a0, -1 +; RV64I-NEXT: sltu a2, a5, a2 +; RV64I-NEXT: xor a5, a0, a3 +; RV64I-NEXT: slt a3, a3, a0 +; RV64I-NEXT: add a4, a0, a4 +; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: seqz a5, a5 -; RV64I-NEXT: or a3, a3, a5 -; RV64I-NEXT: or a2, a3, a2 -; RV64I-NEXT: slt a3, a4, a0 -; RV64I-NEXT: or a2, a2, a3 -; RV64I-NEXT: lui a3, 1046528 -; RV64I-NEXT: add a0, a0, a3 -; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: sltiu a0, a0, 127 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: srliw a4, a4, 24 +; RV64I-NEXT: and a2, a2, a1 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: sltiu a4, a4, 127 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: and a1, a4, a1 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret %cmp = call i1 @llvm.is.fpclass.f32(float %x, i32 639) ret i1 %cmp @@ -1732,8 +1732,8 @@ define i1 @isqnan_fpclass(float %x) { ; RV32I-LABEL: isqnan_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 -; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: lui a1, 523264 +; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: addi a1, a1, -1 ; RV32I-NEXT: slt a0, a1, a0 ; RV32I-NEXT: ret @@ -1741,8 +1741,8 @@ define i1 @isqnan_fpclass(float %x) { ; RV64I-LABEL: isqnan_fpclass: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 33 -; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: lui a1, 523264 +; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: addiw a1, a1, -1 ; RV64I-NEXT: slt a0, a1, a0 ; RV64I-NEXT: ret @@ -1782,10 +1782,10 @@ define i1 @issnan_fpclass(float %x) { ; RV32I-LABEL: issnan_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 -; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: lui a1, 523264 -; RV32I-NEXT: slt a1, a0, a1 ; RV32I-NEXT: lui a2, 522240 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: slt a1, a0, a1 ; RV32I-NEXT: slt a0, a2, a0 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: ret @@ -1793,10 +1793,10 @@ define i1 @issnan_fpclass(float %x) { ; RV64I-LABEL: issnan_fpclass: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 33 -; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: lui a1, 523264 -; RV64I-NEXT: slt a1, a0, a1 ; RV64I-NEXT: lui a2, 522240 +; RV64I-NEXT: srli a0, a0, 33 +; RV64I-NEXT: slt a1, a0, a1 ; RV64I-NEXT: slt a0, a2, a0 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: ret @@ -2068,8 +2068,8 @@ define i1 @isnegfinite_fpclass(float %x) { ; RV32I-LABEL: isnegfinite_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 1 -; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: lui a2, 522240 +; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: slt a1, a1, a2 ; RV32I-NEXT: slti a0, a0, 0 ; RV32I-NEXT: and a0, a1, a0 @@ -2079,8 +2079,8 @@ define i1 @isnegfinite_fpclass(float %x) { ; RV64I: # %bb.0: ; RV64I-NEXT: sext.w a1, a0 ; RV64I-NEXT: slli a0, a0, 33 -; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: lui a2, 522240 +; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: slt a0, a0, a2 ; RV64I-NEXT: slti a1, a1, 0 ; RV64I-NEXT: and a0, a0, a1 @@ -2121,8 +2121,8 @@ define i1 @isnotfinite_fpclass(float %x) { ; RV32I-LABEL: isnotfinite_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 -; 
RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: lui a1, 522240 +; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: addi a1, a1, -1 ; RV32I-NEXT: slt a0, a1, a0 ; RV32I-NEXT: ret @@ -2130,8 +2130,8 @@ define i1 @isnotfinite_fpclass(float %x) { ; RV64I-LABEL: isnotfinite_fpclass: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 33 -; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: lui a1, 522240 +; RV64I-NEXT: srli a0, a0, 33 ; RV64I-NEXT: addiw a1, a1, -1 ; RV64I-NEXT: slt a0, a1, a0 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll index 198b18c75272a..809cc31abe612 100644 --- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll @@ -54,7 +54,7 @@ define i64 @test_floor_si64(float %x) nounwind { ; RV32IF-NEXT: fle.s s0, fa5, fs0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixsfdi -; RV32IF-NEXT: lui a4, 524288 +; RV32IF-NEXT: lui a3, 524288 ; RV32IF-NEXT: lui a2, 524288 ; RV32IF-NEXT: beqz s0, .LBB1_4 ; RV32IF-NEXT: # %bb.3: @@ -62,19 +62,19 @@ define i64 @test_floor_si64(float %x) nounwind { ; RV32IF-NEXT: .LBB1_4: ; RV32IF-NEXT: lui a1, %hi(.LCPI1_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI1_0)(a1) -; RV32IF-NEXT: flt.s a3, fa5, fs0 -; RV32IF-NEXT: beqz a3, .LBB1_6 +; RV32IF-NEXT: flt.s a1, fa5, fs0 +; RV32IF-NEXT: beqz a1, .LBB1_6 ; RV32IF-NEXT: # %bb.5: -; RV32IF-NEXT: addi a2, a4, -1 +; RV32IF-NEXT: addi a2, a3, -1 ; RV32IF-NEXT: .LBB1_6: -; RV32IF-NEXT: feq.s a1, fs0, fs0 -; RV32IF-NEXT: neg a4, a1 -; RV32IF-NEXT: and a1, a4, a2 -; RV32IF-NEXT: neg a2, s0 -; RV32IF-NEXT: and a0, a2, a0 -; RV32IF-NEXT: neg a2, a3 -; RV32IF-NEXT: or a0, a2, a0 +; RV32IF-NEXT: feq.s a3, fs0, fs0 +; RV32IF-NEXT: neg a4, s0 +; RV32IF-NEXT: neg a5, a1 +; RV32IF-NEXT: neg a3, a3 ; RV32IF-NEXT: and a0, a4, a0 +; RV32IF-NEXT: and a1, a3, a2 +; RV32IF-NEXT: or a0, a5, a0 +; RV32IF-NEXT: and a0, a3, a0 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -110,7 +110,7 @@ define i64 @test_floor_si64(float %x) nounwind { ; RV32IZFINX-NEXT: fle.s s1, a0, s0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a4, 524288 +; RV32IZFINX-NEXT: lui a3, 524288 ; RV32IZFINX-NEXT: lui a2, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB1_4 ; RV32IZFINX-NEXT: # %bb.3: @@ -118,19 +118,19 @@ define i64 @test_floor_si64(float %x) nounwind { ; RV32IZFINX-NEXT: .LBB1_4: ; RV32IZFINX-NEXT: lui a1, 389120 ; RV32IZFINX-NEXT: addi a1, a1, -1 -; RV32IZFINX-NEXT: flt.s a3, a1, s0 -; RV32IZFINX-NEXT: beqz a3, .LBB1_6 +; RV32IZFINX-NEXT: flt.s a1, a1, s0 +; RV32IZFINX-NEXT: beqz a1, .LBB1_6 ; RV32IZFINX-NEXT: # %bb.5: -; RV32IZFINX-NEXT: addi a2, a4, -1 +; RV32IZFINX-NEXT: addi a2, a3, -1 ; RV32IZFINX-NEXT: .LBB1_6: -; RV32IZFINX-NEXT: feq.s a1, s0, s0 -; RV32IZFINX-NEXT: neg a4, a1 -; RV32IZFINX-NEXT: and a1, a4, a2 -; RV32IZFINX-NEXT: neg a2, s1 -; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: neg a2, a3 -; RV32IZFINX-NEXT: or a0, a2, a0 +; RV32IZFINX-NEXT: feq.s a3, s0, s0 +; RV32IZFINX-NEXT: neg a4, s1 +; RV32IZFINX-NEXT: neg a5, a1 +; RV32IZFINX-NEXT: neg a3, a3 ; RV32IZFINX-NEXT: and a0, a4, a0 +; RV32IZFINX-NEXT: and a1, a3, a2 +; RV32IZFINX-NEXT: or a0, a5, a0 +; RV32IZFINX-NEXT: and a0, a3, a0 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -199,10 +199,10 @@ define i64 
@test_floor_ui64(float %x) nounwind { ; RV32IF-NEXT: lui a2, %hi(.LCPI3_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI3_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 -; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: or a1, a2, a1 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -241,11 +241,11 @@ define i64 @test_floor_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: call __fixunssfdi ; RV32IZFINX-NEXT: and a0, s1, a0 ; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 -; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: or a1, a2, a1 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -312,7 +312,7 @@ define i64 @test_ceil_si64(float %x) nounwind { ; RV32IF-NEXT: fle.s s0, fa5, fs0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixsfdi -; RV32IF-NEXT: lui a4, 524288 +; RV32IF-NEXT: lui a3, 524288 ; RV32IF-NEXT: lui a2, 524288 ; RV32IF-NEXT: beqz s0, .LBB5_4 ; RV32IF-NEXT: # %bb.3: @@ -320,19 +320,19 @@ define i64 @test_ceil_si64(float %x) nounwind { ; RV32IF-NEXT: .LBB5_4: ; RV32IF-NEXT: lui a1, %hi(.LCPI5_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI5_0)(a1) -; RV32IF-NEXT: flt.s a3, fa5, fs0 -; RV32IF-NEXT: beqz a3, .LBB5_6 +; RV32IF-NEXT: flt.s a1, fa5, fs0 +; RV32IF-NEXT: beqz a1, .LBB5_6 ; RV32IF-NEXT: # %bb.5: -; RV32IF-NEXT: addi a2, a4, -1 +; RV32IF-NEXT: addi a2, a3, -1 ; RV32IF-NEXT: .LBB5_6: -; RV32IF-NEXT: feq.s a1, fs0, fs0 -; RV32IF-NEXT: neg a4, a1 -; RV32IF-NEXT: and a1, a4, a2 -; RV32IF-NEXT: neg a2, s0 -; RV32IF-NEXT: and a0, a2, a0 -; RV32IF-NEXT: neg a2, a3 -; RV32IF-NEXT: or a0, a2, a0 +; RV32IF-NEXT: feq.s a3, fs0, fs0 +; RV32IF-NEXT: neg a4, s0 +; RV32IF-NEXT: neg a5, a1 +; RV32IF-NEXT: neg a3, a3 ; RV32IF-NEXT: and a0, a4, a0 +; RV32IF-NEXT: and a1, a3, a2 +; RV32IF-NEXT: or a0, a5, a0 +; RV32IF-NEXT: and a0, a3, a0 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -368,7 +368,7 @@ define i64 @test_ceil_si64(float %x) nounwind { ; RV32IZFINX-NEXT: fle.s s1, a0, s0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a4, 524288 +; RV32IZFINX-NEXT: lui a3, 524288 ; RV32IZFINX-NEXT: lui a2, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB5_4 ; RV32IZFINX-NEXT: # %bb.3: @@ -376,19 +376,19 @@ define i64 @test_ceil_si64(float %x) nounwind { ; RV32IZFINX-NEXT: .LBB5_4: ; RV32IZFINX-NEXT: lui a1, 389120 ; RV32IZFINX-NEXT: addi a1, a1, -1 -; RV32IZFINX-NEXT: flt.s a3, a1, s0 -; RV32IZFINX-NEXT: beqz a3, .LBB5_6 +; RV32IZFINX-NEXT: flt.s a1, a1, s0 +; RV32IZFINX-NEXT: beqz a1, .LBB5_6 ; RV32IZFINX-NEXT: # %bb.5: -; RV32IZFINX-NEXT: addi a2, a4, -1 +; RV32IZFINX-NEXT: addi a2, a3, -1 ; RV32IZFINX-NEXT: .LBB5_6: -; RV32IZFINX-NEXT: feq.s a1, s0, s0 -; RV32IZFINX-NEXT: neg a4, a1 -; RV32IZFINX-NEXT: and a1, a4, a2 -; RV32IZFINX-NEXT: neg a2, s1 -; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: neg a2, a3 -; RV32IZFINX-NEXT: or a0, a2, a0 +; RV32IZFINX-NEXT: feq.s a3, s0, s0 +; RV32IZFINX-NEXT: neg a4, s1 +; RV32IZFINX-NEXT: neg a5, a1 +; RV32IZFINX-NEXT: neg a3, a3 ; RV32IZFINX-NEXT: and a0, a4, a0 +; RV32IZFINX-NEXT: and a1, a3, a2 +; RV32IZFINX-NEXT: or a0, a5, a0 +; RV32IZFINX-NEXT: 
and a0, a3, a0 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -457,10 +457,10 @@ define i64 @test_ceil_ui64(float %x) nounwind { ; RV32IF-NEXT: lui a2, %hi(.LCPI7_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI7_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 -; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: or a1, a2, a1 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -499,11 +499,11 @@ define i64 @test_ceil_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: call __fixunssfdi ; RV32IZFINX-NEXT: and a0, s1, a0 ; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 -; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: or a1, a2, a1 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -570,7 +570,7 @@ define i64 @test_trunc_si64(float %x) nounwind { ; RV32IF-NEXT: fle.s s0, fa5, fs0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixsfdi -; RV32IF-NEXT: lui a4, 524288 +; RV32IF-NEXT: lui a3, 524288 ; RV32IF-NEXT: lui a2, 524288 ; RV32IF-NEXT: beqz s0, .LBB9_4 ; RV32IF-NEXT: # %bb.3: @@ -578,19 +578,19 @@ define i64 @test_trunc_si64(float %x) nounwind { ; RV32IF-NEXT: .LBB9_4: ; RV32IF-NEXT: lui a1, %hi(.LCPI9_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI9_0)(a1) -; RV32IF-NEXT: flt.s a3, fa5, fs0 -; RV32IF-NEXT: beqz a3, .LBB9_6 +; RV32IF-NEXT: flt.s a1, fa5, fs0 +; RV32IF-NEXT: beqz a1, .LBB9_6 ; RV32IF-NEXT: # %bb.5: -; RV32IF-NEXT: addi a2, a4, -1 +; RV32IF-NEXT: addi a2, a3, -1 ; RV32IF-NEXT: .LBB9_6: -; RV32IF-NEXT: feq.s a1, fs0, fs0 -; RV32IF-NEXT: neg a4, a1 -; RV32IF-NEXT: and a1, a4, a2 -; RV32IF-NEXT: neg a2, s0 -; RV32IF-NEXT: and a0, a2, a0 -; RV32IF-NEXT: neg a2, a3 -; RV32IF-NEXT: or a0, a2, a0 +; RV32IF-NEXT: feq.s a3, fs0, fs0 +; RV32IF-NEXT: neg a4, s0 +; RV32IF-NEXT: neg a5, a1 +; RV32IF-NEXT: neg a3, a3 ; RV32IF-NEXT: and a0, a4, a0 +; RV32IF-NEXT: and a1, a3, a2 +; RV32IF-NEXT: or a0, a5, a0 +; RV32IF-NEXT: and a0, a3, a0 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -626,7 +626,7 @@ define i64 @test_trunc_si64(float %x) nounwind { ; RV32IZFINX-NEXT: fle.s s1, a0, s0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a4, 524288 +; RV32IZFINX-NEXT: lui a3, 524288 ; RV32IZFINX-NEXT: lui a2, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB9_4 ; RV32IZFINX-NEXT: # %bb.3: @@ -634,19 +634,19 @@ define i64 @test_trunc_si64(float %x) nounwind { ; RV32IZFINX-NEXT: .LBB9_4: ; RV32IZFINX-NEXT: lui a1, 389120 ; RV32IZFINX-NEXT: addi a1, a1, -1 -; RV32IZFINX-NEXT: flt.s a3, a1, s0 -; RV32IZFINX-NEXT: beqz a3, .LBB9_6 +; RV32IZFINX-NEXT: flt.s a1, a1, s0 +; RV32IZFINX-NEXT: beqz a1, .LBB9_6 ; RV32IZFINX-NEXT: # %bb.5: -; RV32IZFINX-NEXT: addi a2, a4, -1 +; RV32IZFINX-NEXT: addi a2, a3, -1 ; RV32IZFINX-NEXT: .LBB9_6: -; RV32IZFINX-NEXT: feq.s a1, s0, s0 -; RV32IZFINX-NEXT: neg a4, a1 -; RV32IZFINX-NEXT: and a1, a4, a2 -; RV32IZFINX-NEXT: neg a2, s1 -; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: neg a2, a3 -; RV32IZFINX-NEXT: or a0, a2, a0 +; RV32IZFINX-NEXT: feq.s a3, s0, s0 +; 
RV32IZFINX-NEXT: neg a4, s1 +; RV32IZFINX-NEXT: neg a5, a1 +; RV32IZFINX-NEXT: neg a3, a3 ; RV32IZFINX-NEXT: and a0, a4, a0 +; RV32IZFINX-NEXT: and a1, a3, a2 +; RV32IZFINX-NEXT: or a0, a5, a0 +; RV32IZFINX-NEXT: and a0, a3, a0 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -715,10 +715,10 @@ define i64 @test_trunc_ui64(float %x) nounwind { ; RV32IF-NEXT: lui a2, %hi(.LCPI11_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI11_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 -; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: or a1, a2, a1 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -757,11 +757,11 @@ define i64 @test_trunc_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: call __fixunssfdi ; RV32IZFINX-NEXT: and a0, s1, a0 ; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 -; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: or a1, a2, a1 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -828,7 +828,7 @@ define i64 @test_round_si64(float %x) nounwind { ; RV32IF-NEXT: fle.s s0, fa5, fs0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixsfdi -; RV32IF-NEXT: lui a4, 524288 +; RV32IF-NEXT: lui a3, 524288 ; RV32IF-NEXT: lui a2, 524288 ; RV32IF-NEXT: beqz s0, .LBB13_4 ; RV32IF-NEXT: # %bb.3: @@ -836,19 +836,19 @@ define i64 @test_round_si64(float %x) nounwind { ; RV32IF-NEXT: .LBB13_4: ; RV32IF-NEXT: lui a1, %hi(.LCPI13_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI13_0)(a1) -; RV32IF-NEXT: flt.s a3, fa5, fs0 -; RV32IF-NEXT: beqz a3, .LBB13_6 +; RV32IF-NEXT: flt.s a1, fa5, fs0 +; RV32IF-NEXT: beqz a1, .LBB13_6 ; RV32IF-NEXT: # %bb.5: -; RV32IF-NEXT: addi a2, a4, -1 +; RV32IF-NEXT: addi a2, a3, -1 ; RV32IF-NEXT: .LBB13_6: -; RV32IF-NEXT: feq.s a1, fs0, fs0 -; RV32IF-NEXT: neg a4, a1 -; RV32IF-NEXT: and a1, a4, a2 -; RV32IF-NEXT: neg a2, s0 -; RV32IF-NEXT: and a0, a2, a0 -; RV32IF-NEXT: neg a2, a3 -; RV32IF-NEXT: or a0, a2, a0 +; RV32IF-NEXT: feq.s a3, fs0, fs0 +; RV32IF-NEXT: neg a4, s0 +; RV32IF-NEXT: neg a5, a1 +; RV32IF-NEXT: neg a3, a3 ; RV32IF-NEXT: and a0, a4, a0 +; RV32IF-NEXT: and a1, a3, a2 +; RV32IF-NEXT: or a0, a5, a0 +; RV32IF-NEXT: and a0, a3, a0 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -884,7 +884,7 @@ define i64 @test_round_si64(float %x) nounwind { ; RV32IZFINX-NEXT: fle.s s1, a0, s0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a4, 524288 +; RV32IZFINX-NEXT: lui a3, 524288 ; RV32IZFINX-NEXT: lui a2, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB13_4 ; RV32IZFINX-NEXT: # %bb.3: @@ -892,19 +892,19 @@ define i64 @test_round_si64(float %x) nounwind { ; RV32IZFINX-NEXT: .LBB13_4: ; RV32IZFINX-NEXT: lui a1, 389120 ; RV32IZFINX-NEXT: addi a1, a1, -1 -; RV32IZFINX-NEXT: flt.s a3, a1, s0 -; RV32IZFINX-NEXT: beqz a3, .LBB13_6 +; RV32IZFINX-NEXT: flt.s a1, a1, s0 +; RV32IZFINX-NEXT: beqz a1, .LBB13_6 ; RV32IZFINX-NEXT: # %bb.5: -; RV32IZFINX-NEXT: addi a2, a4, -1 +; RV32IZFINX-NEXT: addi a2, a3, -1 ; RV32IZFINX-NEXT: .LBB13_6: -; RV32IZFINX-NEXT: feq.s a1, s0, s0 -; 
RV32IZFINX-NEXT: neg a4, a1 -; RV32IZFINX-NEXT: and a1, a4, a2 -; RV32IZFINX-NEXT: neg a2, s1 -; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: neg a2, a3 -; RV32IZFINX-NEXT: or a0, a2, a0 +; RV32IZFINX-NEXT: feq.s a3, s0, s0 +; RV32IZFINX-NEXT: neg a4, s1 +; RV32IZFINX-NEXT: neg a5, a1 +; RV32IZFINX-NEXT: neg a3, a3 ; RV32IZFINX-NEXT: and a0, a4, a0 +; RV32IZFINX-NEXT: and a1, a3, a2 +; RV32IZFINX-NEXT: or a0, a5, a0 +; RV32IZFINX-NEXT: and a0, a3, a0 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -973,10 +973,10 @@ define i64 @test_round_ui64(float %x) nounwind { ; RV32IF-NEXT: lui a2, %hi(.LCPI15_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI15_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 -; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: or a1, a2, a1 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1015,11 +1015,11 @@ define i64 @test_round_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: call __fixunssfdi ; RV32IZFINX-NEXT: and a0, s1, a0 ; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 -; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: or a1, a2, a1 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1086,7 +1086,7 @@ define i64 @test_roundeven_si64(float %x) nounwind { ; RV32IF-NEXT: fle.s s0, fa5, fs0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixsfdi -; RV32IF-NEXT: lui a4, 524288 +; RV32IF-NEXT: lui a3, 524288 ; RV32IF-NEXT: lui a2, 524288 ; RV32IF-NEXT: beqz s0, .LBB17_4 ; RV32IF-NEXT: # %bb.3: @@ -1094,19 +1094,19 @@ define i64 @test_roundeven_si64(float %x) nounwind { ; RV32IF-NEXT: .LBB17_4: ; RV32IF-NEXT: lui a1, %hi(.LCPI17_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI17_0)(a1) -; RV32IF-NEXT: flt.s a3, fa5, fs0 -; RV32IF-NEXT: beqz a3, .LBB17_6 +; RV32IF-NEXT: flt.s a1, fa5, fs0 +; RV32IF-NEXT: beqz a1, .LBB17_6 ; RV32IF-NEXT: # %bb.5: -; RV32IF-NEXT: addi a2, a4, -1 +; RV32IF-NEXT: addi a2, a3, -1 ; RV32IF-NEXT: .LBB17_6: -; RV32IF-NEXT: feq.s a1, fs0, fs0 -; RV32IF-NEXT: neg a4, a1 -; RV32IF-NEXT: and a1, a4, a2 -; RV32IF-NEXT: neg a2, s0 -; RV32IF-NEXT: and a0, a2, a0 -; RV32IF-NEXT: neg a2, a3 -; RV32IF-NEXT: or a0, a2, a0 +; RV32IF-NEXT: feq.s a3, fs0, fs0 +; RV32IF-NEXT: neg a4, s0 +; RV32IF-NEXT: neg a5, a1 +; RV32IF-NEXT: neg a3, a3 ; RV32IF-NEXT: and a0, a4, a0 +; RV32IF-NEXT: and a1, a3, a2 +; RV32IF-NEXT: or a0, a5, a0 +; RV32IF-NEXT: and a0, a3, a0 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -1142,7 +1142,7 @@ define i64 @test_roundeven_si64(float %x) nounwind { ; RV32IZFINX-NEXT: fle.s s1, a0, s0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a4, 524288 +; RV32IZFINX-NEXT: lui a3, 524288 ; RV32IZFINX-NEXT: lui a2, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB17_4 ; RV32IZFINX-NEXT: # %bb.3: @@ -1150,19 +1150,19 @@ define i64 @test_roundeven_si64(float %x) nounwind { ; RV32IZFINX-NEXT: .LBB17_4: ; RV32IZFINX-NEXT: lui a1, 389120 ; RV32IZFINX-NEXT: addi a1, a1, -1 -; RV32IZFINX-NEXT: flt.s a3, a1, s0 -; RV32IZFINX-NEXT: beqz 
a3, .LBB17_6 +; RV32IZFINX-NEXT: flt.s a1, a1, s0 +; RV32IZFINX-NEXT: beqz a1, .LBB17_6 ; RV32IZFINX-NEXT: # %bb.5: -; RV32IZFINX-NEXT: addi a2, a4, -1 +; RV32IZFINX-NEXT: addi a2, a3, -1 ; RV32IZFINX-NEXT: .LBB17_6: -; RV32IZFINX-NEXT: feq.s a1, s0, s0 -; RV32IZFINX-NEXT: neg a4, a1 -; RV32IZFINX-NEXT: and a1, a4, a2 -; RV32IZFINX-NEXT: neg a2, s1 -; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: neg a2, a3 -; RV32IZFINX-NEXT: or a0, a2, a0 +; RV32IZFINX-NEXT: feq.s a3, s0, s0 +; RV32IZFINX-NEXT: neg a4, s1 +; RV32IZFINX-NEXT: neg a5, a1 +; RV32IZFINX-NEXT: neg a3, a3 ; RV32IZFINX-NEXT: and a0, a4, a0 +; RV32IZFINX-NEXT: and a1, a3, a2 +; RV32IZFINX-NEXT: or a0, a5, a0 +; RV32IZFINX-NEXT: and a0, a3, a0 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1231,10 +1231,10 @@ define i64 @test_roundeven_ui64(float %x) nounwind { ; RV32IF-NEXT: lui a2, %hi(.LCPI19_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI19_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 -; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: or a1, a2, a1 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1273,11 +1273,11 @@ define i64 @test_roundeven_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: call __fixunssfdi ; RV32IZFINX-NEXT: and a0, s1, a0 ; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 -; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: or a1, a2, a1 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1344,7 +1344,7 @@ define i64 @test_rint_si64(float %x) nounwind { ; RV32IF-NEXT: fle.s s0, fa5, fs0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixsfdi -; RV32IF-NEXT: lui a4, 524288 +; RV32IF-NEXT: lui a3, 524288 ; RV32IF-NEXT: lui a2, 524288 ; RV32IF-NEXT: beqz s0, .LBB21_4 ; RV32IF-NEXT: # %bb.3: @@ -1352,19 +1352,19 @@ define i64 @test_rint_si64(float %x) nounwind { ; RV32IF-NEXT: .LBB21_4: ; RV32IF-NEXT: lui a1, %hi(.LCPI21_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI21_0)(a1) -; RV32IF-NEXT: flt.s a3, fa5, fs0 -; RV32IF-NEXT: beqz a3, .LBB21_6 +; RV32IF-NEXT: flt.s a1, fa5, fs0 +; RV32IF-NEXT: beqz a1, .LBB21_6 ; RV32IF-NEXT: # %bb.5: -; RV32IF-NEXT: addi a2, a4, -1 +; RV32IF-NEXT: addi a2, a3, -1 ; RV32IF-NEXT: .LBB21_6: -; RV32IF-NEXT: feq.s a1, fs0, fs0 -; RV32IF-NEXT: neg a4, a1 -; RV32IF-NEXT: and a1, a4, a2 -; RV32IF-NEXT: neg a2, s0 -; RV32IF-NEXT: and a0, a2, a0 -; RV32IF-NEXT: neg a2, a3 -; RV32IF-NEXT: or a0, a2, a0 +; RV32IF-NEXT: feq.s a3, fs0, fs0 +; RV32IF-NEXT: neg a4, s0 +; RV32IF-NEXT: neg a5, a1 +; RV32IF-NEXT: neg a3, a3 ; RV32IF-NEXT: and a0, a4, a0 +; RV32IF-NEXT: and a1, a3, a2 +; RV32IF-NEXT: or a0, a5, a0 +; RV32IF-NEXT: and a0, a3, a0 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -1400,7 +1400,7 @@ define i64 @test_rint_si64(float %x) nounwind { ; RV32IZFINX-NEXT: fle.s s1, a0, s0 ; RV32IZFINX-NEXT: mv a0, s0 ; RV32IZFINX-NEXT: call __fixsfdi -; RV32IZFINX-NEXT: lui a4, 524288 +; RV32IZFINX-NEXT: lui a3, 524288 ; RV32IZFINX-NEXT: lui a2, 524288 ; RV32IZFINX-NEXT: beqz s1, .LBB21_4 ; 
RV32IZFINX-NEXT: # %bb.3: @@ -1408,19 +1408,19 @@ define i64 @test_rint_si64(float %x) nounwind { ; RV32IZFINX-NEXT: .LBB21_4: ; RV32IZFINX-NEXT: lui a1, 389120 ; RV32IZFINX-NEXT: addi a1, a1, -1 -; RV32IZFINX-NEXT: flt.s a3, a1, s0 -; RV32IZFINX-NEXT: beqz a3, .LBB21_6 +; RV32IZFINX-NEXT: flt.s a1, a1, s0 +; RV32IZFINX-NEXT: beqz a1, .LBB21_6 ; RV32IZFINX-NEXT: # %bb.5: -; RV32IZFINX-NEXT: addi a2, a4, -1 +; RV32IZFINX-NEXT: addi a2, a3, -1 ; RV32IZFINX-NEXT: .LBB21_6: -; RV32IZFINX-NEXT: feq.s a1, s0, s0 -; RV32IZFINX-NEXT: neg a4, a1 -; RV32IZFINX-NEXT: and a1, a4, a2 -; RV32IZFINX-NEXT: neg a2, s1 -; RV32IZFINX-NEXT: and a0, a2, a0 -; RV32IZFINX-NEXT: neg a2, a3 -; RV32IZFINX-NEXT: or a0, a2, a0 +; RV32IZFINX-NEXT: feq.s a3, s0, s0 +; RV32IZFINX-NEXT: neg a4, s1 +; RV32IZFINX-NEXT: neg a5, a1 +; RV32IZFINX-NEXT: neg a3, a3 ; RV32IZFINX-NEXT: and a0, a4, a0 +; RV32IZFINX-NEXT: and a1, a3, a2 +; RV32IZFINX-NEXT: or a0, a5, a0 +; RV32IZFINX-NEXT: and a0, a3, a0 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1489,10 +1489,10 @@ define i64 @test_rint_ui64(float %x) nounwind { ; RV32IF-NEXT: lui a2, %hi(.LCPI23_0) ; RV32IF-NEXT: flw fa5, %lo(.LCPI23_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 -; RV32IF-NEXT: and a1, s0, a1 ; RV32IF-NEXT: or a1, a2, a1 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1531,11 +1531,11 @@ define i64 @test_rint_ui64(float %x) nounwind { ; RV32IZFINX-NEXT: call __fixunssfdi ; RV32IZFINX-NEXT: and a0, s1, a0 ; RV32IZFINX-NEXT: lui a2, 391168 +; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: addi a2, a2, -1 ; RV32IZFINX-NEXT: flt.s a2, a2, s0 ; RV32IZFINX-NEXT: neg a2, a2 ; RV32IZFINX-NEXT: or a0, a2, a0 -; RV32IZFINX-NEXT: and a1, s1, a1 ; RV32IZFINX-NEXT: or a1, a2, a1 ; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll index a204b92830412..b8dc7804c4908 100644 --- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll +++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll @@ -929,19 +929,19 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call f -; RV32I-NEXT: lw a0, 12(s7) -; RV32I-NEXT: lw a1, 8(s7) -; RV32I-NEXT: add a0, a0, s4 -; RV32I-NEXT: add s3, a1, s3 -; RV32I-NEXT: sltu s4, s3, a1 +; RV32I-NEXT: lw a0, 8(s7) +; RV32I-NEXT: lw a1, 12(s7) ; RV32I-NEXT: addi s5, s5, 1 -; RV32I-NEXT: seqz a1, s5 -; RV32I-NEXT: add s6, s6, a1 -; RV32I-NEXT: xor a1, s5, s2 -; RV32I-NEXT: xor a2, s6, s1 -; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: add s4, a0, s4 -; RV32I-NEXT: bnez a1, .LBB20_5 +; RV32I-NEXT: seqz a2, s5 +; RV32I-NEXT: add s6, s6, a2 +; RV32I-NEXT: xor a2, s5, s2 +; RV32I-NEXT: add a1, a1, s4 +; RV32I-NEXT: xor a3, s6, s1 +; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: add s3, a0, s3 +; RV32I-NEXT: sltu s4, s3, a0 +; RV32I-NEXT: add s4, a1, s4 +; RV32I-NEXT: bnez a2, .LBB20_5 ; RV32I-NEXT: .LBB20_6: # %for.cond.cleanup ; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: mv a1, s4 @@ -994,19 +994,19 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV32I-MEDIUM-NEXT: # =>This Inner 
Loop Header: Depth=1 ; RV32I-MEDIUM-NEXT: mv a0, s0 ; RV32I-MEDIUM-NEXT: call f -; RV32I-MEDIUM-NEXT: lw a0, 12(s7) -; RV32I-MEDIUM-NEXT: lw a1, 8(s7) -; RV32I-MEDIUM-NEXT: add a0, a0, s4 -; RV32I-MEDIUM-NEXT: add s3, a1, s3 -; RV32I-MEDIUM-NEXT: sltu s4, s3, a1 +; RV32I-MEDIUM-NEXT: lw a0, 8(s7) +; RV32I-MEDIUM-NEXT: lw a1, 12(s7) ; RV32I-MEDIUM-NEXT: addi s5, s5, 1 -; RV32I-MEDIUM-NEXT: seqz a1, s5 -; RV32I-MEDIUM-NEXT: add s6, s6, a1 -; RV32I-MEDIUM-NEXT: xor a1, s5, s2 -; RV32I-MEDIUM-NEXT: xor a2, s6, s1 -; RV32I-MEDIUM-NEXT: or a1, a1, a2 -; RV32I-MEDIUM-NEXT: add s4, a0, s4 -; RV32I-MEDIUM-NEXT: bnez a1, .LBB20_5 +; RV32I-MEDIUM-NEXT: seqz a2, s5 +; RV32I-MEDIUM-NEXT: add s6, s6, a2 +; RV32I-MEDIUM-NEXT: xor a2, s5, s2 +; RV32I-MEDIUM-NEXT: add a1, a1, s4 +; RV32I-MEDIUM-NEXT: xor a3, s6, s1 +; RV32I-MEDIUM-NEXT: or a2, a2, a3 +; RV32I-MEDIUM-NEXT: add s3, a0, s3 +; RV32I-MEDIUM-NEXT: sltu s4, s3, a0 +; RV32I-MEDIUM-NEXT: add s4, a1, s4 +; RV32I-MEDIUM-NEXT: bnez a2, .LBB20_5 ; RV32I-MEDIUM-NEXT: .LBB20_6: # %for.cond.cleanup ; RV32I-MEDIUM-NEXT: mv a0, s3 ; RV32I-MEDIUM-NEXT: mv a1, s4 @@ -1107,10 +1107,10 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV64I-LARGE-NEXT: # %bb.1: # %for.body.lr.ph ; RV64I-LARGE-NEXT: mv s0, a2 ; RV64I-LARGE-NEXT: mv s1, a1 +; RV64I-LARGE-NEXT: li s2, 0 ; RV64I-LARGE-NEXT: .Lpcrel_hi14: ; RV64I-LARGE-NEXT: auipc a1, %pcrel_hi(.LCPI20_0) ; RV64I-LARGE-NEXT: ld s3, %pcrel_lo(.Lpcrel_hi14)(a1) -; RV64I-LARGE-NEXT: li s2, 0 ; RV64I-LARGE-NEXT: slli a0, a0, 4 ; RV64I-LARGE-NEXT: add s4, a2, a0 ; RV64I-LARGE-NEXT: .LBB20_2: # %for.body diff --git a/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll b/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll index 1512db87b9311..2036e7c7adfa8 100644 --- a/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll +++ b/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll @@ -33,8 +33,8 @@ define i64 @fold_binop_into_select_2(i1 %c, i64 %x) { ; CHECK-LABEL: fold_binop_into_select_2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 2 -; CHECK-NEXT: sub a2, a2, a1 ; CHECK-NEXT: slli a0, a0, 63 +; CHECK-NEXT: sub a2, a2, a1 ; CHECK-NEXT: srai a0, a0, 63 ; CHECK-NEXT: and a0, a0, a2 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll index 133d735a46ed7..e7719dc70660b 100644 --- a/llvm/test/CodeGen/RISCV/forced-atomics.ll +++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll @@ -4531,29 +4531,29 @@ define i128 @rmw128(ptr %p) nounwind { ; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a1 -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: lw a2, 4(s0) -; RV32-NEXT: lw a3, 8(s0) -; RV32-NEXT: lw a4, 12(s0) +; RV32-NEXT: lw a4, 0(a1) +; RV32-NEXT: lw a3, 4(a1) +; RV32-NEXT: lw a1, 8(a1) +; RV32-NEXT: lw a2, 12(s0) ; RV32-NEXT: mv s1, a0 ; RV32-NEXT: .LBB62_1: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: addi a0, a1, 1 -; RV32-NEXT: seqz a5, a0 -; RV32-NEXT: add a5, a2, a5 -; RV32-NEXT: or a6, a0, a5 -; RV32-NEXT: seqz a6, a6 -; RV32-NEXT: add a6, a3, a6 -; RV32-NEXT: sltu a7, a6, a3 -; RV32-NEXT: add a7, a4, a7 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a2, 20(sp) -; RV32-NEXT: sw a3, 24(sp) -; RV32-NEXT: sw a4, 28(sp) +; RV32-NEXT: addi a0, a4, 1 +; RV32-NEXT: sw a4, 16(sp) +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a2, 28(sp) +; RV32-NEXT: seqz a4, a0 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: or a4, a0, a3 +; 
RV32-NEXT: seqz a4, a4 +; RV32-NEXT: add a4, a1, a4 +; RV32-NEXT: sltu a1, a4, a1 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: sw a0, 0(sp) -; RV32-NEXT: sw a5, 4(sp) -; RV32-NEXT: sw a6, 8(sp) -; RV32-NEXT: sw a7, 12(sp) +; RV32-NEXT: sw a3, 4(sp) +; RV32-NEXT: sw a4, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: li a0, 16 ; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: mv a3, sp @@ -4561,16 +4561,16 @@ define i128 @rmw128(ptr %p) nounwind { ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a1, s0 ; RV32-NEXT: call __atomic_compare_exchange -; RV32-NEXT: lw a1, 16(sp) -; RV32-NEXT: lw a2, 20(sp) -; RV32-NEXT: lw a3, 24(sp) -; RV32-NEXT: lw a4, 28(sp) +; RV32-NEXT: lw a4, 16(sp) +; RV32-NEXT: lw a3, 20(sp) +; RV32-NEXT: lw a1, 24(sp) +; RV32-NEXT: lw a2, 28(sp) ; RV32-NEXT: beqz a0, .LBB62_1 ; RV32-NEXT: # %bb.2: # %atomicrmw.end -; RV32-NEXT: sw a1, 0(s1) -; RV32-NEXT: sw a2, 4(s1) -; RV32-NEXT: sw a3, 8(s1) -; RV32-NEXT: sw a4, 12(s1) +; RV32-NEXT: sw a4, 0(s1) +; RV32-NEXT: sw a3, 4(s1) +; RV32-NEXT: sw a1, 8(s1) +; RV32-NEXT: sw a2, 12(s1) ; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/fp128.ll b/llvm/test/CodeGen/RISCV/fp128.ll index 0bde85b54e5d1..581ee5cd2304d 100644 --- a/llvm/test/CodeGen/RISCV/fp128.ll +++ b/llvm/test/CodeGen/RISCV/fp128.ll @@ -14,19 +14,19 @@ define i32 @test_load_and_cmp() nounwind { ; RV32I-NEXT: addi sp, sp, -48 ; RV32I-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32I-NEXT: lui a0, %hi(x) +; RV32I-NEXT: lui a1, %hi(y) ; RV32I-NEXT: lw a2, %lo(x)(a0) ; RV32I-NEXT: lw a3, %lo(x+4)(a0) ; RV32I-NEXT: lw a4, %lo(x+8)(a0) ; RV32I-NEXT: lw a5, %lo(x+12)(a0) -; RV32I-NEXT: lui a0, %hi(y) -; RV32I-NEXT: lw a1, %lo(y)(a0) -; RV32I-NEXT: lw a6, %lo(y+4)(a0) -; RV32I-NEXT: lw a7, %lo(y+8)(a0) -; RV32I-NEXT: lw a0, %lo(y+12)(a0) -; RV32I-NEXT: sw a1, 8(sp) +; RV32I-NEXT: lw a0, %lo(y)(a1) +; RV32I-NEXT: lw a6, %lo(y+4)(a1) +; RV32I-NEXT: lw a7, %lo(y+8)(a1) +; RV32I-NEXT: lw a1, %lo(y+12)(a1) +; RV32I-NEXT: sw a0, 8(sp) ; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw a0, 20(sp) +; RV32I-NEXT: sw a1, 20(sp) ; RV32I-NEXT: addi a0, sp, 24 ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: sw a2, 24(sp) @@ -51,19 +51,19 @@ define i32 @test_add_and_fptosi() nounwind { ; RV32I-NEXT: addi sp, sp, -80 ; RV32I-NEXT: sw ra, 76(sp) # 4-byte Folded Spill ; RV32I-NEXT: lui a0, %hi(x) +; RV32I-NEXT: lui a1, %hi(y) ; RV32I-NEXT: lw a3, %lo(x)(a0) ; RV32I-NEXT: lw a4, %lo(x+4)(a0) ; RV32I-NEXT: lw a5, %lo(x+8)(a0) ; RV32I-NEXT: lw a6, %lo(x+12)(a0) -; RV32I-NEXT: lui a0, %hi(y) -; RV32I-NEXT: lw a1, %lo(y)(a0) -; RV32I-NEXT: lw a2, %lo(y+4)(a0) -; RV32I-NEXT: lw a7, %lo(y+8)(a0) -; RV32I-NEXT: lw a0, %lo(y+12)(a0) -; RV32I-NEXT: sw a1, 24(sp) +; RV32I-NEXT: lw a0, %lo(y)(a1) +; RV32I-NEXT: lw a2, %lo(y+4)(a1) +; RV32I-NEXT: lw a7, %lo(y+8)(a1) +; RV32I-NEXT: lw a1, %lo(y+12)(a1) +; RV32I-NEXT: sw a0, 24(sp) ; RV32I-NEXT: sw a2, 28(sp) ; RV32I-NEXT: sw a7, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a1, 36(sp) ; RV32I-NEXT: addi a0, sp, 56 ; RV32I-NEXT: addi a1, sp, 40 ; RV32I-NEXT: addi a2, sp, 24 diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll index bbdfda5c1e10d..c5c3b199447a9 100644 --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -1282,8 +1282,8 @@ define i64 @utest_f64i64(double %x) { ; RV32IF-NEXT: lw a2, 12(sp) ; RV32IF-NEXT: lw a3, 8(sp) ; 
RV32IF-NEXT: or a4, a1, a0 -; RV32IF-NEXT: seqz a4, a4 ; RV32IF-NEXT: xori a0, a0, 1 +; RV32IF-NEXT: seqz a4, a4 ; RV32IF-NEXT: or a0, a0, a1 ; RV32IF-NEXT: seqz a0, a0 ; RV32IF-NEXT: addi a0, a0, -1 @@ -1326,8 +1326,8 @@ define i64 @utest_f64i64(double %x) { ; RV32IFD-NEXT: lw a2, 12(sp) ; RV32IFD-NEXT: lw a3, 8(sp) ; RV32IFD-NEXT: or a4, a1, a0 -; RV32IFD-NEXT: seqz a4, a4 ; RV32IFD-NEXT: xori a0, a0, 1 +; RV32IFD-NEXT: seqz a4, a4 ; RV32IFD-NEXT: or a0, a0, a1 ; RV32IFD-NEXT: seqz a0, a0 ; RV32IFD-NEXT: addi a0, a0, -1 @@ -1592,8 +1592,8 @@ define i64 @utest_f32i64(float %x) { ; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 8(sp) ; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: seqz a4, a4 ; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: seqz a4, a4 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: addi a0, a0, -1 @@ -1853,8 +1853,8 @@ define i64 @utesth_f16i64(half %x) { ; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 8(sp) ; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: seqz a4, a4 ; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: seqz a4, a4 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: addi a0, a0, -1 @@ -2168,8 +2168,8 @@ define i32 @ustest_f64i32_mm(double %x) { ; RV32IF-NEXT: slti a2, a1, 1 ; RV32IF-NEXT: .LBB29_3: # %entry ; RV32IF-NEXT: addi a3, a2, -1 -; RV32IF-NEXT: or a0, a3, a0 ; RV32IF-NEXT: neg a2, a2 +; RV32IF-NEXT: or a0, a3, a0 ; RV32IF-NEXT: and a1, a2, a1 ; RV32IF-NEXT: slti a1, a1, 0 ; RV32IF-NEXT: addi a1, a1, -1 @@ -2459,8 +2459,8 @@ define i32 @ustest_f16i32_mm(half %x) { ; RV32-NEXT: slti a2, a1, 1 ; RV32-NEXT: .LBB35_3: # %entry ; RV32-NEXT: addi a3, a2, -1 -; RV32-NEXT: or a0, a3, a0 ; RV32-NEXT: neg a2, a2 +; RV32-NEXT: or a0, a3, a0 ; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: slti a1, a1, 0 ; RV32-NEXT: addi a1, a1, -1 @@ -3216,8 +3216,8 @@ define i64 @utest_f64i64_mm(double %x) { ; RV32IF-NEXT: lw a2, 12(sp) ; RV32IF-NEXT: lw a3, 8(sp) ; RV32IF-NEXT: or a4, a1, a0 -; RV32IF-NEXT: seqz a4, a4 ; RV32IF-NEXT: xori a0, a0, 1 +; RV32IF-NEXT: seqz a4, a4 ; RV32IF-NEXT: or a0, a0, a1 ; RV32IF-NEXT: seqz a0, a0 ; RV32IF-NEXT: addi a0, a0, -1 @@ -3260,8 +3260,8 @@ define i64 @utest_f64i64_mm(double %x) { ; RV32IFD-NEXT: lw a2, 12(sp) ; RV32IFD-NEXT: lw a3, 8(sp) ; RV32IFD-NEXT: or a4, a1, a0 -; RV32IFD-NEXT: seqz a4, a4 ; RV32IFD-NEXT: xori a0, a0, 1 +; RV32IFD-NEXT: seqz a4, a4 ; RV32IFD-NEXT: or a0, a0, a1 ; RV32IFD-NEXT: seqz a0, a0 ; RV32IFD-NEXT: addi a0, a0, -1 @@ -3335,11 +3335,11 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV64-NEXT: li a2, 1 ; RV64-NEXT: .LBB47_2: # %entry ; RV64-NEXT: slti a1, a1, 1 +; RV64-NEXT: slti a2, a2, 0 ; RV64-NEXT: neg a1, a1 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: slti a1, a2, 0 -; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a0, a2, a0 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: .cfi_restore ra ; RV64-NEXT: addi sp, sp, 16 @@ -3484,8 +3484,8 @@ define i64 @utest_f32i64_mm(float %x) { ; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 8(sp) ; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: seqz a4, a4 ; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: seqz a4, a4 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: addi a0, a0, -1 @@ -3573,11 +3573,11 @@ define i64 @ustest_f32i64_mm(float %x) { ; RV64-NEXT: li a2, 1 ; RV64-NEXT: .LBB50_2: # %entry ; RV64-NEXT: slti a1, a1, 1 +; RV64-NEXT: slti a2, a2, 0 ; RV64-NEXT: neg a1, a1 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: slti a1, a2, 0 -; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: addi 
a2, a2, -1 +; RV64-NEXT: and a0, a2, a0 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: .cfi_restore ra ; RV64-NEXT: addi sp, sp, 16 @@ -3719,8 +3719,8 @@ define i64 @utesth_f16i64_mm(half %x) { ; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 8(sp) ; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: seqz a4, a4 ; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: seqz a4, a4 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: addi a0, a0, -1 @@ -3811,11 +3811,11 @@ define i64 @ustest_f16i64_mm(half %x) { ; RV64-NEXT: li a2, 1 ; RV64-NEXT: .LBB53_2: # %entry ; RV64-NEXT: slti a1, a1, 1 +; RV64-NEXT: slti a2, a2, 0 ; RV64-NEXT: neg a1, a1 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: slti a1, a2, 0 -; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a0, a2, a0 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: .cfi_restore ra ; RV64-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/fpenv.ll b/llvm/test/CodeGen/RISCV/fpenv.ll index 48aec0b70b0d4..895effb4ce49b 100644 --- a/llvm/test/CodeGen/RISCV/fpenv.ll +++ b/llvm/test/CodeGen/RISCV/fpenv.ll @@ -6,8 +6,8 @@ define i32 @func_01() { ; RV32IF-LABEL: func_01: ; RV32IF: # %bb.0: ; RV32IF-NEXT: frrm a0 -; RV32IF-NEXT: slli a0, a0, 2 ; RV32IF-NEXT: lui a1, 66 +; RV32IF-NEXT: slli a0, a0, 2 ; RV32IF-NEXT: addi a1, a1, 769 ; RV32IF-NEXT: srl a0, a1, a0 ; RV32IF-NEXT: andi a0, a0, 7 @@ -16,8 +16,8 @@ define i32 @func_01() { ; RV64IF-LABEL: func_01: ; RV64IF: # %bb.0: ; RV64IF-NEXT: frrm a0 -; RV64IF-NEXT: slli a0, a0, 2 ; RV64IF-NEXT: lui a1, 66 +; RV64IF-NEXT: slli a0, a0, 2 ; RV64IF-NEXT: addiw a1, a1, 769 ; RV64IF-NEXT: srl a0, a1, a0 ; RV64IF-NEXT: andi a0, a0, 7 @@ -40,8 +40,8 @@ define void @func_02(i32 %rm) { ; RV64IF-LABEL: func_02: ; RV64IF: # %bb.0: ; RV64IF-NEXT: slli a0, a0, 32 -; RV64IF-NEXT: srli a0, a0, 30 ; RV64IF-NEXT: lui a1, 66 +; RV64IF-NEXT: srli a0, a0, 30 ; RV64IF-NEXT: addiw a1, a1, 769 ; RV64IF-NEXT: srl a0, a1, a0 ; RV64IF-NEXT: andi a0, a0, 7 diff --git a/llvm/test/CodeGen/RISCV/ghccc-rv32.ll b/llvm/test/CodeGen/RISCV/ghccc-rv32.ll index 0f9511125adba..c4c14c6cb8726 100644 --- a/llvm/test/CodeGen/RISCV/ghccc-rv32.ll +++ b/llvm/test/CodeGen/RISCV/ghccc-rv32.ll @@ -33,50 +33,50 @@ define ghccc void @foo() nounwind { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a0, %hi(d6) +; CHECK-NEXT: lui a1, %hi(d5) +; CHECK-NEXT: lui a2, %hi(d4) +; CHECK-NEXT: lui a3, %hi(d3) +; CHECK-NEXT: lui a4, %hi(d2) +; CHECK-NEXT: lui a5, %hi(d1) +; CHECK-NEXT: lui a6, %hi(f6) +; CHECK-NEXT: lui a7, %hi(f5) +; CHECK-NEXT: lui t0, %hi(f4) +; CHECK-NEXT: lui t1, %hi(f3) +; CHECK-NEXT: lui t2, %hi(f2) ; CHECK-NEXT: fld fs11, %lo(d6)(a0) -; CHECK-NEXT: lui a0, %hi(d5) -; CHECK-NEXT: fld fs10, %lo(d5)(a0) -; CHECK-NEXT: lui a0, %hi(d4) -; CHECK-NEXT: fld fs9, %lo(d4)(a0) -; CHECK-NEXT: lui a0, %hi(d3) -; CHECK-NEXT: fld fs8, %lo(d3)(a0) -; CHECK-NEXT: lui a0, %hi(d2) -; CHECK-NEXT: fld fs7, %lo(d2)(a0) -; CHECK-NEXT: lui a0, %hi(d1) -; CHECK-NEXT: fld fs6, %lo(d1)(a0) -; CHECK-NEXT: lui a0, %hi(f6) -; CHECK-NEXT: flw fs5, %lo(f6)(a0) -; CHECK-NEXT: lui a0, %hi(f5) -; CHECK-NEXT: flw fs4, %lo(f5)(a0) -; CHECK-NEXT: lui a0, %hi(f4) -; CHECK-NEXT: flw fs3, %lo(f4)(a0) -; CHECK-NEXT: lui a0, %hi(f3) -; CHECK-NEXT: flw fs2, %lo(f3)(a0) -; CHECK-NEXT: lui a0, %hi(f2) -; CHECK-NEXT: flw fs1, %lo(f2)(a0) ; CHECK-NEXT: lui a0, %hi(f1) +; CHECK-NEXT: fld fs10, %lo(d5)(a1) +; CHECK-NEXT: lui a1, %hi(splim) +; CHECK-NEXT: fld fs9, %lo(d4)(a2) +; CHECK-NEXT: lui a2, 
%hi(r7) +; CHECK-NEXT: fld fs8, %lo(d3)(a3) +; CHECK-NEXT: lui a3, %hi(r6) +; CHECK-NEXT: fld fs7, %lo(d2)(a4) +; CHECK-NEXT: lui a4, %hi(r5) +; CHECK-NEXT: fld fs6, %lo(d1)(a5) +; CHECK-NEXT: lui a5, %hi(r4) +; CHECK-NEXT: flw fs5, %lo(f6)(a6) +; CHECK-NEXT: lui a6, %hi(r3) +; CHECK-NEXT: flw fs4, %lo(f5)(a7) +; CHECK-NEXT: lui a7, %hi(r2) +; CHECK-NEXT: flw fs3, %lo(f4)(t0) +; CHECK-NEXT: lui t0, %hi(r1) +; CHECK-NEXT: flw fs2, %lo(f3)(t1) +; CHECK-NEXT: lui t1, %hi(hp) +; CHECK-NEXT: flw fs1, %lo(f2)(t2) +; CHECK-NEXT: lui t2, %hi(sp) ; CHECK-NEXT: flw fs0, %lo(f1)(a0) -; CHECK-NEXT: lui a0, %hi(splim) -; CHECK-NEXT: lw s11, %lo(splim)(a0) -; CHECK-NEXT: lui a0, %hi(r7) -; CHECK-NEXT: lw s10, %lo(r7)(a0) -; CHECK-NEXT: lui a0, %hi(r6) -; CHECK-NEXT: lw s9, %lo(r6)(a0) -; CHECK-NEXT: lui a0, %hi(r5) -; CHECK-NEXT: lw s8, %lo(r5)(a0) -; CHECK-NEXT: lui a0, %hi(r4) -; CHECK-NEXT: lw s7, %lo(r4)(a0) -; CHECK-NEXT: lui a0, %hi(r3) -; CHECK-NEXT: lw s6, %lo(r3)(a0) -; CHECK-NEXT: lui a0, %hi(r2) -; CHECK-NEXT: lw s5, %lo(r2)(a0) -; CHECK-NEXT: lui a0, %hi(r1) -; CHECK-NEXT: lw s4, %lo(r1)(a0) -; CHECK-NEXT: lui a0, %hi(hp) -; CHECK-NEXT: lw s3, %lo(hp)(a0) -; CHECK-NEXT: lui a0, %hi(sp) -; CHECK-NEXT: lw s2, %lo(sp)(a0) ; CHECK-NEXT: lui a0, %hi(base) +; CHECK-NEXT: lw s11, %lo(splim)(a1) +; CHECK-NEXT: lw s10, %lo(r7)(a2) +; CHECK-NEXT: lw s9, %lo(r6)(a3) +; CHECK-NEXT: lw s8, %lo(r5)(a4) +; CHECK-NEXT: lw s7, %lo(r4)(a5) +; CHECK-NEXT: lw s6, %lo(r3)(a6) +; CHECK-NEXT: lw s5, %lo(r2)(a7) +; CHECK-NEXT: lw s4, %lo(r1)(t0) +; CHECK-NEXT: lw s3, %lo(hp)(t1) +; CHECK-NEXT: lw s2, %lo(sp)(t2) ; CHECK-NEXT: lw s1, %lo(base)(a0) ; CHECK-NEXT: tail bar entry: diff --git a/llvm/test/CodeGen/RISCV/ghccc-rv64.ll b/llvm/test/CodeGen/RISCV/ghccc-rv64.ll index 79afd4bc375d5..8e3fd2ca709aa 100644 --- a/llvm/test/CodeGen/RISCV/ghccc-rv64.ll +++ b/llvm/test/CodeGen/RISCV/ghccc-rv64.ll @@ -33,50 +33,50 @@ define ghccc void @foo() nounwind { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a0, %hi(d6) +; CHECK-NEXT: lui a1, %hi(d5) +; CHECK-NEXT: lui a2, %hi(d4) +; CHECK-NEXT: lui a3, %hi(d3) +; CHECK-NEXT: lui a4, %hi(d2) +; CHECK-NEXT: lui a5, %hi(d1) +; CHECK-NEXT: lui a6, %hi(f6) +; CHECK-NEXT: lui a7, %hi(f5) +; CHECK-NEXT: lui t0, %hi(f4) +; CHECK-NEXT: lui t1, %hi(f3) +; CHECK-NEXT: lui t2, %hi(f2) ; CHECK-NEXT: fld fs11, %lo(d6)(a0) -; CHECK-NEXT: lui a0, %hi(d5) -; CHECK-NEXT: fld fs10, %lo(d5)(a0) -; CHECK-NEXT: lui a0, %hi(d4) -; CHECK-NEXT: fld fs9, %lo(d4)(a0) -; CHECK-NEXT: lui a0, %hi(d3) -; CHECK-NEXT: fld fs8, %lo(d3)(a0) -; CHECK-NEXT: lui a0, %hi(d2) -; CHECK-NEXT: fld fs7, %lo(d2)(a0) -; CHECK-NEXT: lui a0, %hi(d1) -; CHECK-NEXT: fld fs6, %lo(d1)(a0) -; CHECK-NEXT: lui a0, %hi(f6) -; CHECK-NEXT: flw fs5, %lo(f6)(a0) -; CHECK-NEXT: lui a0, %hi(f5) -; CHECK-NEXT: flw fs4, %lo(f5)(a0) -; CHECK-NEXT: lui a0, %hi(f4) -; CHECK-NEXT: flw fs3, %lo(f4)(a0) -; CHECK-NEXT: lui a0, %hi(f3) -; CHECK-NEXT: flw fs2, %lo(f3)(a0) -; CHECK-NEXT: lui a0, %hi(f2) -; CHECK-NEXT: flw fs1, %lo(f2)(a0) ; CHECK-NEXT: lui a0, %hi(f1) +; CHECK-NEXT: fld fs10, %lo(d5)(a1) +; CHECK-NEXT: lui a1, %hi(splim) +; CHECK-NEXT: fld fs9, %lo(d4)(a2) +; CHECK-NEXT: lui a2, %hi(r7) +; CHECK-NEXT: fld fs8, %lo(d3)(a3) +; CHECK-NEXT: lui a3, %hi(r6) +; CHECK-NEXT: fld fs7, %lo(d2)(a4) +; CHECK-NEXT: lui a4, %hi(r5) +; CHECK-NEXT: fld fs6, %lo(d1)(a5) +; CHECK-NEXT: lui a5, %hi(r4) +; CHECK-NEXT: flw fs5, %lo(f6)(a6) +; CHECK-NEXT: lui a6, %hi(r3) +; CHECK-NEXT: flw fs4, %lo(f5)(a7) +; CHECK-NEXT: lui a7, %hi(r2) 
+; CHECK-NEXT: flw fs3, %lo(f4)(t0) +; CHECK-NEXT: lui t0, %hi(r1) +; CHECK-NEXT: flw fs2, %lo(f3)(t1) +; CHECK-NEXT: lui t1, %hi(hp) +; CHECK-NEXT: flw fs1, %lo(f2)(t2) +; CHECK-NEXT: lui t2, %hi(sp) ; CHECK-NEXT: flw fs0, %lo(f1)(a0) -; CHECK-NEXT: lui a0, %hi(splim) -; CHECK-NEXT: ld s11, %lo(splim)(a0) -; CHECK-NEXT: lui a0, %hi(r7) -; CHECK-NEXT: ld s10, %lo(r7)(a0) -; CHECK-NEXT: lui a0, %hi(r6) -; CHECK-NEXT: ld s9, %lo(r6)(a0) -; CHECK-NEXT: lui a0, %hi(r5) -; CHECK-NEXT: ld s8, %lo(r5)(a0) -; CHECK-NEXT: lui a0, %hi(r4) -; CHECK-NEXT: ld s7, %lo(r4)(a0) -; CHECK-NEXT: lui a0, %hi(r3) -; CHECK-NEXT: ld s6, %lo(r3)(a0) -; CHECK-NEXT: lui a0, %hi(r2) -; CHECK-NEXT: ld s5, %lo(r2)(a0) -; CHECK-NEXT: lui a0, %hi(r1) -; CHECK-NEXT: ld s4, %lo(r1)(a0) -; CHECK-NEXT: lui a0, %hi(hp) -; CHECK-NEXT: ld s3, %lo(hp)(a0) -; CHECK-NEXT: lui a0, %hi(sp) -; CHECK-NEXT: ld s2, %lo(sp)(a0) ; CHECK-NEXT: lui a0, %hi(base) +; CHECK-NEXT: ld s11, %lo(splim)(a1) +; CHECK-NEXT: ld s10, %lo(r7)(a2) +; CHECK-NEXT: ld s9, %lo(r6)(a3) +; CHECK-NEXT: ld s8, %lo(r5)(a4) +; CHECK-NEXT: ld s7, %lo(r4)(a5) +; CHECK-NEXT: ld s6, %lo(r3)(a6) +; CHECK-NEXT: ld s5, %lo(r2)(a7) +; CHECK-NEXT: ld s4, %lo(r1)(t0) +; CHECK-NEXT: ld s3, %lo(hp)(t1) +; CHECK-NEXT: ld s2, %lo(sp)(t2) ; CHECK-NEXT: ld s1, %lo(base)(a0) ; CHECK-NEXT: tail bar entry: diff --git a/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll b/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll index 6437beae09015..abc555b994a3b 100644 --- a/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll +++ b/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll @@ -14,17 +14,17 @@ define ghccc void @caller_float() nounwind { ; CHECK-LABEL: caller_float: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a0, %hi(f6) +; CHECK-NEXT: lui a1, %hi(f5) +; CHECK-NEXT: lui a2, %hi(f4) +; CHECK-NEXT: lui a3, %hi(f3) +; CHECK-NEXT: lui a4, %hi(f2) +; CHECK-NEXT: lui a5, %hi(f1) ; CHECK-NEXT: lw s6, %lo(f6)(a0) -; CHECK-NEXT: lui a0, %hi(f5) -; CHECK-NEXT: lw s5, %lo(f5)(a0) -; CHECK-NEXT: lui a0, %hi(f4) -; CHECK-NEXT: lw s4, %lo(f4)(a0) -; CHECK-NEXT: lui a0, %hi(f3) -; CHECK-NEXT: lw s3, %lo(f3)(a0) -; CHECK-NEXT: lui a0, %hi(f2) -; CHECK-NEXT: lw s2, %lo(f2)(a0) -; CHECK-NEXT: lui a0, %hi(f1) -; CHECK-NEXT: lw s1, %lo(f1)(a0) +; CHECK-NEXT: lw s5, %lo(f5)(a1) +; CHECK-NEXT: lw s4, %lo(f4)(a2) +; CHECK-NEXT: lw s3, %lo(f3)(a3) +; CHECK-NEXT: lw s2, %lo(f2)(a4) +; CHECK-NEXT: lw s1, %lo(f1)(a5) ; CHECK-NEXT: tail callee_float entry: %0 = load float, ptr @f6 @@ -50,17 +50,17 @@ define ghccc void @caller_double() nounwind { ; CHECK-LABEL: caller_double: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a0, %hi(d6) +; CHECK-NEXT: lui a1, %hi(d5) +; CHECK-NEXT: lui a2, %hi(d4) +; CHECK-NEXT: lui a3, %hi(d3) +; CHECK-NEXT: lui a4, %hi(d2) +; CHECK-NEXT: lui a5, %hi(d1) ; CHECK-NEXT: ld s6, %lo(d6)(a0) -; CHECK-NEXT: lui a0, %hi(d5) -; CHECK-NEXT: ld s5, %lo(d5)(a0) -; CHECK-NEXT: lui a0, %hi(d4) -; CHECK-NEXT: ld s4, %lo(d4)(a0) -; CHECK-NEXT: lui a0, %hi(d3) -; CHECK-NEXT: ld s3, %lo(d3)(a0) -; CHECK-NEXT: lui a0, %hi(d2) -; CHECK-NEXT: ld s2, %lo(d2)(a0) -; CHECK-NEXT: lui a0, %hi(d1) -; CHECK-NEXT: ld s1, %lo(d1)(a0) +; CHECK-NEXT: ld s5, %lo(d5)(a1) +; CHECK-NEXT: ld s4, %lo(d4)(a2) +; CHECK-NEXT: ld s3, %lo(d3)(a3) +; CHECK-NEXT: ld s2, %lo(d2)(a4) +; CHECK-NEXT: ld s1, %lo(d1)(a5) ; CHECK-NEXT: tail callee_double entry: %0 = load double, ptr @d6 diff --git a/llvm/test/CodeGen/RISCV/global-merge.ll b/llvm/test/CodeGen/RISCV/global-merge.ll index 633ba719c6a30..9dde032b69f8b 100644 --- 
a/llvm/test/CodeGen/RISCV/global-merge.ll +++ b/llvm/test/CodeGen/RISCV/global-merge.ll @@ -23,12 +23,12 @@ define void @f1(i32 %a) nounwind { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, %hi(eg1) +; CHECK-NEXT: sw a0, %lo(eg1)(a1) ; CHECK-NEXT: lui a1, %hi(.L_MergedGlobals) ; CHECK-NEXT: sw a0, %lo(.L_MergedGlobals)(a1) ; CHECK-NEXT: addi a1, a1, %lo(.L_MergedGlobals) ; CHECK-NEXT: sw a0, 4(a1) -; CHECK-NEXT: lui a1, %hi(eg1) -; CHECK-NEXT: sw a0, %lo(eg1)(a1) ; CHECK-NEXT: lui a1, %hi(eg2) ; CHECK-NEXT: sw a0, %lo(eg2)(a1) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/half-arith-strict.ll b/llvm/test/CodeGen/RISCV/half-arith-strict.ll index 4e4aad7309791..636739cf38984 100644 --- a/llvm/test/CodeGen/RISCV/half-arith-strict.ll +++ b/llvm/test/CodeGen/RISCV/half-arith-strict.ll @@ -243,28 +243,28 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZFHMIN: # %bb.0: ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa2 ; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-NEXT: lui a0, 1048568 +; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa1 ; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECK-ZFHMIN-NEXT: lui a1, 1048568 -; CHECK-ZFHMIN-NEXT: xor a0, a0, a1 +; CHECK-ZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECK-ZFHMIN-NEXT: xor a0, a1, a0 ; CHECK-ZFHMIN-NEXT: fmv.h.x fa5, a0 ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa4, fa3, fa5 ; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECK-ZFHMIN-NEXT: ret ; ; CHECK-ZHINXMIN-LABEL: fmsub_h: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-NEXT: lui a3, 1048568 +; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero ; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: lui a3, 1048568 ; CHECK-ZHINXMIN-NEXT: xor a2, a2, a3 ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 ; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 @@ -295,17 +295,17 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZFHMIN: # %bb.0: ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa0 ; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa2 +; CHECK-ZFHMIN-NEXT: lui a0, 1048568 +; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-ZFHMIN-NEXT: fadd.s fa4, fa3, fa4 +; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECK-ZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECK-ZFHMIN-NEXT: lui a1, 1048568 -; CHECK-ZFHMIN-NEXT: xor a0, a0, a1 -; CHECK-ZFHMIN-NEXT: fmv.h.x fa5, a0 -; CHECK-ZFHMIN-NEXT: fmv.x.h a0, fa4 -; CHECK-ZFHMIN-NEXT: xor a0, a0, a1 +; CHECK-ZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECK-ZFHMIN-NEXT: fmv.x.h a2, fa4 +; CHECK-ZFHMIN-NEXT: xor a1, a1, a0 +; CHECK-ZFHMIN-NEXT: xor a0, a2, a0 +; CHECK-ZFHMIN-NEXT: fmv.h.x fa5, a1 ; CHECK-ZFHMIN-NEXT: fmv.h.x fa4, a0 ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa4 ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 @@ -317,12 +317,12 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINXMIN-LABEL: fnmadd_h: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; 
CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-NEXT: lui a3, 1048568 +; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero ; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: lui a3, 1048568 ; CHECK-ZHINXMIN-NEXT: xor a0, a0, a3 ; CHECK-ZHINXMIN-NEXT: xor a2, a2, a3 ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 @@ -359,17 +359,17 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZFHMIN: # %bb.0: ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa1 ; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa2 +; CHECK-ZFHMIN-NEXT: lui a0, 1048568 +; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-ZFHMIN-NEXT: fadd.s fa4, fa3, fa4 +; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECK-ZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECK-ZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECK-ZFHMIN-NEXT: lui a1, 1048568 -; CHECK-ZFHMIN-NEXT: xor a0, a0, a1 -; CHECK-ZFHMIN-NEXT: fmv.h.x fa5, a0 -; CHECK-ZFHMIN-NEXT: fmv.x.h a0, fa4 -; CHECK-ZFHMIN-NEXT: xor a0, a0, a1 +; CHECK-ZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECK-ZFHMIN-NEXT: fmv.x.h a2, fa4 +; CHECK-ZFHMIN-NEXT: xor a1, a1, a0 +; CHECK-ZFHMIN-NEXT: xor a0, a2, a0 +; CHECK-ZFHMIN-NEXT: fmv.h.x fa5, a1 ; CHECK-ZFHMIN-NEXT: fmv.h.x fa4, a0 ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa4 ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 @@ -381,12 +381,12 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZHINXMIN-LABEL: fnmadd_h_2: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECK-ZHINXMIN-NEXT: lui a3, 1048568 +; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero ; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero +; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 ; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: lui a3, 1048568 ; CHECK-ZHINXMIN-NEXT: xor a1, a1, a3 ; CHECK-ZHINXMIN-NEXT: xor a2, a2, a3 ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 @@ -421,28 +421,28 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZFHMIN: # %bb.0: ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa0 ; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-NEXT: lui a0, 1048568 +; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa2 ; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECK-ZFHMIN-NEXT: lui a1, 1048568 -; CHECK-ZFHMIN-NEXT: xor a0, a0, a1 +; CHECK-ZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECK-ZFHMIN-NEXT: xor a0, a1, a0 ; CHECK-ZFHMIN-NEXT: fmv.h.x fa5, a0 ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa5, fa4, fa3 ; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECK-ZFHMIN-NEXT: ret ; ; CHECK-ZHINXMIN-LABEL: fnmsub_h: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECK-ZHINXMIN-NEXT: lui a3, 1048568 +; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 ; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero ; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: lui a3, 1048568 ; CHECK-ZHINXMIN-NEXT: xor a0, a0, a3 ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECK-ZHINXMIN-NEXT: fmadd.s 
a0, a0, a1, a2 ; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 @@ -471,28 +471,28 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind strictfp { ; CHECK-ZFHMIN: # %bb.0: ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa1 ; CHECK-ZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECK-ZFHMIN-NEXT: lui a0, 1048568 +; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa2 ; CHECK-ZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-ZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECK-ZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECK-ZFHMIN-NEXT: lui a1, 1048568 -; CHECK-ZFHMIN-NEXT: xor a0, a0, a1 +; CHECK-ZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECK-ZFHMIN-NEXT: xor a0, a1, a0 ; CHECK-ZFHMIN-NEXT: fmv.h.x fa5, a0 ; CHECK-ZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECK-ZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; CHECK-ZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECK-ZFHMIN-NEXT: fmadd.s fa5, fa4, fa5, fa3 ; CHECK-ZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECK-ZFHMIN-NEXT: ret ; ; CHECK-ZHINXMIN-LABEL: fnmsub_h_2: ; CHECK-ZHINXMIN: # %bb.0: ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECK-ZHINXMIN-NEXT: lui a3, 1048568 +; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 ; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero ; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: lui a3, 1048568 ; CHECK-ZHINXMIN-NEXT: xor a1, a1, a3 ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 ; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 ; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index 4c2deafdc7e66..a218e89948d4b 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -425,8 +425,8 @@ define half @fsgnj_h(half %a, half %b) nounwind { ; RV32I-LABEL: fsgnj_h: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 1048568 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 17 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: srli a0, a0, 17 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret @@ -434,8 +434,8 @@ define half @fsgnj_h(half %a, half %b) nounwind { ; RV64I-LABEL: fsgnj_h: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 1048568 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 49 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: srli a0, a0, 49 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -469,8 +469,8 @@ define half @fsgnj_h(half %a, half %b) nounwind { ; RV32IZHINXMIN-NEXT: # kill: def $x11_h killed $x11_h def $x11 ; RV32IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV32IZHINXMIN-NEXT: lui a2, 1048568 -; RV32IZHINXMIN-NEXT: and a1, a1, a2 ; RV32IZHINXMIN-NEXT: slli a0, a0, 17 +; RV32IZHINXMIN-NEXT: and a1, a1, a2 ; RV32IZHINXMIN-NEXT: srli a0, a0, 17 ; RV32IZHINXMIN-NEXT: or a0, a0, a1 ; RV32IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h killed $x10 @@ -481,8 +481,8 @@ define half @fsgnj_h(half %a, half %b) nounwind { ; RV64IZHINXMIN-NEXT: # kill: def $x11_h killed $x11_h def $x11 ; RV64IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV64IZHINXMIN-NEXT: lui a2, 1048568 -; RV64IZHINXMIN-NEXT: and a1, a1, a2 ; RV64IZHINXMIN-NEXT: slli a0, a0, 49 +; RV64IZHINXMIN-NEXT: and a1, a1, a2 ; RV64IZHINXMIN-NEXT: srli a0, a0, 49 ; RV64IZHINXMIN-NEXT: or a0, a0, a1 ; RV64IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h killed $x10 @@ -573,11 +573,11 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; CHECKIZFHMIN-LABEL: fneg_h: ; CHECKIZFHMIN: # %bb.0: ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECKIZFHMIN-NEXT: lui a0, 1048568 ; CHECKIZFHMIN-NEXT: fadd.s 
fa5, fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECKIZFHMIN-NEXT: lui a1, 1048568 -; CHECKIZFHMIN-NEXT: xor a0, a0, a1 +; CHECKIZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECKIZFHMIN-NEXT: xor a0, a1, a0 ; CHECKIZFHMIN-NEXT: fmv.h.x fa4, a0 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 @@ -587,9 +587,9 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; CHECKIZHINXMIN-LABEL: fneg_h: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: lui a1, 1048568 ; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKIZHINXMIN-NEXT: lui a1, 1048568 ; CHECKIZHINXMIN-NEXT: xor a1, a0, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 @@ -644,8 +644,8 @@ define half @fsgnjn_h(half %a, half %b) nounwind { ; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: lui a1, 1048568 -; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: slli s1, s1, 17 +; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: srli s1, s1, 17 ; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -683,8 +683,8 @@ define half @fsgnjn_h(half %a, half %b) nounwind { ; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: call __truncsfhf2 ; RV64I-NEXT: lui a1, 1048568 -; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli s1, s1, 49 +; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: srli s1, s1, 49 ; RV64I-NEXT: or a0, s1, a0 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -699,12 +699,12 @@ define half @fsgnjn_h(half %a, half %b) nounwind { ; RV32IZFHMIN: # %bb.0: ; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa1 ; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; RV32IZFHMIN-NEXT: lui a0, 1048568 ; RV32IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; RV32IZFHMIN-NEXT: fmv.x.h a0, fa5 -; RV32IZFHMIN-NEXT: not a0, a0 -; RV32IZFHMIN-NEXT: lui a1, 1048568 -; RV32IZFHMIN-NEXT: and a0, a0, a1 +; RV32IZFHMIN-NEXT: fmv.x.h a1, fa5 +; RV32IZFHMIN-NEXT: not a1, a1 +; RV32IZFHMIN-NEXT: and a0, a1, a0 ; RV32IZFHMIN-NEXT: fmv.x.h a1, fa0 ; RV32IZFHMIN-NEXT: slli a1, a1, 17 ; RV32IZFHMIN-NEXT: srli a1, a1, 17 @@ -716,12 +716,12 @@ define half @fsgnjn_h(half %a, half %b) nounwind { ; RV64IZFHMIN: # %bb.0: ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa1 ; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; RV64IZFHMIN-NEXT: lui a0, 1048568 ; RV64IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; RV64IZFHMIN-NEXT: fmv.x.h a0, fa5 -; RV64IZFHMIN-NEXT: not a0, a0 -; RV64IZFHMIN-NEXT: lui a1, 1048568 -; RV64IZFHMIN-NEXT: and a0, a0, a1 +; RV64IZFHMIN-NEXT: fmv.x.h a1, fa5 +; RV64IZFHMIN-NEXT: not a1, a1 +; RV64IZFHMIN-NEXT: and a0, a1, a0 ; RV64IZFHMIN-NEXT: fmv.x.h a1, fa0 ; RV64IZFHMIN-NEXT: slli a1, a1, 49 ; RV64IZFHMIN-NEXT: srli a1, a1, 49 @@ -735,11 +735,11 @@ define half @fsgnjn_h(half %a, half %b) nounwind { ; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 ; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a0 ; RV32IZHINXMIN-NEXT: fadd.s a1, a2, a1 +; RV32IZHINXMIN-NEXT: lui a2, 1048568 +; RV32IZHINXMIN-NEXT: slli a0, a0, 17 ; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 ; RV32IZHINXMIN-NEXT: not a1, a1 -; RV32IZHINXMIN-NEXT: lui a2, 1048568 ; RV32IZHINXMIN-NEXT: and a1, a1, a2 -; RV32IZHINXMIN-NEXT: slli a0, a0, 17 ; RV32IZHINXMIN-NEXT: srli a0, a0, 17 ; RV32IZHINXMIN-NEXT: or a0, a0, a1 ; RV32IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h killed $x10 @@ -751,11 +751,11 @@ define half @fsgnjn_h(half %a, half %b) nounwind { ; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 ; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a0 ; 
RV64IZHINXMIN-NEXT: fadd.s a1, a2, a1 +; RV64IZHINXMIN-NEXT: lui a2, 1048568 +; RV64IZHINXMIN-NEXT: slli a0, a0, 49 ; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 ; RV64IZHINXMIN-NEXT: not a1, a1 -; RV64IZHINXMIN-NEXT: lui a2, 1048568 ; RV64IZHINXMIN-NEXT: and a1, a1, a2 -; RV64IZHINXMIN-NEXT: slli a0, a0, 49 ; RV64IZHINXMIN-NEXT: srli a0, a0, 49 ; RV64IZHINXMIN-NEXT: or a0, a0, a1 ; RV64IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h killed $x10 @@ -1298,28 +1298,28 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN: # %bb.0: ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa2 ; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECKIZFHMIN-NEXT: lui a0, 1048568 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa1 ; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECKIZFHMIN-NEXT: lui a1, 1048568 -; CHECKIZFHMIN-NEXT: xor a0, a0, a1 +; CHECKIZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECKIZFHMIN-NEXT: xor a0, a1, a0 ; CHECKIZFHMIN-NEXT: fmv.h.x fa5, a0 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa4, fa3, fa5 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: fmsub_h: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: lui a3, 1048568 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECKIZHINXMIN-NEXT: fadd.s a2, a2, zero ; CHECKIZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKIZHINXMIN-NEXT: lui a3, 1048568 ; CHECKIZHINXMIN-NEXT: xor a2, a2, a3 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECKIZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 @@ -1466,17 +1466,17 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN: # %bb.0: ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 ; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa2 +; CHECKIZFHMIN-NEXT: lui a0, 1048568 +; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECKIZFHMIN-NEXT: fadd.s fa4, fa3, fa4 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECKIZFHMIN-NEXT: lui a1, 1048568 -; CHECKIZFHMIN-NEXT: xor a0, a0, a1 -; CHECKIZFHMIN-NEXT: fmv.h.x fa5, a0 -; CHECKIZFHMIN-NEXT: fmv.x.h a0, fa4 -; CHECKIZFHMIN-NEXT: xor a0, a0, a1 +; CHECKIZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECKIZFHMIN-NEXT: fmv.x.h a2, fa4 +; CHECKIZFHMIN-NEXT: xor a1, a1, a0 +; CHECKIZFHMIN-NEXT: xor a0, a2, a0 +; CHECKIZFHMIN-NEXT: fmv.h.x fa5, a1 ; CHECKIZFHMIN-NEXT: fmv.h.x fa4, a0 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 @@ -1488,12 +1488,12 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind { ; CHECKIZHINXMIN-LABEL: fnmadd_h: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: lui a3, 1048568 +; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, zero ; CHECKIZHINXMIN-NEXT: fadd.s a2, a2, zero +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKIZHINXMIN-NEXT: lui a3, 1048568 ; CHECKIZHINXMIN-NEXT: xor a0, a0, a3 ; CHECKIZHINXMIN-NEXT: xor a2, a2, a3 ; 
CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 @@ -1646,17 +1646,17 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN: # %bb.0: ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 ; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa2 +; CHECKIZFHMIN-NEXT: lui a0, 1048568 +; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECKIZFHMIN-NEXT: fadd.s fa4, fa3, fa4 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 -; CHECKIZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECKIZFHMIN-NEXT: lui a1, 1048568 -; CHECKIZFHMIN-NEXT: xor a0, a0, a1 -; CHECKIZFHMIN-NEXT: fmv.h.x fa5, a0 -; CHECKIZFHMIN-NEXT: fmv.x.h a0, fa4 -; CHECKIZFHMIN-NEXT: xor a0, a0, a1 +; CHECKIZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECKIZFHMIN-NEXT: fmv.x.h a2, fa4 +; CHECKIZFHMIN-NEXT: xor a1, a1, a0 +; CHECKIZFHMIN-NEXT: xor a0, a2, a0 +; CHECKIZFHMIN-NEXT: fmv.h.x fa5, a1 ; CHECKIZFHMIN-NEXT: fmv.h.x fa4, a0 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 @@ -1668,12 +1668,12 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind { ; CHECKIZHINXMIN-LABEL: fnmadd_h_2: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECKIZHINXMIN-NEXT: fcvt.h.s a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: lui a3, 1048568 +; CHECKIZHINXMIN-NEXT: fadd.s a1, a1, zero ; CHECKIZHINXMIN-NEXT: fadd.s a2, a2, zero +; CHECKIZHINXMIN-NEXT: fcvt.h.s a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKIZHINXMIN-NEXT: lui a3, 1048568 ; CHECKIZHINXMIN-NEXT: xor a1, a1, a3 ; CHECKIZHINXMIN-NEXT: xor a2, a2, a3 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 @@ -2039,28 +2039,28 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN: # %bb.0: ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 ; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECKIZFHMIN-NEXT: lui a0, 1048568 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa2 ; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECKIZFHMIN-NEXT: lui a1, 1048568 -; CHECKIZFHMIN-NEXT: xor a0, a0, a1 +; CHECKIZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECKIZFHMIN-NEXT: xor a0, a1, a0 ; CHECKIZFHMIN-NEXT: fmv.h.x fa5, a0 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa5, fa4, fa3 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: fnmsub_h: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: lui a3, 1048568 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 ; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, zero ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKIZHINXMIN-NEXT: lui a3, 1048568 ; CHECKIZHINXMIN-NEXT: xor a0, a0, a3 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECKIZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 @@ -2177,28 +2177,28 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN: # %bb.0: ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 ; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECKIZFHMIN-NEXT: lui a0, 1048568 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa2 ; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; 
CHECKIZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECKIZFHMIN-NEXT: lui a1, 1048568 -; CHECKIZFHMIN-NEXT: xor a0, a0, a1 +; CHECKIZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECKIZFHMIN-NEXT: xor a0, a1, a0 ; CHECKIZFHMIN-NEXT: fmv.h.x fa5, a0 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa2 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa4, fa5, fa3 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: fnmsub_h_2: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: lui a3, 1048568 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 ; CHECKIZHINXMIN-NEXT: fadd.s a1, a1, zero ; CHECKIZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKIZHINXMIN-NEXT: lui a3, 1048568 ; CHECKIZHINXMIN-NEXT: xor a1, a1, a3 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECKIZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 @@ -2430,11 +2430,11 @@ define half @fmsub_h_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN: # %bb.0: ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa2 ; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa2, fa0 ; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; CHECKIZFHMIN-NEXT: fmul.s fa4, fa2, fa3 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmul.s fa4, fa3, fa4 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 @@ -2445,13 +2445,13 @@ define half @fmsub_h_contract(half %a, half %b, half %c) nounwind { ; CHECKIZHINXMIN-LABEL: fmsub_h_contract: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKIZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKIZHINXMIN-NEXT: fcvt.h.s a2, a2 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fadd.s a2, a2, zero ; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a1, a2 ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECKIZHINXMIN-NEXT: fsub.s a0, a0, a1 ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 @@ -2606,21 +2606,21 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN: # %bb.0: ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 ; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa1 -; CHECKIZFHMIN-NEXT: fadd.s fa3, fa3, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa3, fa3 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa2, fa2 +; CHECKIZFHMIN-NEXT: lui a0, 1048568 +; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; CHECKIZFHMIN-NEXT: fadd.s fa3, fa3, fa4 ; CHECKIZFHMIN-NEXT: fadd.s fa4, fa2, fa4 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa3, fa3 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa3 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 ; CHECKIZFHMIN-NEXT: fmul.s fa5, fa5, fa3 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; CHECKIZFHMIN-NEXT: fmv.x.h a0, fa5 -; CHECKIZFHMIN-NEXT: lui a1, 1048568 -; CHECKIZFHMIN-NEXT: xor a0, a0, a1 +; CHECKIZFHMIN-NEXT: fmv.x.h a1, fa5 +; CHECKIZFHMIN-NEXT: xor a0, a1, a0 ; CHECKIZFHMIN-NEXT: fmv.h.x 
fa5, a0 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 @@ -2631,19 +2631,19 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind { ; CHECKIZHINXMIN-LABEL: fnmadd_h_contract: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, zero ; CHECKIZHINXMIN-NEXT: fadd.s a1, a1, zero +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKIZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKIZHINXMIN-NEXT: fcvt.h.s a2, a2 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: lui a1, 1048568 +; CHECKIZHINXMIN-NEXT: fadd.s a2, a2, zero +; CHECKIZHINXMIN-NEXT: fcvt.h.s a2, a2 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: xor a0, a0, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a2 @@ -2781,10 +2781,10 @@ define half @fnmsub_h_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN: # %bb.0: ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 ; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECKIZFHMIN-NEXT: fadd.s fa4, fa3, fa4 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa4, fa4 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 @@ -2799,10 +2799,10 @@ define half @fnmsub_h_contract(half %a, half %b, half %c) nounwind { ; CHECKIZHINXMIN-LABEL: fnmsub_h_contract: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, zero ; CHECKIZHINXMIN-NEXT: fadd.s a1, a1, zero +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.h.s a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 diff --git a/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll index e0c47bfac6fec..730bde5af610b 100644 --- a/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll +++ b/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll @@ -165,8 +165,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: not a1, a1 ; RV32I-NEXT: lui a2, 1048568 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 17 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: srli a0, a0, 17 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret @@ -183,8 +183,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: not a1, a1 ; RV64I-NEXT: lui a2, 1048568 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 49 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: srli a0, a0, 49 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -211,8 +211,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; RV32IZFHMIN: # %bb.0: ; RV32IZFHMIN-NEXT: not a1, a1 ; RV32IZFHMIN-NEXT: lui a2, 1048568 -; RV32IZFHMIN-NEXT: and a1, a1, a2 ; RV32IZFHMIN-NEXT: slli a0, a0, 17 +; RV32IZFHMIN-NEXT: and a1, a1, a2 ; RV32IZFHMIN-NEXT: srli a0, a0, 17 ; RV32IZFHMIN-NEXT: 
or a0, a0, a1 ; RV32IZFHMIN-NEXT: ret @@ -221,8 +221,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; RV64IZFHMIN: # %bb.0: ; RV64IZFHMIN-NEXT: not a1, a1 ; RV64IZFHMIN-NEXT: lui a2, 1048568 -; RV64IZFHMIN-NEXT: and a1, a1, a2 ; RV64IZFHMIN-NEXT: slli a0, a0, 49 +; RV64IZFHMIN-NEXT: and a1, a1, a2 ; RV64IZFHMIN-NEXT: srli a0, a0, 49 ; RV64IZFHMIN-NEXT: or a0, a0, a1 ; RV64IZFHMIN-NEXT: ret @@ -233,8 +233,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; RV32IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV32IZHINXMIN-NEXT: not a1, a1 ; RV32IZHINXMIN-NEXT: lui a2, 1048568 -; RV32IZHINXMIN-NEXT: and a1, a1, a2 ; RV32IZHINXMIN-NEXT: slli a0, a0, 17 +; RV32IZHINXMIN-NEXT: and a1, a1, a2 ; RV32IZHINXMIN-NEXT: srli a0, a0, 17 ; RV32IZHINXMIN-NEXT: or a0, a0, a1 ; RV32IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h killed $x10 @@ -246,8 +246,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind { ; RV64IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV64IZHINXMIN-NEXT: not a1, a1 ; RV64IZHINXMIN-NEXT: lui a2, 1048568 -; RV64IZHINXMIN-NEXT: and a1, a1, a2 ; RV64IZHINXMIN-NEXT: slli a0, a0, 49 +; RV64IZHINXMIN-NEXT: and a1, a1, a2 ; RV64IZHINXMIN-NEXT: srli a0, a0, 49 ; RV64IZHINXMIN-NEXT: or a0, a0, a1 ; RV64IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h killed $x10 diff --git a/llvm/test/CodeGen/RISCV/half-br-fcmp.ll b/llvm/test/CodeGen/RISCV/half-br-fcmp.ll index 6699ee9479379..e9b142e33362f 100644 --- a/llvm/test/CodeGen/RISCV/half-br-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/half-br-fcmp.ll @@ -927,9 +927,9 @@ define void @br_fcmp_ord(half %a, half %b) nounwind { ; RV32IZFHMIN-LABEL: br_fcmp_ord: ; RV32IZFHMIN: # %bb.0: ; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa0 ; RV32IZFHMIN-NEXT: feq.s a0, fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; RV32IZFHMIN-NEXT: feq.s a1, fa5, fa5 +; RV32IZFHMIN-NEXT: feq.s a1, fa4, fa4 ; RV32IZFHMIN-NEXT: and a0, a1, a0 ; RV32IZFHMIN-NEXT: bnez a0, .LBB8_2 ; RV32IZFHMIN-NEXT: # %bb.1: # %if.else @@ -942,9 +942,9 @@ define void @br_fcmp_ord(half %a, half %b) nounwind { ; RV64IZFHMIN-LABEL: br_fcmp_ord: ; RV64IZFHMIN: # %bb.0: ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa0 ; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 +; RV64IZFHMIN-NEXT: feq.s a1, fa4, fa4 ; RV64IZFHMIN-NEXT: and a0, a1, a0 ; RV64IZFHMIN-NEXT: bnez a0, .LBB8_2 ; RV64IZFHMIN-NEXT: # %bb.1: # %if.else @@ -957,8 +957,8 @@ define void @br_fcmp_ord(half %a, half %b) nounwind { ; RV32IZHINXMIN-LABEL: br_fcmp_ord: ; RV32IZHINXMIN: # %bb.0: ; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV32IZHINXMIN-NEXT: feq.s a1, a1, a1 ; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: feq.s a1, a1, a1 ; RV32IZHINXMIN-NEXT: feq.s a0, a0, a0 ; RV32IZHINXMIN-NEXT: and a0, a0, a1 ; RV32IZHINXMIN-NEXT: bnez a0, .LBB8_2 @@ -972,8 +972,8 @@ define void @br_fcmp_ord(half %a, half %b) nounwind { ; RV64IZHINXMIN-LABEL: br_fcmp_ord: ; RV64IZHINXMIN: # %bb.0: ; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV64IZHINXMIN-NEXT: feq.s a1, a1, a1 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: feq.s a1, a1, a1 ; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 ; RV64IZHINXMIN-NEXT: and a0, a0, a1 ; RV64IZHINXMIN-NEXT: bnez a0, .LBB8_2 @@ -1694,9 +1694,9 @@ define void @br_fcmp_uno(half %a, half %b) nounwind { ; RV32IZFHMIN-LABEL: br_fcmp_uno: ; RV32IZFHMIN: # %bb.0: ; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa0 
; RV32IZFHMIN-NEXT: feq.s a0, fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; RV32IZFHMIN-NEXT: feq.s a1, fa5, fa5 +; RV32IZFHMIN-NEXT: feq.s a1, fa4, fa4 ; RV32IZFHMIN-NEXT: and a0, a1, a0 ; RV32IZFHMIN-NEXT: beqz a0, .LBB15_2 ; RV32IZFHMIN-NEXT: # %bb.1: # %if.else @@ -1709,9 +1709,9 @@ define void @br_fcmp_uno(half %a, half %b) nounwind { ; RV64IZFHMIN-LABEL: br_fcmp_uno: ; RV64IZFHMIN: # %bb.0: ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa0 ; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 +; RV64IZFHMIN-NEXT: feq.s a1, fa4, fa4 ; RV64IZFHMIN-NEXT: and a0, a1, a0 ; RV64IZFHMIN-NEXT: beqz a0, .LBB15_2 ; RV64IZFHMIN-NEXT: # %bb.1: # %if.else @@ -1724,8 +1724,8 @@ define void @br_fcmp_uno(half %a, half %b) nounwind { ; RV32IZHINXMIN-LABEL: br_fcmp_uno: ; RV32IZHINXMIN: # %bb.0: ; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV32IZHINXMIN-NEXT: feq.s a1, a1, a1 ; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: feq.s a1, a1, a1 ; RV32IZHINXMIN-NEXT: feq.s a0, a0, a0 ; RV32IZHINXMIN-NEXT: and a0, a0, a1 ; RV32IZHINXMIN-NEXT: beqz a0, .LBB15_2 @@ -1739,8 +1739,8 @@ define void @br_fcmp_uno(half %a, half %b) nounwind { ; RV64IZHINXMIN-LABEL: br_fcmp_uno: ; RV64IZHINXMIN: # %bb.0: ; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 -; RV64IZHINXMIN-NEXT: feq.s a1, a1, a1 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: feq.s a1, a1, a1 ; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 ; RV64IZHINXMIN-NEXT: and a0, a0, a1 ; RV64IZHINXMIN-NEXT: beqz a0, .LBB15_2 diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll index 0c84a08f1fd45..01ffcab1a6556 100644 --- a/llvm/test/CodeGen/RISCV/half-convert.ll +++ b/llvm/test/CodeGen/RISCV/half-convert.ll @@ -194,13 +194,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_si_h_sat: ; RV32IZFH: # %bb.0: # %start ; RV32IZFH-NEXT: fcvt.s.h fa5, fa0 -; RV32IZFH-NEXT: feq.s a0, fa5, fa5 -; RV32IZFH-NEXT: neg a0, a0 -; RV32IZFH-NEXT: lui a1, %hi(.LCPI1_0) -; RV32IZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV32IZFH-NEXT: lui a1, 815104 -; RV32IZFH-NEXT: fmv.w.x fa3, a1 +; RV32IZFH-NEXT: lui a0, %hi(.LCPI1_0) +; RV32IZFH-NEXT: feq.s a1, fa5, fa5 +; RV32IZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) +; RV32IZFH-NEXT: lui a0, 815104 +; RV32IZFH-NEXT: fmv.w.x fa3, a0 ; RV32IZFH-NEXT: fmax.s fa5, fa5, fa3 +; RV32IZFH-NEXT: neg a0, a1 ; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IZFH-NEXT: and a0, a0, a1 @@ -209,13 +209,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64IZFH-LABEL: fcvt_si_h_sat: ; RV64IZFH: # %bb.0: # %start ; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IZFH-NEXT: feq.s a0, fa5, fa5 -; RV64IZFH-NEXT: neg a0, a0 -; RV64IZFH-NEXT: lui a1, %hi(.LCPI1_0) -; RV64IZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV64IZFH-NEXT: lui a1, 815104 -; RV64IZFH-NEXT: fmv.w.x fa3, a1 +; RV64IZFH-NEXT: lui a0, %hi(.LCPI1_0) +; RV64IZFH-NEXT: feq.s a1, fa5, fa5 +; RV64IZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) +; RV64IZFH-NEXT: lui a0, 815104 +; RV64IZFH-NEXT: fmv.w.x fa3, a0 ; RV64IZFH-NEXT: fmax.s fa5, fa5, fa3 +; RV64IZFH-NEXT: neg a0, a1 ; RV64IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IZFH-NEXT: and a0, a0, a1 @@ -224,13 +224,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32IDZFH-LABEL: fcvt_si_h_sat: ; RV32IDZFH: # %bb.0: # %start ; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV32IDZFH-NEXT: feq.s a0, fa5, fa5 -; RV32IDZFH-NEXT: neg a0, a0 -; 
RV32IDZFH-NEXT: lui a1, %hi(.LCPI1_0) -; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV32IDZFH-NEXT: lui a1, 815104 -; RV32IDZFH-NEXT: fmv.w.x fa3, a1 +; RV32IDZFH-NEXT: lui a0, %hi(.LCPI1_0) +; RV32IDZFH-NEXT: feq.s a1, fa5, fa5 +; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) +; RV32IDZFH-NEXT: lui a0, 815104 +; RV32IDZFH-NEXT: fmv.w.x fa3, a0 ; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa3 +; RV32IDZFH-NEXT: neg a0, a1 ; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IDZFH-NEXT: and a0, a0, a1 @@ -239,13 +239,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64IDZFH-LABEL: fcvt_si_h_sat: ; RV64IDZFH: # %bb.0: # %start ; RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IDZFH-NEXT: feq.s a0, fa5, fa5 -; RV64IDZFH-NEXT: neg a0, a0 -; RV64IDZFH-NEXT: lui a1, %hi(.LCPI1_0) -; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV64IDZFH-NEXT: lui a1, 815104 -; RV64IDZFH-NEXT: fmv.w.x fa3, a1 +; RV64IDZFH-NEXT: lui a0, %hi(.LCPI1_0) +; RV64IDZFH-NEXT: feq.s a1, fa5, fa5 +; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) +; RV64IDZFH-NEXT: lui a0, 815104 +; RV64IDZFH-NEXT: fmv.w.x fa3, a0 ; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa3 +; RV64IDZFH-NEXT: neg a0, a1 ; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IDZFH-NEXT: and a0, a0, a1 @@ -254,57 +254,57 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32IZHINX-LABEL: fcvt_si_h_sat: ; RV32IZHINX: # %bb.0: # %start ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 -; RV32IZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZHINX-NEXT: neg a1, a1 -; RV32IZHINX-NEXT: lui a2, 815104 -; RV32IZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZHINX-NEXT: lui a1, 815104 ; RV32IZHINX-NEXT: lui a2, 290816 +; RV32IZHINX-NEXT: fmax.s a1, a0, a1 +; RV32IZHINX-NEXT: feq.s a0, a0, a0 ; RV32IZHINX-NEXT: addi a2, a2, -512 -; RV32IZHINX-NEXT: fmin.s a0, a0, a2 -; RV32IZHINX-NEXT: fcvt.w.s a0, a0, rtz -; RV32IZHINX-NEXT: and a0, a1, a0 +; RV32IZHINX-NEXT: neg a0, a0 +; RV32IZHINX-NEXT: fmin.s a1, a1, a2 +; RV32IZHINX-NEXT: fcvt.w.s a1, a1, rtz +; RV32IZHINX-NEXT: and a0, a0, a1 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: fcvt_si_h_sat: ; RV64IZHINX: # %bb.0: # %start ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 -; RV64IZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZHINX-NEXT: neg a1, a1 -; RV64IZHINX-NEXT: lui a2, 815104 -; RV64IZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZHINX-NEXT: lui a1, 815104 ; RV64IZHINX-NEXT: lui a2, 290816 +; RV64IZHINX-NEXT: fmax.s a1, a0, a1 +; RV64IZHINX-NEXT: feq.s a0, a0, a0 ; RV64IZHINX-NEXT: addiw a2, a2, -512 -; RV64IZHINX-NEXT: fmin.s a0, a0, a2 -; RV64IZHINX-NEXT: fcvt.l.s a0, a0, rtz -; RV64IZHINX-NEXT: and a0, a1, a0 +; RV64IZHINX-NEXT: neg a0, a0 +; RV64IZHINX-NEXT: fmin.s a1, a1, a2 +; RV64IZHINX-NEXT: fcvt.l.s a1, a1, rtz +; RV64IZHINX-NEXT: and a0, a0, a1 ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_si_h_sat: ; RV32IZDINXZHINX: # %bb.0: # %start ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 -; RV32IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZDINXZHINX-NEXT: neg a1, a1 -; RV32IZDINXZHINX-NEXT: lui a2, 815104 -; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZDINXZHINX-NEXT: lui a1, 815104 ; RV32IZDINXZHINX-NEXT: lui a2, 290816 +; RV32IZDINXZHINX-NEXT: fmax.s a1, a0, a1 +; RV32IZDINXZHINX-NEXT: feq.s a0, a0, a0 ; RV32IZDINXZHINX-NEXT: addi a2, a2, -512 -; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a2 -; RV32IZDINXZHINX-NEXT: fcvt.w.s a0, a0, rtz -; RV32IZDINXZHINX-NEXT: and a0, a1, a0 +; RV32IZDINXZHINX-NEXT: neg a0, a0 +; RV32IZDINXZHINX-NEXT: fmin.s a1, a1, a2 +; RV32IZDINXZHINX-NEXT: fcvt.w.s a1, a1, rtz +; RV32IZDINXZHINX-NEXT: and a0, 
a0, a1 ; RV32IZDINXZHINX-NEXT: ret ; ; RV64IZDINXZHINX-LABEL: fcvt_si_h_sat: ; RV64IZDINXZHINX: # %bb.0: # %start ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 -; RV64IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZDINXZHINX-NEXT: neg a1, a1 -; RV64IZDINXZHINX-NEXT: lui a2, 815104 -; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZDINXZHINX-NEXT: lui a1, 815104 ; RV64IZDINXZHINX-NEXT: lui a2, 290816 +; RV64IZDINXZHINX-NEXT: fmax.s a1, a0, a1 +; RV64IZDINXZHINX-NEXT: feq.s a0, a0, a0 ; RV64IZDINXZHINX-NEXT: addiw a2, a2, -512 -; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a2 -; RV64IZDINXZHINX-NEXT: fcvt.l.s a0, a0, rtz -; RV64IZDINXZHINX-NEXT: and a0, a1, a0 +; RV64IZDINXZHINX-NEXT: neg a0, a0 +; RV64IZDINXZHINX-NEXT: fmin.s a1, a1, a2 +; RV64IZDINXZHINX-NEXT: fcvt.l.s a1, a1, rtz +; RV64IZDINXZHINX-NEXT: and a0, a0, a1 ; RV64IZDINXZHINX-NEXT: ret ; ; RV32I-LABEL: fcvt_si_h_sat: @@ -399,13 +399,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 ; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 -; RV32ID-ILP32-NEXT: feq.s a0, fa5, fa5 -; RV32ID-ILP32-NEXT: neg a0, a0 -; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI1_0) -; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV32ID-ILP32-NEXT: lui a1, 815104 -; RV32ID-ILP32-NEXT: fmv.w.x fa3, a1 +; RV32ID-ILP32-NEXT: lui a0, %hi(.LCPI1_0) +; RV32ID-ILP32-NEXT: feq.s a1, fa5, fa5 +; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI1_0)(a0) +; RV32ID-ILP32-NEXT: lui a0, 815104 +; RV32ID-ILP32-NEXT: fmv.w.x fa3, a0 ; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa3 +; RV32ID-ILP32-NEXT: neg a0, a1 ; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-ILP32-NEXT: and a0, a0, a1 @@ -419,13 +419,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: call __extendhfsf2 ; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 -; RV64ID-LP64-NEXT: feq.s a0, fa5, fa5 -; RV64ID-LP64-NEXT: neg a0, a0 -; RV64ID-LP64-NEXT: lui a1, %hi(.LCPI1_0) -; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV64ID-LP64-NEXT: lui a1, 815104 -; RV64ID-LP64-NEXT: fmv.w.x fa3, a1 +; RV64ID-LP64-NEXT: lui a0, %hi(.LCPI1_0) +; RV64ID-LP64-NEXT: feq.s a1, fa5, fa5 +; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI1_0)(a0) +; RV64ID-LP64-NEXT: lui a0, 815104 +; RV64ID-LP64-NEXT: fmv.w.x fa3, a0 ; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa3 +; RV64ID-LP64-NEXT: neg a0, a1 ; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-LP64-NEXT: and a0, a0, a1 @@ -439,12 +439,12 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 ; RV32ID-NEXT: feq.s a0, fa0, fa0 -; RV32ID-NEXT: neg a0, a0 ; RV32ID-NEXT: lui a1, %hi(.LCPI1_0) ; RV32ID-NEXT: flw fa5, %lo(.LCPI1_0)(a1) ; RV32ID-NEXT: lui a1, 815104 ; RV32ID-NEXT: fmv.w.x fa4, a1 ; RV32ID-NEXT: fmax.s fa4, fa0, fa4 +; RV32ID-NEXT: neg a0, a0 ; RV32ID-NEXT: fmin.s fa5, fa4, fa5 ; RV32ID-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-NEXT: and a0, a0, a1 @@ -458,12 +458,12 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 ; RV64ID-NEXT: feq.s a0, fa0, fa0 -; RV64ID-NEXT: neg a0, a0 ; RV64ID-NEXT: lui a1, %hi(.LCPI1_0) ; RV64ID-NEXT: flw fa5, %lo(.LCPI1_0)(a1) ; RV64ID-NEXT: lui a1, 815104 ; RV64ID-NEXT: fmv.w.x fa4, a1 ; RV64ID-NEXT: fmax.s fa4, fa0, fa4 +; RV64ID-NEXT: neg a0, a0 ; RV64ID-NEXT: fmin.s fa5, fa4, fa5 ; RV64ID-NEXT: fcvt.l.s a1, 
fa5, rtz ; RV64ID-NEXT: and a0, a0, a1 @@ -474,13 +474,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK32-IZFHMIN-LABEL: fcvt_si_h_sat: ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK32-IZFHMIN-NEXT: feq.s a0, fa5, fa5 -; CHECK32-IZFHMIN-NEXT: neg a0, a0 -; CHECK32-IZFHMIN-NEXT: lui a1, %hi(.LCPI1_0) -; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; CHECK32-IZFHMIN-NEXT: lui a1, 815104 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, a1 +; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI1_0) +; CHECK32-IZFHMIN-NEXT: feq.s a1, fa5, fa5 +; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) +; CHECK32-IZFHMIN-NEXT: lui a0, 815104 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, a0 ; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 +; CHECK32-IZFHMIN-NEXT: neg a0, a1 ; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz ; CHECK32-IZFHMIN-NEXT: and a0, a0, a1 @@ -489,13 +489,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK64-IZFHMIN-LABEL: fcvt_si_h_sat: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: neg a0, a0 -; CHECK64-IZFHMIN-NEXT: lui a1, %hi(.LCPI1_0) -; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; CHECK64-IZFHMIN-NEXT: lui a1, 815104 -; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, a1 +; CHECK64-IZFHMIN-NEXT: lui a0, %hi(.LCPI1_0) +; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) +; CHECK64-IZFHMIN-NEXT: lui a0, 815104 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, a0 ; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 +; CHECK64-IZFHMIN-NEXT: neg a0, a1 ; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz ; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 @@ -504,57 +504,57 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK32-IZHINXMIN-LABEL: fcvt_si_h_sat: ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZHINXMIN-NEXT: lui a2, 815104 -; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZHINXMIN-NEXT: lui a1, 815104 ; CHECK32-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK32-IZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECK32-IZHINXMIN-NEXT: addi a2, a2, -512 -; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz -; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 +; CHECK32-IZHINXMIN-NEXT: neg a0, a0 +; CHECK32-IZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a1, a1, rtz +; CHECK32-IZHINXMIN-NEXT: and a0, a0, a1 ; CHECK32-IZHINXMIN-NEXT: ret ; ; CHECK64-IZHINXMIN-LABEL: fcvt_si_h_sat: ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZHINXMIN-NEXT: lui a2, 815104 -; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZHINXMIN-NEXT: lui a1, 815104 ; CHECK64-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK64-IZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECK64-IZHINXMIN-NEXT: addiw a2, a2, -512 -; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz -; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 +; CHECK64-IZHINXMIN-NEXT: neg a0, a0 +; CHECK64-IZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a1, a1, rtz +; CHECK64-IZHINXMIN-NEXT: and a0, a0, a1 ; 
CHECK64-IZHINXMIN-NEXT: ret ; ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_si_h_sat: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 815104 -; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 815104 ; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: addi a2, a2, -512 -; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz -; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: neg a0, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a1, a1, rtz +; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: ret ; ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_si_h_sat: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 815104 -; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, 815104 ; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: addiw a2, a2, -512 -; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz -; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: neg a0, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a1, a1, rtz +; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a0, a1 ; CHECK64-IZDINXZHINXMIN-NEXT: ret start: %0 = tail call i16 @llvm.fptosi.sat.i16.f16(half %a) @@ -756,8 +756,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV32IZHINX-LABEL: fcvt_ui_h_sat: ; RV32IZHINX: # %bb.0: # %start ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 -; RV32IZHINX-NEXT: fmax.s a0, a0, zero ; RV32IZHINX-NEXT: lui a1, 292864 +; RV32IZHINX-NEXT: fmax.s a0, a0, zero ; RV32IZHINX-NEXT: addi a1, a1, -256 ; RV32IZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZHINX-NEXT: fcvt.wu.s a0, a0, rtz @@ -766,8 +766,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV64IZHINX-LABEL: fcvt_ui_h_sat: ; RV64IZHINX: # %bb.0: # %start ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 -; RV64IZHINX-NEXT: fmax.s a0, a0, zero ; RV64IZHINX-NEXT: lui a1, 292864 +; RV64IZHINX-NEXT: fmax.s a0, a0, zero ; RV64IZHINX-NEXT: addiw a1, a1, -256 ; RV64IZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZHINX-NEXT: fcvt.lu.s a0, a0, rtz @@ -776,8 +776,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV32IZDINXZHINX-LABEL: fcvt_ui_h_sat: ; RV32IZDINXZHINX: # %bb.0: # %start ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 -; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, zero ; RV32IZDINXZHINX-NEXT: lui a1, 292864 +; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, zero ; RV32IZDINXZHINX-NEXT: addi a1, a1, -256 ; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZDINXZHINX-NEXT: fcvt.wu.s a0, a0, rtz @@ -786,8 +786,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV64IZDINXZHINX-LABEL: fcvt_ui_h_sat: ; RV64IZDINXZHINX: # %bb.0: # %start ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 -; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, zero ; RV64IZDINXZHINX-NEXT: lui a1, 292864 +; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, zero ; 
RV64IZDINXZHINX-NEXT: addiw a1, a1, -256 ; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZDINXZHINX-NEXT: fcvt.lu.s a0, a0, rtz @@ -956,8 +956,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; CHECK32-IZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK32-IZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK32-IZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz @@ -966,8 +966,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; CHECK64-IZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK64-IZHINXMIN-NEXT: lui a1, 292864 +; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK64-IZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz @@ -976,8 +976,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK32-IZDINXZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz @@ -986,8 +986,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_ui_h_sat: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK64-IZDINXZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz @@ -2153,7 +2153,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZFH-NEXT: fle.s s0, fa5, fs0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixsfdi -; RV32IZFH-NEXT: lui a4, 524288 +; RV32IZFH-NEXT: lui a3, 524288 ; RV32IZFH-NEXT: lui a2, 524288 ; RV32IZFH-NEXT: beqz s0, .LBB10_2 ; RV32IZFH-NEXT: # %bb.1: # %start @@ -2161,19 +2161,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZFH-NEXT: .LBB10_2: # %start ; RV32IZFH-NEXT: lui a1, %hi(.LCPI10_0) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32IZFH-NEXT: flt.s a3, fa5, fs0 -; RV32IZFH-NEXT: beqz a3, .LBB10_4 +; RV32IZFH-NEXT: flt.s a1, fa5, fs0 +; RV32IZFH-NEXT: beqz a1, .LBB10_4 ; RV32IZFH-NEXT: # %bb.3: -; RV32IZFH-NEXT: addi a2, a4, -1 +; RV32IZFH-NEXT: addi a2, a3, -1 ; RV32IZFH-NEXT: .LBB10_4: # %start -; RV32IZFH-NEXT: feq.s a1, fs0, fs0 +; RV32IZFH-NEXT: feq.s a3, fs0, fs0 ; RV32IZFH-NEXT: neg a4, a1 -; RV32IZFH-NEXT: and a1, a4, a2 -; RV32IZFH-NEXT: neg a2, a3 -; RV32IZFH-NEXT: neg a3, s0 +; RV32IZFH-NEXT: neg a1, s0 +; RV32IZFH-NEXT: neg a3, a3 +; RV32IZFH-NEXT: and a0, a1, a0 +; RV32IZFH-NEXT: and a1, a3, a2 +; RV32IZFH-NEXT: or a0, a4, a0 ; RV32IZFH-NEXT: and a0, a3, a0 -; RV32IZFH-NEXT: or a0, a2, a0 -; RV32IZFH-NEXT: and a0, a4, a0 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -2201,7 +2201,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IDZFH-NEXT: 
fle.s s0, fa5, fs0 ; RV32IDZFH-NEXT: fmv.s fa0, fs0 ; RV32IDZFH-NEXT: call __fixsfdi -; RV32IDZFH-NEXT: lui a4, 524288 +; RV32IDZFH-NEXT: lui a3, 524288 ; RV32IDZFH-NEXT: lui a2, 524288 ; RV32IDZFH-NEXT: beqz s0, .LBB10_2 ; RV32IDZFH-NEXT: # %bb.1: # %start @@ -2209,19 +2209,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IDZFH-NEXT: .LBB10_2: # %start ; RV32IDZFH-NEXT: lui a1, %hi(.LCPI10_0) ; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32IDZFH-NEXT: flt.s a3, fa5, fs0 -; RV32IDZFH-NEXT: beqz a3, .LBB10_4 +; RV32IDZFH-NEXT: flt.s a1, fa5, fs0 +; RV32IDZFH-NEXT: beqz a1, .LBB10_4 ; RV32IDZFH-NEXT: # %bb.3: -; RV32IDZFH-NEXT: addi a2, a4, -1 +; RV32IDZFH-NEXT: addi a2, a3, -1 ; RV32IDZFH-NEXT: .LBB10_4: # %start -; RV32IDZFH-NEXT: feq.s a1, fs0, fs0 +; RV32IDZFH-NEXT: feq.s a3, fs0, fs0 ; RV32IDZFH-NEXT: neg a4, a1 -; RV32IDZFH-NEXT: and a1, a4, a2 -; RV32IDZFH-NEXT: neg a2, a3 -; RV32IDZFH-NEXT: neg a3, s0 +; RV32IDZFH-NEXT: neg a1, s0 +; RV32IDZFH-NEXT: neg a3, a3 +; RV32IDZFH-NEXT: and a0, a1, a0 +; RV32IDZFH-NEXT: and a1, a3, a2 +; RV32IDZFH-NEXT: or a0, a4, a0 ; RV32IDZFH-NEXT: and a0, a3, a0 -; RV32IDZFH-NEXT: or a0, a2, a0 -; RV32IDZFH-NEXT: and a0, a4, a0 ; RV32IDZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IDZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IDZFH-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -2248,7 +2248,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZHINX-NEXT: fle.s s1, a0, s0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a4, 524288 +; RV32IZHINX-NEXT: lui a3, 524288 ; RV32IZHINX-NEXT: lui a2, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB10_2 ; RV32IZHINX-NEXT: # %bb.1: # %start @@ -2256,19 +2256,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZHINX-NEXT: .LBB10_2: # %start ; RV32IZHINX-NEXT: lui a1, 389120 ; RV32IZHINX-NEXT: addi a1, a1, -1 -; RV32IZHINX-NEXT: flt.s a3, a1, s0 -; RV32IZHINX-NEXT: beqz a3, .LBB10_4 +; RV32IZHINX-NEXT: flt.s a1, a1, s0 +; RV32IZHINX-NEXT: beqz a1, .LBB10_4 ; RV32IZHINX-NEXT: # %bb.3: -; RV32IZHINX-NEXT: addi a2, a4, -1 +; RV32IZHINX-NEXT: addi a2, a3, -1 ; RV32IZHINX-NEXT: .LBB10_4: # %start -; RV32IZHINX-NEXT: feq.s a1, s0, s0 +; RV32IZHINX-NEXT: feq.s a3, s0, s0 ; RV32IZHINX-NEXT: neg a4, a1 -; RV32IZHINX-NEXT: and a1, a4, a2 -; RV32IZHINX-NEXT: neg a2, a3 -; RV32IZHINX-NEXT: neg a3, s1 +; RV32IZHINX-NEXT: neg a1, s1 +; RV32IZHINX-NEXT: neg a3, a3 +; RV32IZHINX-NEXT: and a0, a1, a0 +; RV32IZHINX-NEXT: and a1, a3, a2 +; RV32IZHINX-NEXT: or a0, a4, a0 ; RV32IZHINX-NEXT: and a0, a3, a0 -; RV32IZHINX-NEXT: or a0, a2, a0 -; RV32IZHINX-NEXT: and a0, a4, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -2295,7 +2295,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZDINXZHINX-NEXT: fle.s s1, a0, s0 ; RV32IZDINXZHINX-NEXT: mv a0, s0 ; RV32IZDINXZHINX-NEXT: call __fixsfdi -; RV32IZDINXZHINX-NEXT: lui a4, 524288 +; RV32IZDINXZHINX-NEXT: lui a3, 524288 ; RV32IZDINXZHINX-NEXT: lui a2, 524288 ; RV32IZDINXZHINX-NEXT: beqz s1, .LBB10_2 ; RV32IZDINXZHINX-NEXT: # %bb.1: # %start @@ -2303,19 +2303,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZDINXZHINX-NEXT: .LBB10_2: # %start ; RV32IZDINXZHINX-NEXT: lui a1, 389120 ; RV32IZDINXZHINX-NEXT: addi a1, a1, -1 -; RV32IZDINXZHINX-NEXT: flt.s a3, a1, s0 -; RV32IZDINXZHINX-NEXT: beqz a3, .LBB10_4 +; RV32IZDINXZHINX-NEXT: flt.s a1, a1, s0 +; RV32IZDINXZHINX-NEXT: beqz a1, .LBB10_4 ; 
RV32IZDINXZHINX-NEXT: # %bb.3: -; RV32IZDINXZHINX-NEXT: addi a2, a4, -1 +; RV32IZDINXZHINX-NEXT: addi a2, a3, -1 ; RV32IZDINXZHINX-NEXT: .LBB10_4: # %start -; RV32IZDINXZHINX-NEXT: feq.s a1, s0, s0 +; RV32IZDINXZHINX-NEXT: feq.s a3, s0, s0 ; RV32IZDINXZHINX-NEXT: neg a4, a1 -; RV32IZDINXZHINX-NEXT: and a1, a4, a2 -; RV32IZDINXZHINX-NEXT: neg a2, a3 -; RV32IZDINXZHINX-NEXT: neg a3, s1 +; RV32IZDINXZHINX-NEXT: neg a1, s1 +; RV32IZDINXZHINX-NEXT: neg a3, a3 +; RV32IZDINXZHINX-NEXT: and a0, a1, a0 +; RV32IZDINXZHINX-NEXT: and a1, a3, a2 +; RV32IZDINXZHINX-NEXT: or a0, a4, a0 ; RV32IZDINXZHINX-NEXT: and a0, a3, a0 -; RV32IZDINXZHINX-NEXT: or a0, a2, a0 -; RV32IZDINXZHINX-NEXT: and a0, a4, a0 ; RV32IZDINXZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZDINXZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZDINXZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -2342,13 +2342,13 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s2, a0 ; RV32I-NEXT: lui a1, 913408 ; RV32I-NEXT: call __gesf2 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __fixsfdi -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: mv s3, a1 ; RV32I-NEXT: lui s5, 524288 ; RV32I-NEXT: bgez s0, .LBB10_2 @@ -2357,25 +2357,25 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32I-NEXT: .LBB10_2: # %start ; RV32I-NEXT: lui a1, 389120 ; RV32I-NEXT: addi a1, a1, -1 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __gtsf2 ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: blez a0, .LBB10_4 ; RV32I-NEXT: # %bb.3: # %start ; RV32I-NEXT: addi s3, s5, -1 ; RV32I-NEXT: .LBB10_4: # %start -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __unordsf2 ; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: sgtz a1, s4 +; RV32I-NEXT: slti a2, s0, 0 ; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: neg a3, a1 +; RV32I-NEXT: addi a2, a2, -1 ; RV32I-NEXT: and a1, a0, s3 -; RV32I-NEXT: sgtz a2, s4 -; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: slti a3, s0, 0 -; RV32I-NEXT: addi a3, a3, -1 -; RV32I-NEXT: and a3, a3, s2 -; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: and a2, a2, s1 +; RV32I-NEXT: or a2, a3, a2 ; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -2444,7 +2444,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: fsw fa4, 4(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: fle.s s0, fa5, fa4 ; RV32ID-ILP32-NEXT: call __fixsfdi -; RV32ID-ILP32-NEXT: lui a4, 524288 +; RV32ID-ILP32-NEXT: lui a3, 524288 ; RV32ID-ILP32-NEXT: lui a2, 524288 ; RV32ID-ILP32-NEXT: beqz s0, .LBB10_2 ; RV32ID-ILP32-NEXT: # %bb.1: # %start @@ -2453,20 +2453,20 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI10_0) ; RV32ID-ILP32-NEXT: flw fa5, %lo(.LCPI10_0)(a1) ; RV32ID-ILP32-NEXT: flw fa4, 4(sp) # 4-byte Folded Reload -; RV32ID-ILP32-NEXT: flt.s a3, fa5, fa4 +; RV32ID-ILP32-NEXT: flt.s a1, fa5, fa4 ; RV32ID-ILP32-NEXT: fmv.s fa5, fa4 -; RV32ID-ILP32-NEXT: beqz a3, .LBB10_4 +; RV32ID-ILP32-NEXT: beqz a1, .LBB10_4 ; RV32ID-ILP32-NEXT: # %bb.3: -; RV32ID-ILP32-NEXT: addi a2, a4, -1 +; RV32ID-ILP32-NEXT: addi a2, a3, -1 ; RV32ID-ILP32-NEXT: .LBB10_4: # %start -; RV32ID-ILP32-NEXT: feq.s a1, fa5, fa5 +; RV32ID-ILP32-NEXT: feq.s a3, fa5, fa5 ; 
RV32ID-ILP32-NEXT: neg a4, a1 -; RV32ID-ILP32-NEXT: and a1, a4, a2 -; RV32ID-ILP32-NEXT: neg a2, a3 -; RV32ID-ILP32-NEXT: neg a3, s0 +; RV32ID-ILP32-NEXT: neg a1, s0 +; RV32ID-ILP32-NEXT: neg a3, a3 +; RV32ID-ILP32-NEXT: and a0, a1, a0 +; RV32ID-ILP32-NEXT: and a1, a3, a2 +; RV32ID-ILP32-NEXT: or a0, a4, a0 ; RV32ID-ILP32-NEXT: and a0, a3, a0 -; RV32ID-ILP32-NEXT: or a0, a2, a0 -; RV32ID-ILP32-NEXT: and a0, a4, a0 ; RV32ID-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-ILP32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32ID-ILP32-NEXT: addi sp, sp, 16 @@ -2499,7 +2499,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32ID-NEXT: fmv.w.x fa5, a0 ; RV32ID-NEXT: fle.s s0, fa5, fa0 ; RV32ID-NEXT: call __fixsfdi -; RV32ID-NEXT: lui a4, 524288 +; RV32ID-NEXT: lui a3, 524288 ; RV32ID-NEXT: lui a2, 524288 ; RV32ID-NEXT: beqz s0, .LBB10_2 ; RV32ID-NEXT: # %bb.1: # %start @@ -2507,19 +2507,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32ID-NEXT: .LBB10_2: # %start ; RV32ID-NEXT: lui a1, %hi(.LCPI10_0) ; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32ID-NEXT: flt.s a3, fa5, fs0 -; RV32ID-NEXT: beqz a3, .LBB10_4 +; RV32ID-NEXT: flt.s a1, fa5, fs0 +; RV32ID-NEXT: beqz a1, .LBB10_4 ; RV32ID-NEXT: # %bb.3: -; RV32ID-NEXT: addi a2, a4, -1 +; RV32ID-NEXT: addi a2, a3, -1 ; RV32ID-NEXT: .LBB10_4: # %start -; RV32ID-NEXT: feq.s a1, fs0, fs0 -; RV32ID-NEXT: neg a4, a1 -; RV32ID-NEXT: and a1, a4, a2 -; RV32ID-NEXT: neg a2, s0 -; RV32ID-NEXT: and a0, a2, a0 -; RV32ID-NEXT: neg a2, a3 -; RV32ID-NEXT: or a0, a2, a0 +; RV32ID-NEXT: feq.s a3, fs0, fs0 +; RV32ID-NEXT: neg a4, s0 +; RV32ID-NEXT: neg a5, a1 +; RV32ID-NEXT: neg a3, a3 ; RV32ID-NEXT: and a0, a4, a0 +; RV32ID-NEXT: and a1, a3, a2 +; RV32ID-NEXT: or a0, a5, a0 +; RV32ID-NEXT: and a0, a3, a0 ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32ID-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -2552,7 +2552,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IFZFHMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IFZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IFZFHMIN-NEXT: call __fixsfdi -; RV32IFZFHMIN-NEXT: lui a4, 524288 +; RV32IFZFHMIN-NEXT: lui a3, 524288 ; RV32IFZFHMIN-NEXT: lui a2, 524288 ; RV32IFZFHMIN-NEXT: beqz s0, .LBB10_2 ; RV32IFZFHMIN-NEXT: # %bb.1: # %start @@ -2560,19 +2560,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IFZFHMIN-NEXT: .LBB10_2: # %start ; RV32IFZFHMIN-NEXT: lui a1, %hi(.LCPI10_0) ; RV32IFZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32IFZFHMIN-NEXT: flt.s a3, fa5, fs0 -; RV32IFZFHMIN-NEXT: beqz a3, .LBB10_4 +; RV32IFZFHMIN-NEXT: flt.s a1, fa5, fs0 +; RV32IFZFHMIN-NEXT: beqz a1, .LBB10_4 ; RV32IFZFHMIN-NEXT: # %bb.3: -; RV32IFZFHMIN-NEXT: addi a2, a4, -1 +; RV32IFZFHMIN-NEXT: addi a2, a3, -1 ; RV32IFZFHMIN-NEXT: .LBB10_4: # %start -; RV32IFZFHMIN-NEXT: feq.s a1, fs0, fs0 +; RV32IFZFHMIN-NEXT: feq.s a3, fs0, fs0 ; RV32IFZFHMIN-NEXT: neg a4, a1 -; RV32IFZFHMIN-NEXT: and a1, a4, a2 -; RV32IFZFHMIN-NEXT: neg a2, a3 -; RV32IFZFHMIN-NEXT: neg a3, s0 +; RV32IFZFHMIN-NEXT: neg a1, s0 +; RV32IFZFHMIN-NEXT: neg a3, a3 +; RV32IFZFHMIN-NEXT: and a0, a1, a0 +; RV32IFZFHMIN-NEXT: and a1, a3, a2 +; RV32IFZFHMIN-NEXT: or a0, a4, a0 ; RV32IFZFHMIN-NEXT: and a0, a3, a0 -; RV32IFZFHMIN-NEXT: or a0, a2, a0 -; RV32IFZFHMIN-NEXT: and a0, a4, a0 ; RV32IFZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IFZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -2601,7 +2601,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind 
{ ; RV32IDZFHMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IDZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IDZFHMIN-NEXT: call __fixsfdi -; RV32IDZFHMIN-NEXT: lui a4, 524288 +; RV32IDZFHMIN-NEXT: lui a3, 524288 ; RV32IDZFHMIN-NEXT: lui a2, 524288 ; RV32IDZFHMIN-NEXT: beqz s0, .LBB10_2 ; RV32IDZFHMIN-NEXT: # %bb.1: # %start @@ -2609,19 +2609,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IDZFHMIN-NEXT: .LBB10_2: # %start ; RV32IDZFHMIN-NEXT: lui a1, %hi(.LCPI10_0) ; RV32IDZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) -; RV32IDZFHMIN-NEXT: flt.s a3, fa5, fs0 -; RV32IDZFHMIN-NEXT: beqz a3, .LBB10_4 +; RV32IDZFHMIN-NEXT: flt.s a1, fa5, fs0 +; RV32IDZFHMIN-NEXT: beqz a1, .LBB10_4 ; RV32IDZFHMIN-NEXT: # %bb.3: -; RV32IDZFHMIN-NEXT: addi a2, a4, -1 +; RV32IDZFHMIN-NEXT: addi a2, a3, -1 ; RV32IDZFHMIN-NEXT: .LBB10_4: # %start -; RV32IDZFHMIN-NEXT: feq.s a1, fs0, fs0 +; RV32IDZFHMIN-NEXT: feq.s a3, fs0, fs0 ; RV32IDZFHMIN-NEXT: neg a4, a1 -; RV32IDZFHMIN-NEXT: and a1, a4, a2 -; RV32IDZFHMIN-NEXT: neg a2, a3 -; RV32IDZFHMIN-NEXT: neg a3, s0 +; RV32IDZFHMIN-NEXT: neg a1, s0 +; RV32IDZFHMIN-NEXT: neg a3, a3 +; RV32IDZFHMIN-NEXT: and a0, a1, a0 +; RV32IDZFHMIN-NEXT: and a1, a3, a2 +; RV32IDZFHMIN-NEXT: or a0, a4, a0 ; RV32IDZFHMIN-NEXT: and a0, a3, a0 -; RV32IDZFHMIN-NEXT: or a0, a2, a0 -; RV32IDZFHMIN-NEXT: and a0, a4, a0 ; RV32IDZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IDZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IDZFHMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload @@ -2639,7 +2639,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; CHECK32-IZHINXMIN-NEXT: fle.s s1, a0, s0 ; CHECK32-IZHINXMIN-NEXT: mv a0, s0 ; CHECK32-IZHINXMIN-NEXT: call __fixsfdi -; CHECK32-IZHINXMIN-NEXT: lui a4, 524288 +; CHECK32-IZHINXMIN-NEXT: lui a3, 524288 ; CHECK32-IZHINXMIN-NEXT: lui a2, 524288 ; CHECK32-IZHINXMIN-NEXT: beqz s1, .LBB10_2 ; CHECK32-IZHINXMIN-NEXT: # %bb.1: # %start @@ -2647,19 +2647,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; CHECK32-IZHINXMIN-NEXT: .LBB10_2: # %start ; CHECK32-IZHINXMIN-NEXT: lui a1, 389120 ; CHECK32-IZHINXMIN-NEXT: addi a1, a1, -1 -; CHECK32-IZHINXMIN-NEXT: flt.s a3, a1, s0 -; CHECK32-IZHINXMIN-NEXT: beqz a3, .LBB10_4 +; CHECK32-IZHINXMIN-NEXT: flt.s a1, a1, s0 +; CHECK32-IZHINXMIN-NEXT: beqz a1, .LBB10_4 ; CHECK32-IZHINXMIN-NEXT: # %bb.3: -; CHECK32-IZHINXMIN-NEXT: addi a2, a4, -1 +; CHECK32-IZHINXMIN-NEXT: addi a2, a3, -1 ; CHECK32-IZHINXMIN-NEXT: .LBB10_4: # %start -; CHECK32-IZHINXMIN-NEXT: feq.s a1, s0, s0 +; CHECK32-IZHINXMIN-NEXT: feq.s a3, s0, s0 ; CHECK32-IZHINXMIN-NEXT: neg a4, a1 -; CHECK32-IZHINXMIN-NEXT: and a1, a4, a2 -; CHECK32-IZHINXMIN-NEXT: neg a2, a3 -; CHECK32-IZHINXMIN-NEXT: neg a3, s1 +; CHECK32-IZHINXMIN-NEXT: neg a1, s1 +; CHECK32-IZHINXMIN-NEXT: neg a3, a3 +; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 +; CHECK32-IZHINXMIN-NEXT: and a1, a3, a2 +; CHECK32-IZHINXMIN-NEXT: or a0, a4, a0 ; CHECK32-IZHINXMIN-NEXT: and a0, a3, a0 -; CHECK32-IZHINXMIN-NEXT: or a0, a2, a0 -; CHECK32-IZHINXMIN-NEXT: and a0, a4, a0 ; CHECK32-IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; CHECK32-IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; CHECK32-IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -2687,7 +2687,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN-NEXT: fle.s s1, a0, s0 ; CHECK32-IZDINXZHINXMIN-NEXT: mv a0, s0 ; CHECK32-IZDINXZHINXMIN-NEXT: call __fixsfdi -; CHECK32-IZDINXZHINXMIN-NEXT: lui a4, 524288 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a3, 524288 ; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 524288 ; 
CHECK32-IZDINXZHINXMIN-NEXT: beqz s1, .LBB10_2 ; CHECK32-IZDINXZHINXMIN-NEXT: # %bb.1: # %start @@ -2695,19 +2695,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN-NEXT: .LBB10_2: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 389120 ; CHECK32-IZDINXZHINXMIN-NEXT: addi a1, a1, -1 -; CHECK32-IZDINXZHINXMIN-NEXT: flt.s a3, a1, s0 -; CHECK32-IZDINXZHINXMIN-NEXT: beqz a3, .LBB10_4 +; CHECK32-IZDINXZHINXMIN-NEXT: flt.s a1, a1, s0 +; CHECK32-IZDINXZHINXMIN-NEXT: beqz a1, .LBB10_4 ; CHECK32-IZDINXZHINXMIN-NEXT: # %bb.3: -; CHECK32-IZDINXZHINXMIN-NEXT: addi a2, a4, -1 +; CHECK32-IZDINXZHINXMIN-NEXT: addi a2, a3, -1 ; CHECK32-IZDINXZHINXMIN-NEXT: .LBB10_4: # %start -; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, s0, s0 +; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a3, s0, s0 ; CHECK32-IZDINXZHINXMIN-NEXT: neg a4, a1 -; CHECK32-IZDINXZHINXMIN-NEXT: and a1, a4, a2 -; CHECK32-IZDINXZHINXMIN-NEXT: neg a2, a3 -; CHECK32-IZDINXZHINXMIN-NEXT: neg a3, s1 +; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, s1 +; CHECK32-IZDINXZHINXMIN-NEXT: neg a3, a3 +; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: and a1, a3, a2 +; CHECK32-IZDINXZHINXMIN-NEXT: or a0, a4, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a3, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: or a0, a2, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a4, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; CHECK32-IZDINXZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; CHECK32-IZDINXZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -2907,15 +2907,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IZFH-NEXT: lui a0, %hi(.LCPI12_0) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI12_0)(a0) ; RV32IZFH-NEXT: fcvt.s.h fa0, fa0 -; RV32IZFH-NEXT: flt.s a0, fa5, fa0 -; RV32IZFH-NEXT: neg s0, a0 -; RV32IZFH-NEXT: fmv.w.x fa5, zero -; RV32IZFH-NEXT: fle.s a0, fa5, fa0 +; RV32IZFH-NEXT: fmv.w.x fa4, zero +; RV32IZFH-NEXT: fle.s a0, fa4, fa0 +; RV32IZFH-NEXT: flt.s a1, fa5, fa0 +; RV32IZFH-NEXT: neg s0, a1 ; RV32IZFH-NEXT: neg s1, a0 ; RV32IZFH-NEXT: call __fixunssfdi ; RV32IZFH-NEXT: and a0, s1, a0 -; RV32IZFH-NEXT: or a0, s0, a0 ; RV32IZFH-NEXT: and a1, s1, a1 +; RV32IZFH-NEXT: or a0, s0, a0 ; RV32IZFH-NEXT: or a1, s0, a1 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2941,15 +2941,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IDZFH-NEXT: lui a0, %hi(.LCPI12_0) ; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI12_0)(a0) ; RV32IDZFH-NEXT: fcvt.s.h fa0, fa0 -; RV32IDZFH-NEXT: flt.s a0, fa5, fa0 -; RV32IDZFH-NEXT: neg s0, a0 -; RV32IDZFH-NEXT: fmv.w.x fa5, zero -; RV32IDZFH-NEXT: fle.s a0, fa5, fa0 +; RV32IDZFH-NEXT: fmv.w.x fa4, zero +; RV32IDZFH-NEXT: fle.s a0, fa4, fa0 +; RV32IDZFH-NEXT: flt.s a1, fa5, fa0 +; RV32IDZFH-NEXT: neg s0, a1 ; RV32IDZFH-NEXT: neg s1, a0 ; RV32IDZFH-NEXT: call __fixunssfdi ; RV32IDZFH-NEXT: and a0, s1, a0 -; RV32IDZFH-NEXT: or a0, s0, a0 ; RV32IDZFH-NEXT: and a1, s1, a1 +; RV32IDZFH-NEXT: or a0, s0, a0 ; RV32IDZFH-NEXT: or a1, s0, a1 ; RV32IDZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IDZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2975,14 +2975,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZHINX-NEXT: lui a1, 391168 ; RV32IZHINX-NEXT: addi a1, a1, -1 +; RV32IZHINX-NEXT: fle.s a2, zero, a0 ; RV32IZHINX-NEXT: flt.s a1, a1, a0 ; RV32IZHINX-NEXT: neg s0, a1 -; RV32IZHINX-NEXT: fle.s a1, zero, a0 -; RV32IZHINX-NEXT: neg s1, a1 +; RV32IZHINX-NEXT: neg s1, a2 ; RV32IZHINX-NEXT: call 
__fixunssfdi ; RV32IZHINX-NEXT: and a0, s1, a0 -; RV32IZHINX-NEXT: or a0, s0, a0 ; RV32IZHINX-NEXT: and a1, s1, a1 +; RV32IZHINX-NEXT: or a0, s0, a0 ; RV32IZHINX-NEXT: or a1, s0, a1 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -3008,14 +3008,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZDINXZHINX-NEXT: lui a1, 391168 ; RV32IZDINXZHINX-NEXT: addi a1, a1, -1 +; RV32IZDINXZHINX-NEXT: fle.s a2, zero, a0 ; RV32IZDINXZHINX-NEXT: flt.s a1, a1, a0 ; RV32IZDINXZHINX-NEXT: neg s0, a1 -; RV32IZDINXZHINX-NEXT: fle.s a1, zero, a0 -; RV32IZDINXZHINX-NEXT: neg s1, a1 +; RV32IZDINXZHINX-NEXT: neg s1, a2 ; RV32IZDINXZHINX-NEXT: call __fixunssfdi ; RV32IZDINXZHINX-NEXT: and a0, s1, a0 -; RV32IZDINXZHINX-NEXT: or a0, s0, a0 ; RV32IZDINXZHINX-NEXT: and a1, s1, a1 +; RV32IZDINXZHINX-NEXT: or a0, s0, a0 ; RV32IZDINXZHINX-NEXT: or a1, s0, a1 ; RV32IZDINXZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZDINXZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -3054,8 +3054,8 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __fixunssfdi ; RV32I-NEXT: and a0, s2, a0 -; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: and a1, s2, a1 +; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: or a1, s1, a1 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -3106,15 +3106,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI12_0) ; RV32ID-ILP32-NEXT: flw fa5, %lo(.LCPI12_0)(a1) ; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 -; RV32ID-ILP32-NEXT: flt.s a1, fa5, fa4 -; RV32ID-ILP32-NEXT: neg s0, a1 -; RV32ID-ILP32-NEXT: fmv.w.x fa5, zero -; RV32ID-ILP32-NEXT: fle.s a1, fa5, fa4 +; RV32ID-ILP32-NEXT: fmv.w.x fa3, zero +; RV32ID-ILP32-NEXT: fle.s a1, fa3, fa4 +; RV32ID-ILP32-NEXT: flt.s a2, fa5, fa4 +; RV32ID-ILP32-NEXT: neg s0, a2 ; RV32ID-ILP32-NEXT: neg s1, a1 ; RV32ID-ILP32-NEXT: call __fixunssfdi ; RV32ID-ILP32-NEXT: and a0, s1, a0 -; RV32ID-ILP32-NEXT: or a0, s0, a0 ; RV32ID-ILP32-NEXT: and a1, s1, a1 +; RV32ID-ILP32-NEXT: or a0, s0, a0 ; RV32ID-ILP32-NEXT: or a1, s0, a1 ; RV32ID-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-ILP32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -3146,15 +3146,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32ID-NEXT: call __extendhfsf2 ; RV32ID-NEXT: lui a0, %hi(.LCPI12_0) ; RV32ID-NEXT: flw fa5, %lo(.LCPI12_0)(a0) -; RV32ID-NEXT: flt.s a0, fa5, fa0 -; RV32ID-NEXT: neg s0, a0 -; RV32ID-NEXT: fmv.w.x fa5, zero -; RV32ID-NEXT: fle.s a0, fa5, fa0 +; RV32ID-NEXT: fmv.w.x fa4, zero +; RV32ID-NEXT: fle.s a0, fa4, fa0 +; RV32ID-NEXT: flt.s a1, fa5, fa0 +; RV32ID-NEXT: neg s0, a1 ; RV32ID-NEXT: neg s1, a0 ; RV32ID-NEXT: call __fixunssfdi ; RV32ID-NEXT: and a0, s1, a0 -; RV32ID-NEXT: or a0, s0, a0 ; RV32ID-NEXT: and a1, s1, a1 +; RV32ID-NEXT: or a0, s0, a0 ; RV32ID-NEXT: or a1, s0, a1 ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -3185,15 +3185,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK32-IZFHMIN-NEXT: flw fa5, %lo(.LCPI12_0)(a0) ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa0, fa0 -; CHECK32-IZFHMIN-NEXT: flt.s a0, fa5, fa0 -; CHECK32-IZFHMIN-NEXT: neg s0, a0 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa5, zero -; CHECK32-IZFHMIN-NEXT: fle.s a0, fa5, fa0 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECK32-IZFHMIN-NEXT: fle.s a0, fa4, fa0 +; 
CHECK32-IZFHMIN-NEXT: flt.s a1, fa5, fa0 +; CHECK32-IZFHMIN-NEXT: neg s0, a1 ; CHECK32-IZFHMIN-NEXT: neg s1, a0 ; CHECK32-IZFHMIN-NEXT: call __fixunssfdi ; CHECK32-IZFHMIN-NEXT: and a0, s1, a0 -; CHECK32-IZFHMIN-NEXT: or a0, s0, a0 ; CHECK32-IZFHMIN-NEXT: and a1, s1, a1 +; CHECK32-IZFHMIN-NEXT: or a0, s0, a0 ; CHECK32-IZFHMIN-NEXT: or a1, s0, a1 ; CHECK32-IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; CHECK32-IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -3220,14 +3220,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZHINXMIN-NEXT: lui a1, 391168 ; CHECK32-IZHINXMIN-NEXT: addi a1, a1, -1 +; CHECK32-IZHINXMIN-NEXT: fle.s a2, zero, a0 ; CHECK32-IZHINXMIN-NEXT: flt.s a1, a1, a0 ; CHECK32-IZHINXMIN-NEXT: neg s0, a1 -; CHECK32-IZHINXMIN-NEXT: fle.s a1, zero, a0 -; CHECK32-IZHINXMIN-NEXT: neg s1, a1 +; CHECK32-IZHINXMIN-NEXT: neg s1, a2 ; CHECK32-IZHINXMIN-NEXT: call __fixunssfdi ; CHECK32-IZHINXMIN-NEXT: and a0, s1, a0 -; CHECK32-IZHINXMIN-NEXT: or a0, s0, a0 ; CHECK32-IZHINXMIN-NEXT: and a1, s1, a1 +; CHECK32-IZHINXMIN-NEXT: or a0, s0, a0 ; CHECK32-IZHINXMIN-NEXT: or a1, s0, a1 ; CHECK32-IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; CHECK32-IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -3254,14 +3254,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 391168 ; CHECK32-IZDINXZHINXMIN-NEXT: addi a1, a1, -1 +; CHECK32-IZDINXZHINXMIN-NEXT: fle.s a2, zero, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: flt.s a1, a1, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: neg s0, a1 -; CHECK32-IZDINXZHINXMIN-NEXT: fle.s a1, zero, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: neg s1, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: neg s1, a2 ; CHECK32-IZDINXZHINXMIN-NEXT: call __fixunssfdi ; CHECK32-IZDINXZHINXMIN-NEXT: and a0, s1, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: or a0, s0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: and a1, s1, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: or a0, s0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: or a1, s0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; CHECK32-IZDINXZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -6296,13 +6296,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_w_s_sat_i16: ; RV32IZFH: # %bb.0: # %start ; RV32IZFH-NEXT: fcvt.s.h fa5, fa0 -; RV32IZFH-NEXT: feq.s a0, fa5, fa5 -; RV32IZFH-NEXT: neg a0, a0 -; RV32IZFH-NEXT: lui a1, %hi(.LCPI32_0) -; RV32IZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a1) -; RV32IZFH-NEXT: lui a1, 815104 -; RV32IZFH-NEXT: fmv.w.x fa3, a1 +; RV32IZFH-NEXT: lui a0, %hi(.LCPI32_0) +; RV32IZFH-NEXT: feq.s a1, fa5, fa5 +; RV32IZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) +; RV32IZFH-NEXT: lui a0, 815104 +; RV32IZFH-NEXT: fmv.w.x fa3, a0 ; RV32IZFH-NEXT: fmax.s fa5, fa5, fa3 +; RV32IZFH-NEXT: neg a0, a1 ; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IZFH-NEXT: and a0, a0, a1 @@ -6311,13 +6311,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64IZFH-LABEL: fcvt_w_s_sat_i16: ; RV64IZFH: # %bb.0: # %start ; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IZFH-NEXT: feq.s a0, fa5, fa5 -; RV64IZFH-NEXT: neg a0, a0 -; RV64IZFH-NEXT: lui a1, %hi(.LCPI32_0) -; RV64IZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a1) -; RV64IZFH-NEXT: lui a1, 815104 -; RV64IZFH-NEXT: fmv.w.x fa3, a1 +; RV64IZFH-NEXT: lui a0, %hi(.LCPI32_0) +; RV64IZFH-NEXT: feq.s a1, fa5, fa5 +; RV64IZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) +; RV64IZFH-NEXT: lui a0, 815104 +; RV64IZFH-NEXT: fmv.w.x fa3, a0 ; 
RV64IZFH-NEXT: fmax.s fa5, fa5, fa3 +; RV64IZFH-NEXT: neg a0, a1 ; RV64IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IZFH-NEXT: and a0, a0, a1 @@ -6326,13 +6326,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32IDZFH-LABEL: fcvt_w_s_sat_i16: ; RV32IDZFH: # %bb.0: # %start ; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV32IDZFH-NEXT: feq.s a0, fa5, fa5 -; RV32IDZFH-NEXT: neg a0, a0 -; RV32IDZFH-NEXT: lui a1, %hi(.LCPI32_0) -; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a1) -; RV32IDZFH-NEXT: lui a1, 815104 -; RV32IDZFH-NEXT: fmv.w.x fa3, a1 +; RV32IDZFH-NEXT: lui a0, %hi(.LCPI32_0) +; RV32IDZFH-NEXT: feq.s a1, fa5, fa5 +; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) +; RV32IDZFH-NEXT: lui a0, 815104 +; RV32IDZFH-NEXT: fmv.w.x fa3, a0 ; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa3 +; RV32IDZFH-NEXT: neg a0, a1 ; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IDZFH-NEXT: and a0, a0, a1 @@ -6341,13 +6341,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64IDZFH-LABEL: fcvt_w_s_sat_i16: ; RV64IDZFH: # %bb.0: # %start ; RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IDZFH-NEXT: feq.s a0, fa5, fa5 -; RV64IDZFH-NEXT: neg a0, a0 -; RV64IDZFH-NEXT: lui a1, %hi(.LCPI32_0) -; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a1) -; RV64IDZFH-NEXT: lui a1, 815104 -; RV64IDZFH-NEXT: fmv.w.x fa3, a1 +; RV64IDZFH-NEXT: lui a0, %hi(.LCPI32_0) +; RV64IDZFH-NEXT: feq.s a1, fa5, fa5 +; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) +; RV64IDZFH-NEXT: lui a0, 815104 +; RV64IDZFH-NEXT: fmv.w.x fa3, a0 ; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa3 +; RV64IDZFH-NEXT: neg a0, a1 ; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IDZFH-NEXT: and a0, a0, a1 @@ -6356,57 +6356,57 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32IZHINX-LABEL: fcvt_w_s_sat_i16: ; RV32IZHINX: # %bb.0: # %start ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 -; RV32IZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZHINX-NEXT: neg a1, a1 -; RV32IZHINX-NEXT: lui a2, 815104 -; RV32IZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZHINX-NEXT: lui a1, 815104 ; RV32IZHINX-NEXT: lui a2, 290816 +; RV32IZHINX-NEXT: fmax.s a1, a0, a1 +; RV32IZHINX-NEXT: feq.s a0, a0, a0 ; RV32IZHINX-NEXT: addi a2, a2, -512 -; RV32IZHINX-NEXT: fmin.s a0, a0, a2 -; RV32IZHINX-NEXT: fcvt.w.s a0, a0, rtz -; RV32IZHINX-NEXT: and a0, a1, a0 +; RV32IZHINX-NEXT: neg a0, a0 +; RV32IZHINX-NEXT: fmin.s a1, a1, a2 +; RV32IZHINX-NEXT: fcvt.w.s a1, a1, rtz +; RV32IZHINX-NEXT: and a0, a0, a1 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: fcvt_w_s_sat_i16: ; RV64IZHINX: # %bb.0: # %start ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 -; RV64IZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZHINX-NEXT: neg a1, a1 -; RV64IZHINX-NEXT: lui a2, 815104 -; RV64IZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZHINX-NEXT: lui a1, 815104 ; RV64IZHINX-NEXT: lui a2, 290816 +; RV64IZHINX-NEXT: fmax.s a1, a0, a1 +; RV64IZHINX-NEXT: feq.s a0, a0, a0 ; RV64IZHINX-NEXT: addiw a2, a2, -512 -; RV64IZHINX-NEXT: fmin.s a0, a0, a2 -; RV64IZHINX-NEXT: fcvt.l.s a0, a0, rtz -; RV64IZHINX-NEXT: and a0, a1, a0 +; RV64IZHINX-NEXT: neg a0, a0 +; RV64IZHINX-NEXT: fmin.s a1, a1, a2 +; RV64IZHINX-NEXT: fcvt.l.s a1, a1, rtz +; RV64IZHINX-NEXT: and a0, a0, a1 ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_w_s_sat_i16: ; RV32IZDINXZHINX: # %bb.0: # %start ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 -; RV32IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZDINXZHINX-NEXT: neg a1, a1 -; RV32IZDINXZHINX-NEXT: lui a2, 815104 -; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; 
RV32IZDINXZHINX-NEXT: lui a1, 815104 ; RV32IZDINXZHINX-NEXT: lui a2, 290816 +; RV32IZDINXZHINX-NEXT: fmax.s a1, a0, a1 +; RV32IZDINXZHINX-NEXT: feq.s a0, a0, a0 ; RV32IZDINXZHINX-NEXT: addi a2, a2, -512 -; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a2 -; RV32IZDINXZHINX-NEXT: fcvt.w.s a0, a0, rtz -; RV32IZDINXZHINX-NEXT: and a0, a1, a0 +; RV32IZDINXZHINX-NEXT: neg a0, a0 +; RV32IZDINXZHINX-NEXT: fmin.s a1, a1, a2 +; RV32IZDINXZHINX-NEXT: fcvt.w.s a1, a1, rtz +; RV32IZDINXZHINX-NEXT: and a0, a0, a1 ; RV32IZDINXZHINX-NEXT: ret ; ; RV64IZDINXZHINX-LABEL: fcvt_w_s_sat_i16: ; RV64IZDINXZHINX: # %bb.0: # %start ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 -; RV64IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZDINXZHINX-NEXT: neg a1, a1 -; RV64IZDINXZHINX-NEXT: lui a2, 815104 -; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZDINXZHINX-NEXT: lui a1, 815104 ; RV64IZDINXZHINX-NEXT: lui a2, 290816 +; RV64IZDINXZHINX-NEXT: fmax.s a1, a0, a1 +; RV64IZDINXZHINX-NEXT: feq.s a0, a0, a0 ; RV64IZDINXZHINX-NEXT: addiw a2, a2, -512 -; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a2 -; RV64IZDINXZHINX-NEXT: fcvt.l.s a0, a0, rtz -; RV64IZDINXZHINX-NEXT: and a0, a1, a0 +; RV64IZDINXZHINX-NEXT: neg a0, a0 +; RV64IZDINXZHINX-NEXT: fmin.s a1, a1, a2 +; RV64IZDINXZHINX-NEXT: fcvt.l.s a1, a1, rtz +; RV64IZDINXZHINX-NEXT: and a0, a0, a1 ; RV64IZDINXZHINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_s_sat_i16: @@ -6505,13 +6505,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 ; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 -; RV32ID-ILP32-NEXT: feq.s a0, fa5, fa5 -; RV32ID-ILP32-NEXT: neg a0, a0 -; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI32_0) -; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI32_0)(a1) -; RV32ID-ILP32-NEXT: lui a1, 815104 -; RV32ID-ILP32-NEXT: fmv.w.x fa3, a1 +; RV32ID-ILP32-NEXT: lui a0, %hi(.LCPI32_0) +; RV32ID-ILP32-NEXT: feq.s a1, fa5, fa5 +; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI32_0)(a0) +; RV32ID-ILP32-NEXT: lui a0, 815104 +; RV32ID-ILP32-NEXT: fmv.w.x fa3, a0 ; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa3 +; RV32ID-ILP32-NEXT: neg a0, a1 ; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-ILP32-NEXT: and a0, a0, a1 @@ -6525,13 +6525,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: call __extendhfsf2 ; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 -; RV64ID-LP64-NEXT: feq.s a0, fa5, fa5 -; RV64ID-LP64-NEXT: neg a0, a0 -; RV64ID-LP64-NEXT: lui a1, %hi(.LCPI32_0) -; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI32_0)(a1) -; RV64ID-LP64-NEXT: lui a1, 815104 -; RV64ID-LP64-NEXT: fmv.w.x fa3, a1 +; RV64ID-LP64-NEXT: lui a0, %hi(.LCPI32_0) +; RV64ID-LP64-NEXT: feq.s a1, fa5, fa5 +; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI32_0)(a0) +; RV64ID-LP64-NEXT: lui a0, 815104 +; RV64ID-LP64-NEXT: fmv.w.x fa3, a0 ; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa3 +; RV64ID-LP64-NEXT: neg a0, a1 ; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-LP64-NEXT: and a0, a0, a1 @@ -6545,12 +6545,12 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 ; RV32ID-NEXT: feq.s a0, fa0, fa0 -; RV32ID-NEXT: neg a0, a0 ; RV32ID-NEXT: lui a1, %hi(.LCPI32_0) ; RV32ID-NEXT: flw fa5, %lo(.LCPI32_0)(a1) ; RV32ID-NEXT: lui a1, 815104 ; RV32ID-NEXT: fmv.w.x fa4, a1 ; RV32ID-NEXT: fmax.s fa4, fa0, fa4 +; RV32ID-NEXT: neg a0, a0 ; RV32ID-NEXT: fmin.s fa5, fa4, fa5 ; 
RV32ID-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-NEXT: and a0, a0, a1 @@ -6564,12 +6564,12 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 ; RV64ID-NEXT: feq.s a0, fa0, fa0 -; RV64ID-NEXT: neg a0, a0 ; RV64ID-NEXT: lui a1, %hi(.LCPI32_0) ; RV64ID-NEXT: flw fa5, %lo(.LCPI32_0)(a1) ; RV64ID-NEXT: lui a1, 815104 ; RV64ID-NEXT: fmv.w.x fa4, a1 ; RV64ID-NEXT: fmax.s fa4, fa0, fa4 +; RV64ID-NEXT: neg a0, a0 ; RV64ID-NEXT: fmin.s fa5, fa4, fa5 ; RV64ID-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-NEXT: and a0, a0, a1 @@ -6580,13 +6580,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK32-IZFHMIN-LABEL: fcvt_w_s_sat_i16: ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK32-IZFHMIN-NEXT: feq.s a0, fa5, fa5 -; CHECK32-IZFHMIN-NEXT: neg a0, a0 -; CHECK32-IZFHMIN-NEXT: lui a1, %hi(.LCPI32_0) -; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI32_0)(a1) -; CHECK32-IZFHMIN-NEXT: lui a1, 815104 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, a1 +; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI32_0) +; CHECK32-IZFHMIN-NEXT: feq.s a1, fa5, fa5 +; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI32_0)(a0) +; CHECK32-IZFHMIN-NEXT: lui a0, 815104 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, a0 ; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 +; CHECK32-IZFHMIN-NEXT: neg a0, a1 ; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz ; CHECK32-IZFHMIN-NEXT: and a0, a0, a1 @@ -6595,13 +6595,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK64-IZFHMIN-LABEL: fcvt_w_s_sat_i16: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: neg a0, a0 -; CHECK64-IZFHMIN-NEXT: lui a1, %hi(.LCPI32_0) -; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI32_0)(a1) -; CHECK64-IZFHMIN-NEXT: lui a1, 815104 -; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, a1 +; CHECK64-IZFHMIN-NEXT: lui a0, %hi(.LCPI32_0) +; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI32_0)(a0) +; CHECK64-IZFHMIN-NEXT: lui a0, 815104 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, a0 ; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 +; CHECK64-IZFHMIN-NEXT: neg a0, a1 ; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz ; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 @@ -6610,57 +6610,57 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK32-IZHINXMIN-LABEL: fcvt_w_s_sat_i16: ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZHINXMIN-NEXT: lui a2, 815104 -; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZHINXMIN-NEXT: lui a1, 815104 ; CHECK32-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK32-IZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECK32-IZHINXMIN-NEXT: addi a2, a2, -512 -; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz -; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 +; CHECK32-IZHINXMIN-NEXT: neg a0, a0 +; CHECK32-IZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a1, a1, rtz +; CHECK32-IZHINXMIN-NEXT: and a0, a0, a1 ; CHECK32-IZHINXMIN-NEXT: ret ; ; CHECK64-IZHINXMIN-LABEL: fcvt_w_s_sat_i16: ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZHINXMIN-NEXT: neg a1, a1 -; 
CHECK64-IZHINXMIN-NEXT: lui a2, 815104 -; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZHINXMIN-NEXT: lui a1, 815104 ; CHECK64-IZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK64-IZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECK64-IZHINXMIN-NEXT: addiw a2, a2, -512 -; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz -; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 +; CHECK64-IZHINXMIN-NEXT: neg a0, a0 +; CHECK64-IZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a1, a1, rtz +; CHECK64-IZHINXMIN-NEXT: and a0, a0, a1 ; CHECK64-IZHINXMIN-NEXT: ret ; ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i16: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 815104 -; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 815104 ; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: addi a2, a2, -512 -; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz -; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: neg a0, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a1, a1, rtz +; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: ret ; ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i16: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 815104 -; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, 815104 ; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 290816 +; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: addiw a2, a2, -512 -; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz -; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: neg a0, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a1, a1, rtz +; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a0, a1 ; CHECK64-IZDINXZHINXMIN-NEXT: ret start: %0 = tail call i16 @llvm.fptosi.sat.i16.f16(half %a) @@ -6861,8 +6861,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV32IZHINX-LABEL: fcvt_wu_s_sat_i16: ; RV32IZHINX: # %bb.0: # %start ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 -; RV32IZHINX-NEXT: fmax.s a0, a0, zero ; RV32IZHINX-NEXT: lui a1, 292864 +; RV32IZHINX-NEXT: fmax.s a0, a0, zero ; RV32IZHINX-NEXT: addi a1, a1, -256 ; RV32IZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZHINX-NEXT: fcvt.wu.s a0, a0, rtz @@ -6871,8 +6871,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV64IZHINX-LABEL: fcvt_wu_s_sat_i16: ; RV64IZHINX: # %bb.0: # %start ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 -; RV64IZHINX-NEXT: fmax.s a0, a0, zero ; RV64IZHINX-NEXT: lui a1, 292864 +; RV64IZHINX-NEXT: fmax.s a0, a0, zero ; RV64IZHINX-NEXT: addiw a1, a1, -256 ; RV64IZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZHINX-NEXT: fcvt.lu.s a0, a0, rtz @@ -6881,8 +6881,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV32IZDINXZHINX-LABEL: 
fcvt_wu_s_sat_i16: ; RV32IZDINXZHINX: # %bb.0: # %start ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 -; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, zero ; RV32IZDINXZHINX-NEXT: lui a1, 292864 +; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, zero ; RV32IZDINXZHINX-NEXT: addi a1, a1, -256 ; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV32IZDINXZHINX-NEXT: fcvt.wu.s a0, a0, rtz @@ -6891,8 +6891,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV64IZDINXZHINX-LABEL: fcvt_wu_s_sat_i16: ; RV64IZDINXZHINX: # %bb.0: # %start ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 -; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, zero ; RV64IZDINXZHINX-NEXT: lui a1, 292864 +; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, zero ; RV64IZDINXZHINX-NEXT: addiw a1, a1, -256 ; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a1 ; RV64IZDINXZHINX-NEXT: fcvt.lu.s a0, a0, rtz @@ -7067,8 +7067,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; CHECK32-IZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK32-IZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK32-IZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz @@ -7077,8 +7077,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; CHECK64-IZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK64-IZHINXMIN-NEXT: lui a1, 292864 +; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK64-IZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz @@ -7087,8 +7087,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK32-IZDINXZHINXMIN-NEXT: addi a1, a1, -256 ; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz @@ -7097,8 +7097,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, 292864 +; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, zero ; CHECK64-IZDINXZHINXMIN-NEXT: addiw a1, a1, -256 ; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a1 ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz @@ -7258,12 +7258,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_w_s_sat_i8: ; RV32IZFH: # %bb.0: # %start ; RV32IZFH-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFH-NEXT: lui a0, 798720 +; RV32IZFH-NEXT: lui a1, 274400 +; RV32IZFH-NEXT: fmv.w.x fa4, a0 ; RV32IZFH-NEXT: feq.s a0, fa5, fa5 ; RV32IZFH-NEXT: neg a0, a0 -; RV32IZFH-NEXT: lui a1, 798720 -; RV32IZFH-NEXT: fmv.w.x fa4, a1 ; RV32IZFH-NEXT: fmax.s fa5, fa5, fa4 -; RV32IZFH-NEXT: lui a1, 274400 ; RV32IZFH-NEXT: fmv.w.x fa4, a1 ; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.w.s a1, fa5, rtz @@ -7273,12 +7273,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind { ; RV64IZFH-LABEL: fcvt_w_s_sat_i8: ; RV64IZFH: # %bb.0: # %start 
; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFH-NEXT: lui a0, 798720 +; RV64IZFH-NEXT: lui a1, 274400 +; RV64IZFH-NEXT: fmv.w.x fa4, a0 ; RV64IZFH-NEXT: feq.s a0, fa5, fa5 ; RV64IZFH-NEXT: neg a0, a0 -; RV64IZFH-NEXT: lui a1, 798720 -; RV64IZFH-NEXT: fmv.w.x fa4, a1 ; RV64IZFH-NEXT: fmax.s fa5, fa5, fa4 -; RV64IZFH-NEXT: lui a1, 274400 ; RV64IZFH-NEXT: fmv.w.x fa4, a1 ; RV64IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fcvt.l.s a1, fa5, rtz @@ -7288,12 +7288,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind { ; RV32IDZFH-LABEL: fcvt_w_s_sat_i8: ; RV32IDZFH: # %bb.0: # %start ; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 +; RV32IDZFH-NEXT: lui a0, 798720 +; RV32IDZFH-NEXT: lui a1, 274400 +; RV32IDZFH-NEXT: fmv.w.x fa4, a0 ; RV32IDZFH-NEXT: feq.s a0, fa5, fa5 ; RV32IDZFH-NEXT: neg a0, a0 -; RV32IDZFH-NEXT: lui a1, 798720 -; RV32IDZFH-NEXT: fmv.w.x fa4, a1 ; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa4 -; RV32IDZFH-NEXT: lui a1, 274400 ; RV32IDZFH-NEXT: fmv.w.x fa4, a1 ; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.w.s a1, fa5, rtz @@ -7303,12 +7303,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind { ; RV64IDZFH-LABEL: fcvt_w_s_sat_i8: ; RV64IDZFH: # %bb.0: # %start ; RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 +; RV64IDZFH-NEXT: lui a0, 798720 +; RV64IDZFH-NEXT: lui a1, 274400 +; RV64IDZFH-NEXT: fmv.w.x fa4, a0 ; RV64IDZFH-NEXT: feq.s a0, fa5, fa5 ; RV64IDZFH-NEXT: neg a0, a0 -; RV64IDZFH-NEXT: lui a1, 798720 -; RV64IDZFH-NEXT: fmv.w.x fa4, a1 ; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa4 -; RV64IDZFH-NEXT: lui a1, 274400 ; RV64IDZFH-NEXT: fmv.w.x fa4, a1 ; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.l.s a1, fa5, rtz @@ -7318,53 +7318,53 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind { ; RV32IZHINX-LABEL: fcvt_w_s_sat_i8: ; RV32IZHINX: # %bb.0: # %start ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 -; RV32IZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZHINX-NEXT: neg a1, a1 -; RV32IZHINX-NEXT: lui a2, 798720 -; RV32IZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZHINX-NEXT: lui a1, 798720 ; RV32IZHINX-NEXT: lui a2, 274400 -; RV32IZHINX-NEXT: fmin.s a0, a0, a2 -; RV32IZHINX-NEXT: fcvt.w.s a0, a0, rtz -; RV32IZHINX-NEXT: and a0, a1, a0 +; RV32IZHINX-NEXT: fmax.s a1, a0, a1 +; RV32IZHINX-NEXT: feq.s a0, a0, a0 +; RV32IZHINX-NEXT: neg a0, a0 +; RV32IZHINX-NEXT: fmin.s a1, a1, a2 +; RV32IZHINX-NEXT: fcvt.w.s a1, a1, rtz +; RV32IZHINX-NEXT: and a0, a0, a1 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: fcvt_w_s_sat_i8: ; RV64IZHINX: # %bb.0: # %start ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 -; RV64IZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZHINX-NEXT: neg a1, a1 -; RV64IZHINX-NEXT: lui a2, 798720 -; RV64IZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZHINX-NEXT: lui a1, 798720 ; RV64IZHINX-NEXT: lui a2, 274400 -; RV64IZHINX-NEXT: fmin.s a0, a0, a2 -; RV64IZHINX-NEXT: fcvt.l.s a0, a0, rtz -; RV64IZHINX-NEXT: and a0, a1, a0 +; RV64IZHINX-NEXT: fmax.s a1, a0, a1 +; RV64IZHINX-NEXT: feq.s a0, a0, a0 +; RV64IZHINX-NEXT: neg a0, a0 +; RV64IZHINX-NEXT: fmin.s a1, a1, a2 +; RV64IZHINX-NEXT: fcvt.l.s a1, a1, rtz +; RV64IZHINX-NEXT: and a0, a0, a1 ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_w_s_sat_i8: ; RV32IZDINXZHINX: # %bb.0: # %start ; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0 -; RV32IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV32IZDINXZHINX-NEXT: neg a1, a1 -; RV32IZDINXZHINX-NEXT: lui a2, 798720 -; RV32IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV32IZDINXZHINX-NEXT: lui a1, 798720 ; RV32IZDINXZHINX-NEXT: lui a2, 274400 -; RV32IZDINXZHINX-NEXT: fmin.s a0, a0, a2 -; RV32IZDINXZHINX-NEXT: fcvt.w.s a0, a0, rtz -; 
RV32IZDINXZHINX-NEXT: and a0, a1, a0 +; RV32IZDINXZHINX-NEXT: fmax.s a1, a0, a1 +; RV32IZDINXZHINX-NEXT: feq.s a0, a0, a0 +; RV32IZDINXZHINX-NEXT: neg a0, a0 +; RV32IZDINXZHINX-NEXT: fmin.s a1, a1, a2 +; RV32IZDINXZHINX-NEXT: fcvt.w.s a1, a1, rtz +; RV32IZDINXZHINX-NEXT: and a0, a0, a1 ; RV32IZDINXZHINX-NEXT: ret ; ; RV64IZDINXZHINX-LABEL: fcvt_w_s_sat_i8: ; RV64IZDINXZHINX: # %bb.0: # %start ; RV64IZDINXZHINX-NEXT: fcvt.s.h a0, a0 -; RV64IZDINXZHINX-NEXT: feq.s a1, a0, a0 -; RV64IZDINXZHINX-NEXT: neg a1, a1 -; RV64IZDINXZHINX-NEXT: lui a2, 798720 -; RV64IZDINXZHINX-NEXT: fmax.s a0, a0, a2 +; RV64IZDINXZHINX-NEXT: lui a1, 798720 ; RV64IZDINXZHINX-NEXT: lui a2, 274400 -; RV64IZDINXZHINX-NEXT: fmin.s a0, a0, a2 -; RV64IZDINXZHINX-NEXT: fcvt.l.s a0, a0, rtz -; RV64IZDINXZHINX-NEXT: and a0, a1, a0 +; RV64IZDINXZHINX-NEXT: fmax.s a1, a0, a1 +; RV64IZDINXZHINX-NEXT: feq.s a0, a0, a0 +; RV64IZDINXZHINX-NEXT: neg a0, a0 +; RV64IZDINXZHINX-NEXT: fmin.s a1, a1, a2 +; RV64IZDINXZHINX-NEXT: fcvt.l.s a1, a1, rtz +; RV64IZDINXZHINX-NEXT: and a0, a0, a1 ; RV64IZDINXZHINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_s_sat_i8: @@ -7459,12 +7459,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind { ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 ; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 +; RV32ID-ILP32-NEXT: lui a0, 798720 +; RV32ID-ILP32-NEXT: lui a1, 274400 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 ; RV32ID-ILP32-NEXT: feq.s a0, fa5, fa5 ; RV32ID-ILP32-NEXT: neg a0, a0 -; RV32ID-ILP32-NEXT: lui a1, 798720 -; RV32ID-ILP32-NEXT: fmv.w.x fa4, a1 ; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa4 -; RV32ID-ILP32-NEXT: lui a1, 274400 ; RV32ID-ILP32-NEXT: fmv.w.x fa4, a1 ; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.w.s a1, fa5, rtz @@ -7479,12 +7479,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind { ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: call __extendhfsf2 ; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 +; RV64ID-LP64-NEXT: lui a0, 798720 +; RV64ID-LP64-NEXT: lui a1, 274400 +; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 ; RV64ID-LP64-NEXT: feq.s a0, fa5, fa5 ; RV64ID-LP64-NEXT: neg a0, a0 -; RV64ID-LP64-NEXT: lui a1, 798720 -; RV64ID-LP64-NEXT: fmv.w.x fa4, a1 ; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa4 -; RV64ID-LP64-NEXT: lui a1, 274400 ; RV64ID-LP64-NEXT: fmv.w.x fa4, a1 ; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.l.s a1, fa5, rtz @@ -7499,11 +7499,11 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind { ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 ; RV32ID-NEXT: feq.s a0, fa0, fa0 -; RV32ID-NEXT: neg a0, a0 ; RV32ID-NEXT: lui a1, 798720 ; RV32ID-NEXT: fmv.w.x fa5, a1 -; RV32ID-NEXT: fmax.s fa5, fa0, fa5 ; RV32ID-NEXT: lui a1, 274400 +; RV32ID-NEXT: neg a0, a0 +; RV32ID-NEXT: fmax.s fa5, fa0, fa5 ; RV32ID-NEXT: fmv.w.x fa4, a1 ; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.w.s a1, fa5, rtz @@ -7518,11 +7518,11 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind { ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 ; RV64ID-NEXT: feq.s a0, fa0, fa0 -; RV64ID-NEXT: neg a0, a0 ; RV64ID-NEXT: lui a1, 798720 ; RV64ID-NEXT: fmv.w.x fa5, a1 -; RV64ID-NEXT: fmax.s fa5, fa0, fa5 ; RV64ID-NEXT: lui a1, 274400 +; RV64ID-NEXT: neg a0, a0 +; RV64ID-NEXT: fmax.s fa5, fa0, fa5 ; RV64ID-NEXT: fmv.w.x fa4, a1 ; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.l.s a1, fa5, rtz @@ -7534,12 +7534,12 @@ define signext i8 @fcvt_w_s_sat_i8(half 
%a) nounwind { ; CHECK32-IZFHMIN-LABEL: fcvt_w_s_sat_i8: ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECK32-IZFHMIN-NEXT: lui a0, 798720 +; CHECK32-IZFHMIN-NEXT: lui a1, 274400 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a0 ; CHECK32-IZFHMIN-NEXT: feq.s a0, fa5, fa5 ; CHECK32-IZFHMIN-NEXT: neg a0, a0 -; CHECK32-IZFHMIN-NEXT: lui a1, 798720 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a1 ; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 -; CHECK32-IZFHMIN-NEXT: lui a1, 274400 ; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a1 ; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz @@ -7549,12 +7549,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind { ; CHECK64-IZFHMIN-LABEL: fcvt_w_s_sat_i8: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECK64-IZFHMIN-NEXT: lui a0, 798720 +; CHECK64-IZFHMIN-NEXT: lui a1, 274400 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a0 ; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 ; CHECK64-IZFHMIN-NEXT: neg a0, a0 -; CHECK64-IZFHMIN-NEXT: lui a1, 798720 -; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a1 ; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 -; CHECK64-IZFHMIN-NEXT: lui a1, 274400 ; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a1 ; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz @@ -7564,53 +7564,53 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind { ; CHECK32-IZHINXMIN-LABEL: fcvt_w_s_sat_i8: ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZHINXMIN-NEXT: lui a2, 798720 -; CHECK32-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZHINXMIN-NEXT: lui a1, 798720 ; CHECK32-IZHINXMIN-NEXT: lui a2, 274400 -; CHECK32-IZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz -; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 +; CHECK32-IZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK32-IZHINXMIN-NEXT: feq.s a0, a0, a0 +; CHECK32-IZHINXMIN-NEXT: neg a0, a0 +; CHECK32-IZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a1, a1, rtz +; CHECK32-IZHINXMIN-NEXT: and a0, a0, a1 ; CHECK32-IZHINXMIN-NEXT: ret ; ; CHECK64-IZHINXMIN-LABEL: fcvt_w_s_sat_i8: ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZHINXMIN-NEXT: lui a2, 798720 -; CHECK64-IZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZHINXMIN-NEXT: lui a1, 798720 ; CHECK64-IZHINXMIN-NEXT: lui a2, 274400 -; CHECK64-IZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz -; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 +; CHECK64-IZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK64-IZHINXMIN-NEXT: feq.s a0, a0, a0 +; CHECK64-IZHINXMIN-NEXT: neg a0, a0 +; CHECK64-IZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a1, a1, rtz +; CHECK64-IZHINXMIN-NEXT: and a0, a0, a1 ; CHECK64-IZHINXMIN-NEXT: ret ; ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i8: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 798720 -; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 798720 ; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 274400 -; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz -; 
CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: neg a0, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a1, a1, rtz +; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a0, a1 ; CHECK32-IZDINXZHINXMIN-NEXT: ret ; ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i8: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: neg a1, a1 -; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 798720 -; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a0, a0, a2 +; CHECK64-IZDINXZHINXMIN-NEXT: lui a1, 798720 ; CHECK64-IZDINXZHINXMIN-NEXT: lui a2, 274400 -; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a0, a0, a2 -; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz -; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: fmax.s a1, a0, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: neg a0, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: fmin.s a1, a1, a2 +; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a1, a1, rtz +; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a0, a1 ; CHECK64-IZDINXZHINXMIN-NEXT: ret start: %0 = tail call i8 @llvm.fptosi.sat.i8.f16(half %a) @@ -7769,8 +7769,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind { ; RV32IZFH: # %bb.0: # %start ; RV32IZFH-NEXT: fcvt.s.h fa5, fa0 ; RV32IZFH-NEXT: fmv.w.x fa4, zero -; RV32IZFH-NEXT: fmax.s fa5, fa5, fa4 ; RV32IZFH-NEXT: lui a0, 276464 +; RV32IZFH-NEXT: fmax.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fmv.w.x fa4, a0 ; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.wu.s a0, fa5, rtz @@ -7780,8 +7780,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind { ; RV64IZFH: # %bb.0: # %start ; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 ; RV64IZFH-NEXT: fmv.w.x fa4, zero -; RV64IZFH-NEXT: fmax.s fa5, fa5, fa4 ; RV64IZFH-NEXT: lui a0, 276464 +; RV64IZFH-NEXT: fmax.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fmv.w.x fa4, a0 ; RV64IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fcvt.lu.s a0, fa5, rtz @@ -7791,8 +7791,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind { ; RV32IDZFH: # %bb.0: # %start ; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 ; RV32IDZFH-NEXT: fmv.w.x fa4, zero -; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: lui a0, 276464 +; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fmv.w.x fa4, a0 ; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.wu.s a0, fa5, rtz @@ -7802,8 +7802,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind { ; RV64IDZFH: # %bb.0: # %start ; RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 ; RV64IDZFH-NEXT: fmv.w.x fa4, zero -; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: lui a0, 276464 +; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fmv.w.x fa4, a0 ; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.lu.s a0, fa5, rtz @@ -7926,8 +7926,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind { ; RV32ID-ILP32-NEXT: call __extendhfsf2 ; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 ; RV32ID-ILP32-NEXT: fmv.w.x fa4, zero -; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: lui a0, 276464 +; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 ; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.wu.s a0, fa5, rtz @@ -7942,8 +7942,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind { ; RV64ID-LP64-NEXT: call __extendhfsf2 ; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 ; 
RV64ID-LP64-NEXT: fmv.w.x fa4, zero -; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: lui a0, 276464 +; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 ; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.lu.s a0, fa5, rtz @@ -7957,8 +7957,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind { ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 ; RV32ID-NEXT: fmv.w.x fa5, zero -; RV32ID-NEXT: fmax.s fa5, fa0, fa5 ; RV32ID-NEXT: lui a0, 276464 +; RV32ID-NEXT: fmax.s fa5, fa0, fa5 ; RV32ID-NEXT: fmv.w.x fa4, a0 ; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.wu.s a0, fa5, rtz @@ -7972,8 +7972,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind { ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 ; RV64ID-NEXT: fmv.w.x fa5, zero -; RV64ID-NEXT: fmax.s fa5, fa0, fa5 ; RV64ID-NEXT: lui a0, 276464 +; RV64ID-NEXT: fmax.s fa5, fa0, fa5 ; RV64ID-NEXT: fmv.w.x fa4, a0 ; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz @@ -7985,8 +7985,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind { ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 ; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: lui a0, 276464 +; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a0 ; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz @@ -7996,8 +7996,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind { ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 ; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: lui a0, 276464 +; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a0 ; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.lu.s a0, fa5, rtz diff --git a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll index 4bc595bcc4cc8..12cf088e3205f 100644 --- a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll +++ b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll @@ -297,8 +297,8 @@ define i32 @fcmp_ord(half %a, half %b) nounwind strictfp { ; CHECKIZHINXMIN-LABEL: fcmp_ord: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: feq.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: feq.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECKIZHINXMIN-NEXT: and a0, a0, a1 ; CHECKIZHINXMIN-NEXT: ret @@ -608,8 +608,8 @@ define i32 @fcmp_uno(half %a, half %b) nounwind strictfp { ; CHECKIZHINXMIN-LABEL: fcmp_uno: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: feq.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: feq.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECKIZHINXMIN-NEXT: and a0, a0, a1 ; CHECKIZHINXMIN-NEXT: xori a0, a0, 1 @@ -823,8 +823,8 @@ define i32 @fcmps_ord(half %a, half %b) nounwind strictfp { ; CHECKIZHINXMIN-LABEL: fcmps_ord: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: fle.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fle.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: fle.s a0, a0, a0 ; CHECKIZHINXMIN-NEXT: and a0, a0, a1 ; CHECKIZHINXMIN-NEXT: ret @@ -1063,8 +1063,8 @@ define i32 @fcmps_uno(half %a, half %b) nounwind 
strictfp { ; CHECKIZHINXMIN-LABEL: fcmps_uno: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: fle.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fle.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: fle.s a0, a0, a0 ; CHECKIZHINXMIN-NEXT: and a0, a0, a1 ; CHECKIZHINXMIN-NEXT: xori a0, a0, 1 diff --git a/llvm/test/CodeGen/RISCV/half-fcmp.ll b/llvm/test/CodeGen/RISCV/half-fcmp.ll index 81ef56635eebc..d25d8cc1c1994 100644 --- a/llvm/test/CodeGen/RISCV/half-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/half-fcmp.ll @@ -426,9 +426,9 @@ define i32 @fcmp_ord(half %a, half %b) nounwind { ; CHECKIZFHMIN-ILP32F-LP64F-LABEL: fcmp_ord: ; CHECKIZFHMIN-ILP32F-LP64F: # %bb.0: ; CHECKIZFHMIN-ILP32F-LP64F-NEXT: fcvt.s.h fa5, fa1 +; CHECKIZFHMIN-ILP32F-LP64F-NEXT: fcvt.s.h fa4, fa0 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT: feq.s a0, fa5, fa5 -; CHECKIZFHMIN-ILP32F-LP64F-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-ILP32F-LP64F-NEXT: feq.s a1, fa5, fa5 +; CHECKIZFHMIN-ILP32F-LP64F-NEXT: feq.s a1, fa4, fa4 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT: and a0, a1, a0 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT: ret ; @@ -437,8 +437,8 @@ define i32 @fcmp_ord(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fmv.h.x fa5, a0 ; CHECKIZFHMIN-NEXT: fmv.h.x fa4, a1 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: feq.s a0, fa4, fa4 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; CHECKIZFHMIN-NEXT: feq.s a0, fa4, fa4 ; CHECKIZFHMIN-NEXT: feq.s a1, fa5, fa5 ; CHECKIZFHMIN-NEXT: and a0, a1, a0 ; CHECKIZFHMIN-NEXT: ret @@ -446,8 +446,8 @@ define i32 @fcmp_ord(half %a, half %b) nounwind { ; CHECKIZHINXMIN-LABEL: fcmp_ord: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: feq.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: feq.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECKIZHINXMIN-NEXT: and a0, a0, a1 ; CHECKIZHINXMIN-NEXT: ret @@ -864,9 +864,9 @@ define i32 @fcmp_uno(half %a, half %b) nounwind { ; CHECKIZFHMIN-ILP32F-LP64F-LABEL: fcmp_uno: ; CHECKIZFHMIN-ILP32F-LP64F: # %bb.0: ; CHECKIZFHMIN-ILP32F-LP64F-NEXT: fcvt.s.h fa5, fa1 +; CHECKIZFHMIN-ILP32F-LP64F-NEXT: fcvt.s.h fa4, fa0 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT: feq.s a0, fa5, fa5 -; CHECKIZFHMIN-ILP32F-LP64F-NEXT: fcvt.s.h fa5, fa0 -; CHECKIZFHMIN-ILP32F-LP64F-NEXT: feq.s a1, fa5, fa5 +; CHECKIZFHMIN-ILP32F-LP64F-NEXT: feq.s a1, fa4, fa4 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT: and a0, a1, a0 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT: xori a0, a0, 1 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT: ret @@ -876,8 +876,8 @@ define i32 @fcmp_uno(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fmv.h.x fa5, a0 ; CHECKIZFHMIN-NEXT: fmv.h.x fa4, a1 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: feq.s a0, fa4, fa4 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; CHECKIZFHMIN-NEXT: feq.s a0, fa4, fa4 ; CHECKIZFHMIN-NEXT: feq.s a1, fa5, fa5 ; CHECKIZFHMIN-NEXT: and a0, a1, a0 ; CHECKIZFHMIN-NEXT: xori a0, a0, 1 @@ -886,8 +886,8 @@ define i32 @fcmp_uno(half %a, half %b) nounwind { ; CHECKIZHINXMIN-LABEL: fcmp_uno: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: feq.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: feq.s a1, a1, a1 ; CHECKIZHINXMIN-NEXT: feq.s a0, a0, a0 ; CHECKIZHINXMIN-NEXT: and a0, a0, a1 ; CHECKIZHINXMIN-NEXT: xori a0, a0, 1 diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll index f063c1fef4e16..0d26e660c979b 100644 --- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll +++ 
b/llvm/test/CodeGen/RISCV/half-intrinsics.ll @@ -2050,8 +2050,8 @@ define half @copysign_f16(half %a, half %b) nounwind { ; RV32I-LABEL: copysign_f16: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 1048568 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 17 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: srli a0, a0, 17 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret @@ -2059,8 +2059,8 @@ define half @copysign_f16(half %a, half %b) nounwind { ; RV64I-LABEL: copysign_f16: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 1048568 -; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a0, a0, 49 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: srli a0, a0, 49 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -2094,8 +2094,8 @@ define half @copysign_f16(half %a, half %b) nounwind { ; RV32IZHINXMIN-NEXT: # kill: def $x11_h killed $x11_h def $x11 ; RV32IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV32IZHINXMIN-NEXT: lui a2, 1048568 -; RV32IZHINXMIN-NEXT: and a1, a1, a2 ; RV32IZHINXMIN-NEXT: slli a0, a0, 17 +; RV32IZHINXMIN-NEXT: and a1, a1, a2 ; RV32IZHINXMIN-NEXT: srli a0, a0, 17 ; RV32IZHINXMIN-NEXT: or a0, a0, a1 ; RV32IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h killed $x10 @@ -2106,8 +2106,8 @@ define half @copysign_f16(half %a, half %b) nounwind { ; RV64IZHINXMIN-NEXT: # kill: def $x11_h killed $x11_h def $x11 ; RV64IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV64IZHINXMIN-NEXT: lui a2, 1048568 -; RV64IZHINXMIN-NEXT: and a1, a1, a2 ; RV64IZHINXMIN-NEXT: slli a0, a0, 49 +; RV64IZHINXMIN-NEXT: and a1, a1, a2 ; RV64IZHINXMIN-NEXT: srli a0, a0, 49 ; RV64IZHINXMIN-NEXT: or a0, a0, a1 ; RV64IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h killed $x10 @@ -2801,8 +2801,8 @@ define i1 @isnan_d_fpclass(half %x) { ; RV32I-LABEL: isnan_d_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 17 -; RV32I-NEXT: srli a0, a0, 17 ; RV32I-NEXT: li a1, 31 +; RV32I-NEXT: srli a0, a0, 17 ; RV32I-NEXT: slli a1, a1, 10 ; RV32I-NEXT: slt a0, a1, a0 ; RV32I-NEXT: ret @@ -2810,8 +2810,8 @@ define i1 @isnan_d_fpclass(half %x) { ; RV64I-LABEL: isnan_d_fpclass: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 49 -; RV64I-NEXT: srli a0, a0, 49 ; RV64I-NEXT: li a1, 31 +; RV64I-NEXT: srli a0, a0, 49 ; RV64I-NEXT: slli a1, a1, 10 ; RV64I-NEXT: slt a0, a1, a0 ; RV64I-NEXT: ret @@ -2819,9 +2819,9 @@ define i1 @isnan_d_fpclass(half %x) { ; RV32IZFHMIN-LABEL: isnan_d_fpclass: ; RV32IZFHMIN: # %bb.0: ; RV32IZFHMIN-NEXT: fmv.x.h a0, fa0 +; RV32IZFHMIN-NEXT: li a1, 31 ; RV32IZFHMIN-NEXT: slli a0, a0, 17 ; RV32IZFHMIN-NEXT: srli a0, a0, 17 -; RV32IZFHMIN-NEXT: li a1, 31 ; RV32IZFHMIN-NEXT: slli a1, a1, 10 ; RV32IZFHMIN-NEXT: slt a0, a1, a0 ; RV32IZFHMIN-NEXT: ret @@ -2829,9 +2829,9 @@ define i1 @isnan_d_fpclass(half %x) { ; RV64IZFHMIN-LABEL: isnan_d_fpclass: ; RV64IZFHMIN: # %bb.0: ; RV64IZFHMIN-NEXT: fmv.x.h a0, fa0 +; RV64IZFHMIN-NEXT: li a1, 31 ; RV64IZFHMIN-NEXT: slli a0, a0, 49 ; RV64IZFHMIN-NEXT: srli a0, a0, 49 -; RV64IZFHMIN-NEXT: li a1, 31 ; RV64IZFHMIN-NEXT: slli a1, a1, 10 ; RV64IZFHMIN-NEXT: slt a0, a1, a0 ; RV64IZFHMIN-NEXT: ret @@ -2840,8 +2840,8 @@ define i1 @isnan_d_fpclass(half %x) { ; RV32IZHINXMIN: # %bb.0: ; RV32IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV32IZHINXMIN-NEXT: slli a0, a0, 17 -; RV32IZHINXMIN-NEXT: srli a0, a0, 17 ; RV32IZHINXMIN-NEXT: li a1, 31 +; RV32IZHINXMIN-NEXT: srli a0, a0, 17 ; RV32IZHINXMIN-NEXT: slli a1, a1, 10 ; RV32IZHINXMIN-NEXT: slt a0, a1, a0 ; RV32IZHINXMIN-NEXT: ret @@ -2850,8 +2850,8 @@ define i1 @isnan_d_fpclass(half %x) { ; RV64IZHINXMIN: # %bb.0: ; 
RV64IZHINXMIN-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV64IZHINXMIN-NEXT: slli a0, a0, 49 -; RV64IZHINXMIN-NEXT: srli a0, a0, 49 ; RV64IZHINXMIN-NEXT: li a1, 31 +; RV64IZHINXMIN-NEXT: srli a0, a0, 49 ; RV64IZHINXMIN-NEXT: slli a1, a1, 10 ; RV64IZHINXMIN-NEXT: slt a0, a1, a0 ; RV64IZHINXMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/half-mem.ll b/llvm/test/CodeGen/RISCV/half-mem.ll index 5b6a94a83f94b..9ac2a4d037f8a 100644 --- a/llvm/test/CodeGen/RISCV/half-mem.ll +++ b/llvm/test/CodeGen/RISCV/half-mem.ll @@ -134,10 +134,10 @@ define half @flh_fsh_global(half %a, half %b) nounwind { ; CHECKIZFHMIN: # %bb.0: ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa1 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; CHECKIZFHMIN-NEXT: lui a0, %hi(G) ; CHECKIZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; CHECKIZFHMIN-NEXT: flh fa4, %lo(G)(a0) ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: lui a0, %hi(G) -; CHECKIZFHMIN-NEXT: flh fa5, %lo(G)(a0) ; CHECKIZFHMIN-NEXT: addi a1, a0, %lo(G) ; CHECKIZFHMIN-NEXT: fsh fa0, %lo(G)(a0) ; CHECKIZFHMIN-NEXT: flh fa5, 18(a1) @@ -148,14 +148,14 @@ define half @flh_fsh_global(half %a, half %b) nounwind { ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: lui a2, %hi(G) ; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: lh zero, %lo(G)(a2) ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKIZHINXMIN-NEXT: lui a1, %hi(G) -; CHECKIZHINXMIN-NEXT: lh zero, %lo(G)(a1) -; CHECKIZHINXMIN-NEXT: addi a2, a1, %lo(G) -; CHECKIZHINXMIN-NEXT: sh a0, %lo(G)(a1) -; CHECKIZHINXMIN-NEXT: lh zero, 18(a2) -; CHECKIZHINXMIN-NEXT: sh a0, 18(a2) +; CHECKIZHINXMIN-NEXT: addi a1, a2, %lo(G) +; CHECKIZHINXMIN-NEXT: sh a0, %lo(G)(a2) +; CHECKIZHINXMIN-NEXT: lh zero, 18(a1) +; CHECKIZHINXMIN-NEXT: sh a0, 18(a1) ; CHECKIZHINXMIN-NEXT: ret %1 = fadd half %a, %b %2 = load volatile half, ptr @G diff --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll index 9e1a26e74d70b..3b645bf8aef91 100644 --- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll @@ -115,7 +115,7 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZFH-NEXT: fle.s s0, fa5, fs0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixsfdi -; RV32IZFH-NEXT: lui a4, 524288 +; RV32IZFH-NEXT: lui a3, 524288 ; RV32IZFH-NEXT: lui a2, 524288 ; RV32IZFH-NEXT: beqz s0, .LBB1_4 ; RV32IZFH-NEXT: # %bb.3: @@ -123,19 +123,19 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZFH-NEXT: .LBB1_4: ; RV32IZFH-NEXT: lui a1, %hi(.LCPI1_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI1_1)(a1) -; RV32IZFH-NEXT: flt.s a3, fa5, fs0 -; RV32IZFH-NEXT: beqz a3, .LBB1_6 +; RV32IZFH-NEXT: flt.s a1, fa5, fs0 +; RV32IZFH-NEXT: beqz a1, .LBB1_6 ; RV32IZFH-NEXT: # %bb.5: -; RV32IZFH-NEXT: addi a2, a4, -1 +; RV32IZFH-NEXT: addi a2, a3, -1 ; RV32IZFH-NEXT: .LBB1_6: -; RV32IZFH-NEXT: feq.s a1, fs0, fs0 -; RV32IZFH-NEXT: neg a4, a1 -; RV32IZFH-NEXT: and a1, a4, a2 -; RV32IZFH-NEXT: neg a2, s0 -; RV32IZFH-NEXT: and a0, a2, a0 -; RV32IZFH-NEXT: neg a2, a3 -; RV32IZFH-NEXT: or a0, a2, a0 +; RV32IZFH-NEXT: feq.s a3, fs0, fs0 +; RV32IZFH-NEXT: neg a4, s0 +; RV32IZFH-NEXT: neg a5, a1 +; RV32IZFH-NEXT: neg a3, a3 ; RV32IZFH-NEXT: and a0, a4, a0 +; RV32IZFH-NEXT: and a1, a3, a2 +; RV32IZFH-NEXT: or a0, a5, a0 +; RV32IZFH-NEXT: and a0, a3, a0 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte 
Folded Reload @@ -172,7 +172,7 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINX-NEXT: fle.s s1, a0, s0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a4, 524288 +; RV32IZHINX-NEXT: lui a3, 524288 ; RV32IZHINX-NEXT: lui a2, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB1_4 ; RV32IZHINX-NEXT: # %bb.3: @@ -180,19 +180,19 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINX-NEXT: .LBB1_4: ; RV32IZHINX-NEXT: lui a1, 389120 ; RV32IZHINX-NEXT: addi a1, a1, -1 -; RV32IZHINX-NEXT: flt.s a3, a1, s0 -; RV32IZHINX-NEXT: beqz a3, .LBB1_6 +; RV32IZHINX-NEXT: flt.s a1, a1, s0 +; RV32IZHINX-NEXT: beqz a1, .LBB1_6 ; RV32IZHINX-NEXT: # %bb.5: -; RV32IZHINX-NEXT: addi a2, a4, -1 +; RV32IZHINX-NEXT: addi a2, a3, -1 ; RV32IZHINX-NEXT: .LBB1_6: -; RV32IZHINX-NEXT: feq.s a1, s0, s0 -; RV32IZHINX-NEXT: neg a4, a1 -; RV32IZHINX-NEXT: and a1, a4, a2 -; RV32IZHINX-NEXT: neg a2, s1 -; RV32IZHINX-NEXT: and a0, a2, a0 -; RV32IZHINX-NEXT: neg a2, a3 -; RV32IZHINX-NEXT: or a0, a2, a0 +; RV32IZHINX-NEXT: feq.s a3, s0, s0 +; RV32IZHINX-NEXT: neg a4, s1 +; RV32IZHINX-NEXT: neg a5, a1 +; RV32IZHINX-NEXT: neg a3, a3 ; RV32IZHINX-NEXT: and a0, a4, a0 +; RV32IZHINX-NEXT: and a1, a3, a2 +; RV32IZHINX-NEXT: or a0, a5, a0 +; RV32IZHINX-NEXT: and a0, a3, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -236,13 +236,13 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: lui a0, 913408 +; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0 ; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixsfdi -; RV32IZFHMIN-NEXT: lui a4, 524288 +; RV32IZFHMIN-NEXT: lui a3, 524288 ; RV32IZFHMIN-NEXT: lui a2, 524288 ; RV32IZFHMIN-NEXT: beqz s0, .LBB1_4 ; RV32IZFHMIN-NEXT: # %bb.3: @@ -250,19 +250,19 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: .LBB1_4: ; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI1_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI1_0)(a1) -; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0 -; RV32IZFHMIN-NEXT: beqz a3, .LBB1_6 +; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 +; RV32IZFHMIN-NEXT: beqz a1, .LBB1_6 ; RV32IZFHMIN-NEXT: # %bb.5: -; RV32IZFHMIN-NEXT: addi a2, a4, -1 +; RV32IZFHMIN-NEXT: addi a2, a3, -1 ; RV32IZFHMIN-NEXT: .LBB1_6: -; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0 -; RV32IZFHMIN-NEXT: neg a4, a1 -; RV32IZFHMIN-NEXT: and a1, a4, a2 -; RV32IZFHMIN-NEXT: neg a2, s0 -; RV32IZFHMIN-NEXT: and a0, a2, a0 -; RV32IZFHMIN-NEXT: neg a2, a3 -; RV32IZFHMIN-NEXT: or a0, a2, a0 +; RV32IZFHMIN-NEXT: feq.s a3, fs0, fs0 +; RV32IZFHMIN-NEXT: neg a4, s0 +; RV32IZFHMIN-NEXT: neg a5, a1 +; RV32IZFHMIN-NEXT: neg a3, a3 ; RV32IZFHMIN-NEXT: and a0, a4, a0 +; RV32IZFHMIN-NEXT: and a1, a3, a2 +; RV32IZFHMIN-NEXT: or a0, a5, a0 +; RV32IZFHMIN-NEXT: and a0, a3, a0 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -313,7 +313,7 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: fle.s s1, a0, s0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a4, 524288 +; RV32IZHINXMIN-NEXT: lui a3, 524288 ; RV32IZHINXMIN-NEXT: 
lui a2, 524288 ; RV32IZHINXMIN-NEXT: beqz s1, .LBB1_4 ; RV32IZHINXMIN-NEXT: # %bb.3: @@ -321,19 +321,19 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: .LBB1_4: ; RV32IZHINXMIN-NEXT: lui a1, 389120 ; RV32IZHINXMIN-NEXT: addi a1, a1, -1 -; RV32IZHINXMIN-NEXT: flt.s a3, a1, s0 -; RV32IZHINXMIN-NEXT: beqz a3, .LBB1_6 +; RV32IZHINXMIN-NEXT: flt.s a1, a1, s0 +; RV32IZHINXMIN-NEXT: beqz a1, .LBB1_6 ; RV32IZHINXMIN-NEXT: # %bb.5: -; RV32IZHINXMIN-NEXT: addi a2, a4, -1 +; RV32IZHINXMIN-NEXT: addi a2, a3, -1 ; RV32IZHINXMIN-NEXT: .LBB1_6: -; RV32IZHINXMIN-NEXT: feq.s a1, s0, s0 -; RV32IZHINXMIN-NEXT: neg a4, a1 -; RV32IZHINXMIN-NEXT: and a1, a4, a2 -; RV32IZHINXMIN-NEXT: neg a2, s1 -; RV32IZHINXMIN-NEXT: and a0, a2, a0 -; RV32IZHINXMIN-NEXT: neg a2, a3 -; RV32IZHINXMIN-NEXT: or a0, a2, a0 +; RV32IZHINXMIN-NEXT: feq.s a3, s0, s0 +; RV32IZHINXMIN-NEXT: neg a4, s1 +; RV32IZHINXMIN-NEXT: neg a5, a1 +; RV32IZHINXMIN-NEXT: neg a3, a3 ; RV32IZHINXMIN-NEXT: and a0, a4, a0 +; RV32IZHINXMIN-NEXT: and a1, a3, a2 +; RV32IZHINXMIN-NEXT: or a0, a5, a0 +; RV32IZHINXMIN-NEXT: and a0, a3, a0 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -529,10 +529,10 @@ define i64 @test_floor_ui64(half %x) nounwind { ; RV32IZFH-NEXT: lui a2, %hi(.LCPI3_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI3_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 -; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: or a1, a2, a1 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -572,11 +572,11 @@ define i64 @test_floor_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: call __fixunssfdi ; RV32IZHINX-NEXT: and a0, s1, a0 ; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 -; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: or a1, a2, a1 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -630,10 +630,10 @@ define i64 @test_floor_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI3_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI3_0)(a2) ; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 -; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: or a1, a2, a1 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -687,11 +687,11 @@ define i64 @test_floor_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: call __fixunssfdi ; RV32IZHINXMIN-NEXT: and a0, s1, a0 ; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 -; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: or a1, a2, a1 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -823,7 +823,7 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZFH-NEXT: fle.s s0, fa5, fs0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixsfdi -; RV32IZFH-NEXT: lui 
a4, 524288 +; RV32IZFH-NEXT: lui a3, 524288 ; RV32IZFH-NEXT: lui a2, 524288 ; RV32IZFH-NEXT: beqz s0, .LBB5_4 ; RV32IZFH-NEXT: # %bb.3: @@ -831,19 +831,19 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZFH-NEXT: .LBB5_4: ; RV32IZFH-NEXT: lui a1, %hi(.LCPI5_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI5_1)(a1) -; RV32IZFH-NEXT: flt.s a3, fa5, fs0 -; RV32IZFH-NEXT: beqz a3, .LBB5_6 +; RV32IZFH-NEXT: flt.s a1, fa5, fs0 +; RV32IZFH-NEXT: beqz a1, .LBB5_6 ; RV32IZFH-NEXT: # %bb.5: -; RV32IZFH-NEXT: addi a2, a4, -1 +; RV32IZFH-NEXT: addi a2, a3, -1 ; RV32IZFH-NEXT: .LBB5_6: -; RV32IZFH-NEXT: feq.s a1, fs0, fs0 -; RV32IZFH-NEXT: neg a4, a1 -; RV32IZFH-NEXT: and a1, a4, a2 -; RV32IZFH-NEXT: neg a2, s0 -; RV32IZFH-NEXT: and a0, a2, a0 -; RV32IZFH-NEXT: neg a2, a3 -; RV32IZFH-NEXT: or a0, a2, a0 +; RV32IZFH-NEXT: feq.s a3, fs0, fs0 +; RV32IZFH-NEXT: neg a4, s0 +; RV32IZFH-NEXT: neg a5, a1 +; RV32IZFH-NEXT: neg a3, a3 ; RV32IZFH-NEXT: and a0, a4, a0 +; RV32IZFH-NEXT: and a1, a3, a2 +; RV32IZFH-NEXT: or a0, a5, a0 +; RV32IZFH-NEXT: and a0, a3, a0 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -880,7 +880,7 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINX-NEXT: fle.s s1, a0, s0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a4, 524288 +; RV32IZHINX-NEXT: lui a3, 524288 ; RV32IZHINX-NEXT: lui a2, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB5_4 ; RV32IZHINX-NEXT: # %bb.3: @@ -888,19 +888,19 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINX-NEXT: .LBB5_4: ; RV32IZHINX-NEXT: lui a1, 389120 ; RV32IZHINX-NEXT: addi a1, a1, -1 -; RV32IZHINX-NEXT: flt.s a3, a1, s0 -; RV32IZHINX-NEXT: beqz a3, .LBB5_6 +; RV32IZHINX-NEXT: flt.s a1, a1, s0 +; RV32IZHINX-NEXT: beqz a1, .LBB5_6 ; RV32IZHINX-NEXT: # %bb.5: -; RV32IZHINX-NEXT: addi a2, a4, -1 +; RV32IZHINX-NEXT: addi a2, a3, -1 ; RV32IZHINX-NEXT: .LBB5_6: -; RV32IZHINX-NEXT: feq.s a1, s0, s0 -; RV32IZHINX-NEXT: neg a4, a1 -; RV32IZHINX-NEXT: and a1, a4, a2 -; RV32IZHINX-NEXT: neg a2, s1 -; RV32IZHINX-NEXT: and a0, a2, a0 -; RV32IZHINX-NEXT: neg a2, a3 -; RV32IZHINX-NEXT: or a0, a2, a0 +; RV32IZHINX-NEXT: feq.s a3, s0, s0 +; RV32IZHINX-NEXT: neg a4, s1 +; RV32IZHINX-NEXT: neg a5, a1 +; RV32IZHINX-NEXT: neg a3, a3 ; RV32IZHINX-NEXT: and a0, a4, a0 +; RV32IZHINX-NEXT: and a1, a3, a2 +; RV32IZHINX-NEXT: or a0, a5, a0 +; RV32IZHINX-NEXT: and a0, a3, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -944,13 +944,13 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: lui a0, 913408 +; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0 ; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixsfdi -; RV32IZFHMIN-NEXT: lui a4, 524288 +; RV32IZFHMIN-NEXT: lui a3, 524288 ; RV32IZFHMIN-NEXT: lui a2, 524288 ; RV32IZFHMIN-NEXT: beqz s0, .LBB5_4 ; RV32IZFHMIN-NEXT: # %bb.3: @@ -958,19 +958,19 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: .LBB5_4: ; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI5_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI5_0)(a1) -; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0 -; 
RV32IZFHMIN-NEXT: beqz a3, .LBB5_6 +; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 +; RV32IZFHMIN-NEXT: beqz a1, .LBB5_6 ; RV32IZFHMIN-NEXT: # %bb.5: -; RV32IZFHMIN-NEXT: addi a2, a4, -1 +; RV32IZFHMIN-NEXT: addi a2, a3, -1 ; RV32IZFHMIN-NEXT: .LBB5_6: -; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0 -; RV32IZFHMIN-NEXT: neg a4, a1 -; RV32IZFHMIN-NEXT: and a1, a4, a2 -; RV32IZFHMIN-NEXT: neg a2, s0 -; RV32IZFHMIN-NEXT: and a0, a2, a0 -; RV32IZFHMIN-NEXT: neg a2, a3 -; RV32IZFHMIN-NEXT: or a0, a2, a0 +; RV32IZFHMIN-NEXT: feq.s a3, fs0, fs0 +; RV32IZFHMIN-NEXT: neg a4, s0 +; RV32IZFHMIN-NEXT: neg a5, a1 +; RV32IZFHMIN-NEXT: neg a3, a3 ; RV32IZFHMIN-NEXT: and a0, a4, a0 +; RV32IZFHMIN-NEXT: and a1, a3, a2 +; RV32IZFHMIN-NEXT: or a0, a5, a0 +; RV32IZFHMIN-NEXT: and a0, a3, a0 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -1021,7 +1021,7 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: fle.s s1, a0, s0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a4, 524288 +; RV32IZHINXMIN-NEXT: lui a3, 524288 ; RV32IZHINXMIN-NEXT: lui a2, 524288 ; RV32IZHINXMIN-NEXT: beqz s1, .LBB5_4 ; RV32IZHINXMIN-NEXT: # %bb.3: @@ -1029,19 +1029,19 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: .LBB5_4: ; RV32IZHINXMIN-NEXT: lui a1, 389120 ; RV32IZHINXMIN-NEXT: addi a1, a1, -1 -; RV32IZHINXMIN-NEXT: flt.s a3, a1, s0 -; RV32IZHINXMIN-NEXT: beqz a3, .LBB5_6 +; RV32IZHINXMIN-NEXT: flt.s a1, a1, s0 +; RV32IZHINXMIN-NEXT: beqz a1, .LBB5_6 ; RV32IZHINXMIN-NEXT: # %bb.5: -; RV32IZHINXMIN-NEXT: addi a2, a4, -1 +; RV32IZHINXMIN-NEXT: addi a2, a3, -1 ; RV32IZHINXMIN-NEXT: .LBB5_6: -; RV32IZHINXMIN-NEXT: feq.s a1, s0, s0 -; RV32IZHINXMIN-NEXT: neg a4, a1 -; RV32IZHINXMIN-NEXT: and a1, a4, a2 -; RV32IZHINXMIN-NEXT: neg a2, s1 -; RV32IZHINXMIN-NEXT: and a0, a2, a0 -; RV32IZHINXMIN-NEXT: neg a2, a3 -; RV32IZHINXMIN-NEXT: or a0, a2, a0 +; RV32IZHINXMIN-NEXT: feq.s a3, s0, s0 +; RV32IZHINXMIN-NEXT: neg a4, s1 +; RV32IZHINXMIN-NEXT: neg a5, a1 +; RV32IZHINXMIN-NEXT: neg a3, a3 ; RV32IZHINXMIN-NEXT: and a0, a4, a0 +; RV32IZHINXMIN-NEXT: and a1, a3, a2 +; RV32IZHINXMIN-NEXT: or a0, a5, a0 +; RV32IZHINXMIN-NEXT: and a0, a3, a0 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1237,10 +1237,10 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; RV32IZFH-NEXT: lui a2, %hi(.LCPI7_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI7_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 -; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: or a1, a2, a1 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1280,11 +1280,11 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: call __fixunssfdi ; RV32IZHINX-NEXT: and a0, s1, a0 ; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 -; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: or a1, a2, a1 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1338,10 +1338,10 @@ define i64 
@test_ceil_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI7_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI7_0)(a2) ; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 -; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: or a1, a2, a1 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1395,11 +1395,11 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: call __fixunssfdi ; RV32IZHINXMIN-NEXT: and a0, s1, a0 ; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 -; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: or a1, a2, a1 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1531,7 +1531,7 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZFH-NEXT: fle.s s0, fa5, fs0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixsfdi -; RV32IZFH-NEXT: lui a4, 524288 +; RV32IZFH-NEXT: lui a3, 524288 ; RV32IZFH-NEXT: lui a2, 524288 ; RV32IZFH-NEXT: beqz s0, .LBB9_4 ; RV32IZFH-NEXT: # %bb.3: @@ -1539,19 +1539,19 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZFH-NEXT: .LBB9_4: ; RV32IZFH-NEXT: lui a1, %hi(.LCPI9_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI9_1)(a1) -; RV32IZFH-NEXT: flt.s a3, fa5, fs0 -; RV32IZFH-NEXT: beqz a3, .LBB9_6 +; RV32IZFH-NEXT: flt.s a1, fa5, fs0 +; RV32IZFH-NEXT: beqz a1, .LBB9_6 ; RV32IZFH-NEXT: # %bb.5: -; RV32IZFH-NEXT: addi a2, a4, -1 +; RV32IZFH-NEXT: addi a2, a3, -1 ; RV32IZFH-NEXT: .LBB9_6: -; RV32IZFH-NEXT: feq.s a1, fs0, fs0 -; RV32IZFH-NEXT: neg a4, a1 -; RV32IZFH-NEXT: and a1, a4, a2 -; RV32IZFH-NEXT: neg a2, s0 -; RV32IZFH-NEXT: and a0, a2, a0 -; RV32IZFH-NEXT: neg a2, a3 -; RV32IZFH-NEXT: or a0, a2, a0 +; RV32IZFH-NEXT: feq.s a3, fs0, fs0 +; RV32IZFH-NEXT: neg a4, s0 +; RV32IZFH-NEXT: neg a5, a1 +; RV32IZFH-NEXT: neg a3, a3 ; RV32IZFH-NEXT: and a0, a4, a0 +; RV32IZFH-NEXT: and a1, a3, a2 +; RV32IZFH-NEXT: or a0, a5, a0 +; RV32IZFH-NEXT: and a0, a3, a0 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -1588,7 +1588,7 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINX-NEXT: fle.s s1, a0, s0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a4, 524288 +; RV32IZHINX-NEXT: lui a3, 524288 ; RV32IZHINX-NEXT: lui a2, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB9_4 ; RV32IZHINX-NEXT: # %bb.3: @@ -1596,19 +1596,19 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINX-NEXT: .LBB9_4: ; RV32IZHINX-NEXT: lui a1, 389120 ; RV32IZHINX-NEXT: addi a1, a1, -1 -; RV32IZHINX-NEXT: flt.s a3, a1, s0 -; RV32IZHINX-NEXT: beqz a3, .LBB9_6 +; RV32IZHINX-NEXT: flt.s a1, a1, s0 +; RV32IZHINX-NEXT: beqz a1, .LBB9_6 ; RV32IZHINX-NEXT: # %bb.5: -; RV32IZHINX-NEXT: addi a2, a4, -1 +; RV32IZHINX-NEXT: addi a2, a3, -1 ; RV32IZHINX-NEXT: .LBB9_6: -; RV32IZHINX-NEXT: feq.s a1, s0, s0 -; RV32IZHINX-NEXT: neg a4, a1 -; RV32IZHINX-NEXT: and a1, a4, a2 -; RV32IZHINX-NEXT: neg a2, s1 -; RV32IZHINX-NEXT: and a0, a2, a0 -; RV32IZHINX-NEXT: neg a2, a3 -; RV32IZHINX-NEXT: or a0, a2, a0 +; RV32IZHINX-NEXT: feq.s a3, s0, s0 +; RV32IZHINX-NEXT: neg a4, s1 +; 
RV32IZHINX-NEXT: neg a5, a1 +; RV32IZHINX-NEXT: neg a3, a3 ; RV32IZHINX-NEXT: and a0, a4, a0 +; RV32IZHINX-NEXT: and a1, a3, a2 +; RV32IZHINX-NEXT: or a0, a5, a0 +; RV32IZHINX-NEXT: and a0, a3, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1652,13 +1652,13 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: lui a0, 913408 +; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0 ; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixsfdi -; RV32IZFHMIN-NEXT: lui a4, 524288 +; RV32IZFHMIN-NEXT: lui a3, 524288 ; RV32IZFHMIN-NEXT: lui a2, 524288 ; RV32IZFHMIN-NEXT: beqz s0, .LBB9_4 ; RV32IZFHMIN-NEXT: # %bb.3: @@ -1666,19 +1666,19 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: .LBB9_4: ; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI9_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI9_0)(a1) -; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0 -; RV32IZFHMIN-NEXT: beqz a3, .LBB9_6 +; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 +; RV32IZFHMIN-NEXT: beqz a1, .LBB9_6 ; RV32IZFHMIN-NEXT: # %bb.5: -; RV32IZFHMIN-NEXT: addi a2, a4, -1 +; RV32IZFHMIN-NEXT: addi a2, a3, -1 ; RV32IZFHMIN-NEXT: .LBB9_6: -; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0 -; RV32IZFHMIN-NEXT: neg a4, a1 -; RV32IZFHMIN-NEXT: and a1, a4, a2 -; RV32IZFHMIN-NEXT: neg a2, s0 -; RV32IZFHMIN-NEXT: and a0, a2, a0 -; RV32IZFHMIN-NEXT: neg a2, a3 -; RV32IZFHMIN-NEXT: or a0, a2, a0 +; RV32IZFHMIN-NEXT: feq.s a3, fs0, fs0 +; RV32IZFHMIN-NEXT: neg a4, s0 +; RV32IZFHMIN-NEXT: neg a5, a1 +; RV32IZFHMIN-NEXT: neg a3, a3 ; RV32IZFHMIN-NEXT: and a0, a4, a0 +; RV32IZFHMIN-NEXT: and a1, a3, a2 +; RV32IZFHMIN-NEXT: or a0, a5, a0 +; RV32IZFHMIN-NEXT: and a0, a3, a0 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -1729,7 +1729,7 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: fle.s s1, a0, s0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a4, 524288 +; RV32IZHINXMIN-NEXT: lui a3, 524288 ; RV32IZHINXMIN-NEXT: lui a2, 524288 ; RV32IZHINXMIN-NEXT: beqz s1, .LBB9_4 ; RV32IZHINXMIN-NEXT: # %bb.3: @@ -1737,19 +1737,19 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: .LBB9_4: ; RV32IZHINXMIN-NEXT: lui a1, 389120 ; RV32IZHINXMIN-NEXT: addi a1, a1, -1 -; RV32IZHINXMIN-NEXT: flt.s a3, a1, s0 -; RV32IZHINXMIN-NEXT: beqz a3, .LBB9_6 +; RV32IZHINXMIN-NEXT: flt.s a1, a1, s0 +; RV32IZHINXMIN-NEXT: beqz a1, .LBB9_6 ; RV32IZHINXMIN-NEXT: # %bb.5: -; RV32IZHINXMIN-NEXT: addi a2, a4, -1 +; RV32IZHINXMIN-NEXT: addi a2, a3, -1 ; RV32IZHINXMIN-NEXT: .LBB9_6: -; RV32IZHINXMIN-NEXT: feq.s a1, s0, s0 -; RV32IZHINXMIN-NEXT: neg a4, a1 -; RV32IZHINXMIN-NEXT: and a1, a4, a2 -; RV32IZHINXMIN-NEXT: neg a2, s1 -; RV32IZHINXMIN-NEXT: and a0, a2, a0 -; RV32IZHINXMIN-NEXT: neg a2, a3 -; RV32IZHINXMIN-NEXT: or a0, a2, a0 +; RV32IZHINXMIN-NEXT: feq.s a3, s0, s0 +; RV32IZHINXMIN-NEXT: neg a4, s1 +; RV32IZHINXMIN-NEXT: neg a5, a1 +; RV32IZHINXMIN-NEXT: neg a3, a3 ; RV32IZHINXMIN-NEXT: and a0, a4, a0 +; RV32IZHINXMIN-NEXT: and a1, a3, a2 +; RV32IZHINXMIN-NEXT: or a0, a5, a0 +; 
RV32IZHINXMIN-NEXT: and a0, a3, a0 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1945,10 +1945,10 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; RV32IZFH-NEXT: lui a2, %hi(.LCPI11_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI11_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 -; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: or a1, a2, a1 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1988,11 +1988,11 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: call __fixunssfdi ; RV32IZHINX-NEXT: and a0, s1, a0 ; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 -; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: or a1, a2, a1 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2046,10 +2046,10 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI11_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI11_0)(a2) ; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 -; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: or a1, a2, a1 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2103,11 +2103,11 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: call __fixunssfdi ; RV32IZHINXMIN-NEXT: and a0, s1, a0 ; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 -; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: or a1, a2, a1 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2239,7 +2239,7 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZFH-NEXT: fle.s s0, fa5, fs0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixsfdi -; RV32IZFH-NEXT: lui a4, 524288 +; RV32IZFH-NEXT: lui a3, 524288 ; RV32IZFH-NEXT: lui a2, 524288 ; RV32IZFH-NEXT: beqz s0, .LBB13_4 ; RV32IZFH-NEXT: # %bb.3: @@ -2247,19 +2247,19 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZFH-NEXT: .LBB13_4: ; RV32IZFH-NEXT: lui a1, %hi(.LCPI13_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI13_1)(a1) -; RV32IZFH-NEXT: flt.s a3, fa5, fs0 -; RV32IZFH-NEXT: beqz a3, .LBB13_6 +; RV32IZFH-NEXT: flt.s a1, fa5, fs0 +; RV32IZFH-NEXT: beqz a1, .LBB13_6 ; RV32IZFH-NEXT: # %bb.5: -; RV32IZFH-NEXT: addi a2, a4, -1 +; RV32IZFH-NEXT: addi a2, a3, -1 ; RV32IZFH-NEXT: .LBB13_6: -; RV32IZFH-NEXT: feq.s a1, fs0, fs0 -; RV32IZFH-NEXT: neg a4, a1 -; RV32IZFH-NEXT: and a1, a4, a2 -; RV32IZFH-NEXT: neg a2, s0 -; RV32IZFH-NEXT: and a0, a2, a0 -; RV32IZFH-NEXT: neg a2, a3 -; RV32IZFH-NEXT: or a0, a2, a0 +; RV32IZFH-NEXT: feq.s a3, fs0, fs0 +; RV32IZFH-NEXT: neg a4, s0 +; RV32IZFH-NEXT: neg a5, a1 +; RV32IZFH-NEXT: neg a3, a3 ; RV32IZFH-NEXT: and a0, a4, a0 +; RV32IZFH-NEXT: and a1, a3, a2 +; RV32IZFH-NEXT: or a0, a5, a0 +; RV32IZFH-NEXT: and a0, 
a3, a0 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -2296,7 +2296,7 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINX-NEXT: fle.s s1, a0, s0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a4, 524288 +; RV32IZHINX-NEXT: lui a3, 524288 ; RV32IZHINX-NEXT: lui a2, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB13_4 ; RV32IZHINX-NEXT: # %bb.3: @@ -2304,19 +2304,19 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINX-NEXT: .LBB13_4: ; RV32IZHINX-NEXT: lui a1, 389120 ; RV32IZHINX-NEXT: addi a1, a1, -1 -; RV32IZHINX-NEXT: flt.s a3, a1, s0 -; RV32IZHINX-NEXT: beqz a3, .LBB13_6 +; RV32IZHINX-NEXT: flt.s a1, a1, s0 +; RV32IZHINX-NEXT: beqz a1, .LBB13_6 ; RV32IZHINX-NEXT: # %bb.5: -; RV32IZHINX-NEXT: addi a2, a4, -1 +; RV32IZHINX-NEXT: addi a2, a3, -1 ; RV32IZHINX-NEXT: .LBB13_6: -; RV32IZHINX-NEXT: feq.s a1, s0, s0 -; RV32IZHINX-NEXT: neg a4, a1 -; RV32IZHINX-NEXT: and a1, a4, a2 -; RV32IZHINX-NEXT: neg a2, s1 -; RV32IZHINX-NEXT: and a0, a2, a0 -; RV32IZHINX-NEXT: neg a2, a3 -; RV32IZHINX-NEXT: or a0, a2, a0 +; RV32IZHINX-NEXT: feq.s a3, s0, s0 +; RV32IZHINX-NEXT: neg a4, s1 +; RV32IZHINX-NEXT: neg a5, a1 +; RV32IZHINX-NEXT: neg a3, a3 ; RV32IZHINX-NEXT: and a0, a4, a0 +; RV32IZHINX-NEXT: and a1, a3, a2 +; RV32IZHINX-NEXT: or a0, a5, a0 +; RV32IZHINX-NEXT: and a0, a3, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -2360,13 +2360,13 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: lui a0, 913408 +; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0 ; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixsfdi -; RV32IZFHMIN-NEXT: lui a4, 524288 +; RV32IZFHMIN-NEXT: lui a3, 524288 ; RV32IZFHMIN-NEXT: lui a2, 524288 ; RV32IZFHMIN-NEXT: beqz s0, .LBB13_4 ; RV32IZFHMIN-NEXT: # %bb.3: @@ -2374,19 +2374,19 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: .LBB13_4: ; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI13_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI13_0)(a1) -; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0 -; RV32IZFHMIN-NEXT: beqz a3, .LBB13_6 +; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 +; RV32IZFHMIN-NEXT: beqz a1, .LBB13_6 ; RV32IZFHMIN-NEXT: # %bb.5: -; RV32IZFHMIN-NEXT: addi a2, a4, -1 +; RV32IZFHMIN-NEXT: addi a2, a3, -1 ; RV32IZFHMIN-NEXT: .LBB13_6: -; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0 -; RV32IZFHMIN-NEXT: neg a4, a1 -; RV32IZFHMIN-NEXT: and a1, a4, a2 -; RV32IZFHMIN-NEXT: neg a2, s0 -; RV32IZFHMIN-NEXT: and a0, a2, a0 -; RV32IZFHMIN-NEXT: neg a2, a3 -; RV32IZFHMIN-NEXT: or a0, a2, a0 +; RV32IZFHMIN-NEXT: feq.s a3, fs0, fs0 +; RV32IZFHMIN-NEXT: neg a4, s0 +; RV32IZFHMIN-NEXT: neg a5, a1 +; RV32IZFHMIN-NEXT: neg a3, a3 ; RV32IZFHMIN-NEXT: and a0, a4, a0 +; RV32IZFHMIN-NEXT: and a1, a3, a2 +; RV32IZFHMIN-NEXT: or a0, a5, a0 +; RV32IZFHMIN-NEXT: and a0, a3, a0 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -2437,7 +2437,7 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: fle.s 
s1, a0, s0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a4, 524288 +; RV32IZHINXMIN-NEXT: lui a3, 524288 ; RV32IZHINXMIN-NEXT: lui a2, 524288 ; RV32IZHINXMIN-NEXT: beqz s1, .LBB13_4 ; RV32IZHINXMIN-NEXT: # %bb.3: @@ -2445,19 +2445,19 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: .LBB13_4: ; RV32IZHINXMIN-NEXT: lui a1, 389120 ; RV32IZHINXMIN-NEXT: addi a1, a1, -1 -; RV32IZHINXMIN-NEXT: flt.s a3, a1, s0 -; RV32IZHINXMIN-NEXT: beqz a3, .LBB13_6 +; RV32IZHINXMIN-NEXT: flt.s a1, a1, s0 +; RV32IZHINXMIN-NEXT: beqz a1, .LBB13_6 ; RV32IZHINXMIN-NEXT: # %bb.5: -; RV32IZHINXMIN-NEXT: addi a2, a4, -1 +; RV32IZHINXMIN-NEXT: addi a2, a3, -1 ; RV32IZHINXMIN-NEXT: .LBB13_6: -; RV32IZHINXMIN-NEXT: feq.s a1, s0, s0 -; RV32IZHINXMIN-NEXT: neg a4, a1 -; RV32IZHINXMIN-NEXT: and a1, a4, a2 -; RV32IZHINXMIN-NEXT: neg a2, s1 -; RV32IZHINXMIN-NEXT: and a0, a2, a0 -; RV32IZHINXMIN-NEXT: neg a2, a3 -; RV32IZHINXMIN-NEXT: or a0, a2, a0 +; RV32IZHINXMIN-NEXT: feq.s a3, s0, s0 +; RV32IZHINXMIN-NEXT: neg a4, s1 +; RV32IZHINXMIN-NEXT: neg a5, a1 +; RV32IZHINXMIN-NEXT: neg a3, a3 ; RV32IZHINXMIN-NEXT: and a0, a4, a0 +; RV32IZHINXMIN-NEXT: and a1, a3, a2 +; RV32IZHINXMIN-NEXT: or a0, a5, a0 +; RV32IZHINXMIN-NEXT: and a0, a3, a0 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -2653,10 +2653,10 @@ define i64 @test_round_ui64(half %x) nounwind { ; RV32IZFH-NEXT: lui a2, %hi(.LCPI15_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI15_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 -; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: or a1, a2, a1 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2696,11 +2696,11 @@ define i64 @test_round_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: call __fixunssfdi ; RV32IZHINX-NEXT: and a0, s1, a0 ; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 -; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: or a1, a2, a1 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2754,10 +2754,10 @@ define i64 @test_round_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI15_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI15_0)(a2) ; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 -; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: or a1, a2, a1 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2811,11 +2811,11 @@ define i64 @test_round_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: call __fixunssfdi ; RV32IZHINXMIN-NEXT: and a0, s1, a0 ; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 -; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: or a1, a2, a1 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte 
Folded Reload @@ -2947,7 +2947,7 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZFH-NEXT: fle.s s0, fa5, fs0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixsfdi -; RV32IZFH-NEXT: lui a4, 524288 +; RV32IZFH-NEXT: lui a3, 524288 ; RV32IZFH-NEXT: lui a2, 524288 ; RV32IZFH-NEXT: beqz s0, .LBB17_4 ; RV32IZFH-NEXT: # %bb.3: @@ -2955,19 +2955,19 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZFH-NEXT: .LBB17_4: ; RV32IZFH-NEXT: lui a1, %hi(.LCPI17_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI17_1)(a1) -; RV32IZFH-NEXT: flt.s a3, fa5, fs0 -; RV32IZFH-NEXT: beqz a3, .LBB17_6 +; RV32IZFH-NEXT: flt.s a1, fa5, fs0 +; RV32IZFH-NEXT: beqz a1, .LBB17_6 ; RV32IZFH-NEXT: # %bb.5: -; RV32IZFH-NEXT: addi a2, a4, -1 +; RV32IZFH-NEXT: addi a2, a3, -1 ; RV32IZFH-NEXT: .LBB17_6: -; RV32IZFH-NEXT: feq.s a1, fs0, fs0 -; RV32IZFH-NEXT: neg a4, a1 -; RV32IZFH-NEXT: and a1, a4, a2 -; RV32IZFH-NEXT: neg a2, s0 -; RV32IZFH-NEXT: and a0, a2, a0 -; RV32IZFH-NEXT: neg a2, a3 -; RV32IZFH-NEXT: or a0, a2, a0 +; RV32IZFH-NEXT: feq.s a3, fs0, fs0 +; RV32IZFH-NEXT: neg a4, s0 +; RV32IZFH-NEXT: neg a5, a1 +; RV32IZFH-NEXT: neg a3, a3 ; RV32IZFH-NEXT: and a0, a4, a0 +; RV32IZFH-NEXT: and a1, a3, a2 +; RV32IZFH-NEXT: or a0, a5, a0 +; RV32IZFH-NEXT: and a0, a3, a0 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -3004,7 +3004,7 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINX-NEXT: fle.s s1, a0, s0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a4, 524288 +; RV32IZHINX-NEXT: lui a3, 524288 ; RV32IZHINX-NEXT: lui a2, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB17_4 ; RV32IZHINX-NEXT: # %bb.3: @@ -3012,19 +3012,19 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINX-NEXT: .LBB17_4: ; RV32IZHINX-NEXT: lui a1, 389120 ; RV32IZHINX-NEXT: addi a1, a1, -1 -; RV32IZHINX-NEXT: flt.s a3, a1, s0 -; RV32IZHINX-NEXT: beqz a3, .LBB17_6 +; RV32IZHINX-NEXT: flt.s a1, a1, s0 +; RV32IZHINX-NEXT: beqz a1, .LBB17_6 ; RV32IZHINX-NEXT: # %bb.5: -; RV32IZHINX-NEXT: addi a2, a4, -1 +; RV32IZHINX-NEXT: addi a2, a3, -1 ; RV32IZHINX-NEXT: .LBB17_6: -; RV32IZHINX-NEXT: feq.s a1, s0, s0 -; RV32IZHINX-NEXT: neg a4, a1 -; RV32IZHINX-NEXT: and a1, a4, a2 -; RV32IZHINX-NEXT: neg a2, s1 -; RV32IZHINX-NEXT: and a0, a2, a0 -; RV32IZHINX-NEXT: neg a2, a3 -; RV32IZHINX-NEXT: or a0, a2, a0 +; RV32IZHINX-NEXT: feq.s a3, s0, s0 +; RV32IZHINX-NEXT: neg a4, s1 +; RV32IZHINX-NEXT: neg a5, a1 +; RV32IZHINX-NEXT: neg a3, a3 ; RV32IZHINX-NEXT: and a0, a4, a0 +; RV32IZHINX-NEXT: and a1, a3, a2 +; RV32IZHINX-NEXT: or a0, a5, a0 +; RV32IZHINX-NEXT: and a0, a3, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -3068,13 +3068,13 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: lui a0, 913408 +; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0 ; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixsfdi -; RV32IZFHMIN-NEXT: lui a4, 524288 +; RV32IZFHMIN-NEXT: lui a3, 524288 ; RV32IZFHMIN-NEXT: lui a2, 524288 ; RV32IZFHMIN-NEXT: beqz s0, .LBB17_4 ; RV32IZFHMIN-NEXT: # 
%bb.3: @@ -3082,19 +3082,19 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: .LBB17_4: ; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI17_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI17_0)(a1) -; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0 -; RV32IZFHMIN-NEXT: beqz a3, .LBB17_6 +; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 +; RV32IZFHMIN-NEXT: beqz a1, .LBB17_6 ; RV32IZFHMIN-NEXT: # %bb.5: -; RV32IZFHMIN-NEXT: addi a2, a4, -1 +; RV32IZFHMIN-NEXT: addi a2, a3, -1 ; RV32IZFHMIN-NEXT: .LBB17_6: -; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0 -; RV32IZFHMIN-NEXT: neg a4, a1 -; RV32IZFHMIN-NEXT: and a1, a4, a2 -; RV32IZFHMIN-NEXT: neg a2, s0 -; RV32IZFHMIN-NEXT: and a0, a2, a0 -; RV32IZFHMIN-NEXT: neg a2, a3 -; RV32IZFHMIN-NEXT: or a0, a2, a0 +; RV32IZFHMIN-NEXT: feq.s a3, fs0, fs0 +; RV32IZFHMIN-NEXT: neg a4, s0 +; RV32IZFHMIN-NEXT: neg a5, a1 +; RV32IZFHMIN-NEXT: neg a3, a3 ; RV32IZFHMIN-NEXT: and a0, a4, a0 +; RV32IZFHMIN-NEXT: and a1, a3, a2 +; RV32IZFHMIN-NEXT: or a0, a5, a0 +; RV32IZFHMIN-NEXT: and a0, a3, a0 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -3145,7 +3145,7 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: fle.s s1, a0, s0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a4, 524288 +; RV32IZHINXMIN-NEXT: lui a3, 524288 ; RV32IZHINXMIN-NEXT: lui a2, 524288 ; RV32IZHINXMIN-NEXT: beqz s1, .LBB17_4 ; RV32IZHINXMIN-NEXT: # %bb.3: @@ -3153,19 +3153,19 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: .LBB17_4: ; RV32IZHINXMIN-NEXT: lui a1, 389120 ; RV32IZHINXMIN-NEXT: addi a1, a1, -1 -; RV32IZHINXMIN-NEXT: flt.s a3, a1, s0 -; RV32IZHINXMIN-NEXT: beqz a3, .LBB17_6 +; RV32IZHINXMIN-NEXT: flt.s a1, a1, s0 +; RV32IZHINXMIN-NEXT: beqz a1, .LBB17_6 ; RV32IZHINXMIN-NEXT: # %bb.5: -; RV32IZHINXMIN-NEXT: addi a2, a4, -1 +; RV32IZHINXMIN-NEXT: addi a2, a3, -1 ; RV32IZHINXMIN-NEXT: .LBB17_6: -; RV32IZHINXMIN-NEXT: feq.s a1, s0, s0 -; RV32IZHINXMIN-NEXT: neg a4, a1 -; RV32IZHINXMIN-NEXT: and a1, a4, a2 -; RV32IZHINXMIN-NEXT: neg a2, s1 -; RV32IZHINXMIN-NEXT: and a0, a2, a0 -; RV32IZHINXMIN-NEXT: neg a2, a3 -; RV32IZHINXMIN-NEXT: or a0, a2, a0 +; RV32IZHINXMIN-NEXT: feq.s a3, s0, s0 +; RV32IZHINXMIN-NEXT: neg a4, s1 +; RV32IZHINXMIN-NEXT: neg a5, a1 +; RV32IZHINXMIN-NEXT: neg a3, a3 ; RV32IZHINXMIN-NEXT: and a0, a4, a0 +; RV32IZHINXMIN-NEXT: and a1, a3, a2 +; RV32IZHINXMIN-NEXT: or a0, a5, a0 +; RV32IZHINXMIN-NEXT: and a0, a3, a0 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -3361,10 +3361,10 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; RV32IZFH-NEXT: lui a2, %hi(.LCPI19_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI19_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 -; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: or a1, a2, a1 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -3404,11 +3404,11 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: call __fixunssfdi ; RV32IZHINX-NEXT: and a0, s1, a0 ; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; 
RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 -; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: or a1, a2, a1 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -3462,10 +3462,10 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI19_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI19_0)(a2) ; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 -; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: or a1, a2, a1 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -3519,11 +3519,11 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: call __fixunssfdi ; RV32IZHINXMIN-NEXT: and a0, s1, a0 ; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 -; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: or a1, a2, a1 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -3655,7 +3655,7 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZFH-NEXT: fle.s s0, fa5, fs0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixsfdi -; RV32IZFH-NEXT: lui a4, 524288 +; RV32IZFH-NEXT: lui a3, 524288 ; RV32IZFH-NEXT: lui a2, 524288 ; RV32IZFH-NEXT: beqz s0, .LBB21_4 ; RV32IZFH-NEXT: # %bb.3: @@ -3663,19 +3663,19 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZFH-NEXT: .LBB21_4: ; RV32IZFH-NEXT: lui a1, %hi(.LCPI21_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI21_1)(a1) -; RV32IZFH-NEXT: flt.s a3, fa5, fs0 -; RV32IZFH-NEXT: beqz a3, .LBB21_6 +; RV32IZFH-NEXT: flt.s a1, fa5, fs0 +; RV32IZFH-NEXT: beqz a1, .LBB21_6 ; RV32IZFH-NEXT: # %bb.5: -; RV32IZFH-NEXT: addi a2, a4, -1 +; RV32IZFH-NEXT: addi a2, a3, -1 ; RV32IZFH-NEXT: .LBB21_6: -; RV32IZFH-NEXT: feq.s a1, fs0, fs0 -; RV32IZFH-NEXT: neg a4, a1 -; RV32IZFH-NEXT: and a1, a4, a2 -; RV32IZFH-NEXT: neg a2, s0 -; RV32IZFH-NEXT: and a0, a2, a0 -; RV32IZFH-NEXT: neg a2, a3 -; RV32IZFH-NEXT: or a0, a2, a0 +; RV32IZFH-NEXT: feq.s a3, fs0, fs0 +; RV32IZFH-NEXT: neg a4, s0 +; RV32IZFH-NEXT: neg a5, a1 +; RV32IZFH-NEXT: neg a3, a3 ; RV32IZFH-NEXT: and a0, a4, a0 +; RV32IZFH-NEXT: and a1, a3, a2 +; RV32IZFH-NEXT: or a0, a5, a0 +; RV32IZFH-NEXT: and a0, a3, a0 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -3712,7 +3712,7 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZHINX-NEXT: fle.s s1, a0, s0 ; RV32IZHINX-NEXT: mv a0, s0 ; RV32IZHINX-NEXT: call __fixsfdi -; RV32IZHINX-NEXT: lui a4, 524288 +; RV32IZHINX-NEXT: lui a3, 524288 ; RV32IZHINX-NEXT: lui a2, 524288 ; RV32IZHINX-NEXT: beqz s1, .LBB21_4 ; RV32IZHINX-NEXT: # %bb.3: @@ -3720,19 +3720,19 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZHINX-NEXT: .LBB21_4: ; RV32IZHINX-NEXT: lui a1, 389120 ; RV32IZHINX-NEXT: addi a1, a1, -1 -; RV32IZHINX-NEXT: flt.s a3, a1, s0 -; RV32IZHINX-NEXT: beqz a3, .LBB21_6 +; RV32IZHINX-NEXT: flt.s a1, a1, s0 +; RV32IZHINX-NEXT: beqz a1, .LBB21_6 ; RV32IZHINX-NEXT: # %bb.5: -; RV32IZHINX-NEXT: addi a2, a4, -1 +; RV32IZHINX-NEXT: addi a2, a3, -1 ; RV32IZHINX-NEXT: .LBB21_6: -; RV32IZHINX-NEXT: 
feq.s a1, s0, s0 -; RV32IZHINX-NEXT: neg a4, a1 -; RV32IZHINX-NEXT: and a1, a4, a2 -; RV32IZHINX-NEXT: neg a2, s1 -; RV32IZHINX-NEXT: and a0, a2, a0 -; RV32IZHINX-NEXT: neg a2, a3 -; RV32IZHINX-NEXT: or a0, a2, a0 +; RV32IZHINX-NEXT: feq.s a3, s0, s0 +; RV32IZHINX-NEXT: neg a4, s1 +; RV32IZHINX-NEXT: neg a5, a1 +; RV32IZHINX-NEXT: neg a3, a3 ; RV32IZHINX-NEXT: and a0, a4, a0 +; RV32IZHINX-NEXT: and a1, a3, a2 +; RV32IZHINX-NEXT: or a0, a5, a0 +; RV32IZHINX-NEXT: and a0, a3, a0 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -3776,13 +3776,13 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: lui a0, 913408 +; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5 ; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0 ; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixsfdi -; RV32IZFHMIN-NEXT: lui a4, 524288 +; RV32IZFHMIN-NEXT: lui a3, 524288 ; RV32IZFHMIN-NEXT: lui a2, 524288 ; RV32IZFHMIN-NEXT: beqz s0, .LBB21_4 ; RV32IZFHMIN-NEXT: # %bb.3: @@ -3790,19 +3790,19 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: .LBB21_4: ; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI21_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI21_0)(a1) -; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0 -; RV32IZFHMIN-NEXT: beqz a3, .LBB21_6 +; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 +; RV32IZFHMIN-NEXT: beqz a1, .LBB21_6 ; RV32IZFHMIN-NEXT: # %bb.5: -; RV32IZFHMIN-NEXT: addi a2, a4, -1 +; RV32IZFHMIN-NEXT: addi a2, a3, -1 ; RV32IZFHMIN-NEXT: .LBB21_6: -; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0 -; RV32IZFHMIN-NEXT: neg a4, a1 -; RV32IZFHMIN-NEXT: and a1, a4, a2 -; RV32IZFHMIN-NEXT: neg a2, s0 -; RV32IZFHMIN-NEXT: and a0, a2, a0 -; RV32IZFHMIN-NEXT: neg a2, a3 -; RV32IZFHMIN-NEXT: or a0, a2, a0 +; RV32IZFHMIN-NEXT: feq.s a3, fs0, fs0 +; RV32IZFHMIN-NEXT: neg a4, s0 +; RV32IZFHMIN-NEXT: neg a5, a1 +; RV32IZFHMIN-NEXT: neg a3, a3 ; RV32IZFHMIN-NEXT: and a0, a4, a0 +; RV32IZFHMIN-NEXT: and a1, a3, a2 +; RV32IZFHMIN-NEXT: or a0, a5, a0 +; RV32IZFHMIN-NEXT: and a0, a3, a0 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload @@ -3853,7 +3853,7 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: fle.s s1, a0, s0 ; RV32IZHINXMIN-NEXT: mv a0, s0 ; RV32IZHINXMIN-NEXT: call __fixsfdi -; RV32IZHINXMIN-NEXT: lui a4, 524288 +; RV32IZHINXMIN-NEXT: lui a3, 524288 ; RV32IZHINXMIN-NEXT: lui a2, 524288 ; RV32IZHINXMIN-NEXT: beqz s1, .LBB21_4 ; RV32IZHINXMIN-NEXT: # %bb.3: @@ -3861,19 +3861,19 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: .LBB21_4: ; RV32IZHINXMIN-NEXT: lui a1, 389120 ; RV32IZHINXMIN-NEXT: addi a1, a1, -1 -; RV32IZHINXMIN-NEXT: flt.s a3, a1, s0 -; RV32IZHINXMIN-NEXT: beqz a3, .LBB21_6 +; RV32IZHINXMIN-NEXT: flt.s a1, a1, s0 +; RV32IZHINXMIN-NEXT: beqz a1, .LBB21_6 ; RV32IZHINXMIN-NEXT: # %bb.5: -; RV32IZHINXMIN-NEXT: addi a2, a4, -1 +; RV32IZHINXMIN-NEXT: addi a2, a3, -1 ; RV32IZHINXMIN-NEXT: .LBB21_6: -; RV32IZHINXMIN-NEXT: feq.s a1, s0, s0 -; RV32IZHINXMIN-NEXT: neg a4, a1 -; RV32IZHINXMIN-NEXT: and a1, a4, a2 -; RV32IZHINXMIN-NEXT: neg a2, s1 -; RV32IZHINXMIN-NEXT: and a0, a2, a0 -; RV32IZHINXMIN-NEXT: neg a2, a3 -; 
RV32IZHINXMIN-NEXT: or a0, a2, a0 +; RV32IZHINXMIN-NEXT: feq.s a3, s0, s0 +; RV32IZHINXMIN-NEXT: neg a4, s1 +; RV32IZHINXMIN-NEXT: neg a5, a1 +; RV32IZHINXMIN-NEXT: neg a3, a3 ; RV32IZHINXMIN-NEXT: and a0, a4, a0 +; RV32IZHINXMIN-NEXT: and a1, a3, a2 +; RV32IZHINXMIN-NEXT: or a0, a5, a0 +; RV32IZHINXMIN-NEXT: and a0, a3, a0 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -4069,10 +4069,10 @@ define i64 @test_rint_ui64(half %x) nounwind { ; RV32IZFH-NEXT: lui a2, %hi(.LCPI23_1) ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI23_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 -; RV32IZFH-NEXT: and a1, s0, a1 ; RV32IZFH-NEXT: or a1, a2, a1 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -4112,11 +4112,11 @@ define i64 @test_rint_ui64(half %x) nounwind { ; RV32IZHINX-NEXT: call __fixunssfdi ; RV32IZHINX-NEXT: and a0, s1, a0 ; RV32IZHINX-NEXT: lui a2, 391168 +; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: addi a2, a2, -1 ; RV32IZHINX-NEXT: flt.s a2, a2, s0 ; RV32IZHINX-NEXT: neg a2, a2 ; RV32IZHINX-NEXT: or a0, a2, a0 -; RV32IZHINX-NEXT: and a1, s1, a1 ; RV32IZHINX-NEXT: or a1, a2, a1 ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -4170,10 +4170,10 @@ define i64 @test_rint_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI23_0) ; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI23_0)(a2) ; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 -; RV32IZFHMIN-NEXT: and a1, s0, a1 ; RV32IZFHMIN-NEXT: or a1, a2, a1 ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -4227,11 +4227,11 @@ define i64 @test_rint_ui64(half %x) nounwind { ; RV32IZHINXMIN-NEXT: call __fixunssfdi ; RV32IZHINXMIN-NEXT: and a0, s1, a0 ; RV32IZHINXMIN-NEXT: lui a2, 391168 +; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: addi a2, a2, -1 ; RV32IZHINXMIN-NEXT: flt.s a2, a2, s0 ; RV32IZHINXMIN-NEXT: neg a2, a2 ; RV32IZHINXMIN-NEXT: or a0, a2, a0 -; RV32IZHINXMIN-NEXT: and a1, s1, a1 ; RV32IZHINXMIN-NEXT: or a1, a2, a1 ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll index b793c500fc397..d92dcb9eac4c6 100644 --- a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll @@ -358,8 +358,8 @@ define half @select_fcmp_ord(half %a, half %b) nounwind { ; CHECKIZHINXMIN-LABEL: select_fcmp_ord: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a1 -; CHECKIZHINXMIN-NEXT: feq.s a2, a2, a2 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a3, a0 +; CHECKIZHINXMIN-NEXT: feq.s a2, a2, a2 ; CHECKIZHINXMIN-NEXT: feq.s a3, a3, a3 ; CHECKIZHINXMIN-NEXT: and a2, a3, a2 ; CHECKIZHINXMIN-NEXT: bnez a2, .LBB7_2 @@ -689,8 +689,8 @@ define half @select_fcmp_uno(half %a, half %b) nounwind { ; CHECKIZHINXMIN-LABEL: select_fcmp_uno: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a1 -; CHECKIZHINXMIN-NEXT: feq.s a2, a2, a2 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a3, a0 +; CHECKIZHINXMIN-NEXT: 
feq.s a2, a2, a2 ; CHECKIZHINXMIN-NEXT: feq.s a3, a3, a3 ; CHECKIZHINXMIN-NEXT: and a2, a3, a2 ; CHECKIZHINXMIN-NEXT: beqz a2, .LBB14_2 diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll index a0c85ab4dca7f..66cde323ce507 100644 --- a/llvm/test/CodeGen/RISCV/iabs.ll +++ b/llvm/test/CodeGen/RISCV/iabs.ll @@ -301,58 +301,58 @@ define i64 @select_abs64(i64 %x) { define i128 @abs128(i128 %x) { ; RV32I-LABEL: abs128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a2, 12(a1) -; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a3, 12(a1) +; RV32I-NEXT: lw a2, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: bgez a2, .LBB8_2 +; RV32I-NEXT: bgez a3, .LBB8_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: neg a5, a1 ; RV32I-NEXT: snez a6, a4 -; RV32I-NEXT: snez a7, a3 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: sltu t0, a5, a6 +; RV32I-NEXT: snez a7, a2 ; RV32I-NEXT: snez a1, a1 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a2, a1, t0 -; RV32I-NEXT: sub a1, a5, a6 ; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: add a1, a3, a1 ; RV32I-NEXT: sub a4, a4, a7 -; RV32I-NEXT: neg a3, a3 +; RV32I-NEXT: sltu a3, a5, a6 +; RV32I-NEXT: neg a7, a1 +; RV32I-NEXT: sub a1, a5, a6 +; RV32I-NEXT: sub a3, a7, a3 +; RV32I-NEXT: neg a2, a2 ; RV32I-NEXT: .LBB8_2: -; RV32I-NEXT: sw a3, 0(a0) +; RV32I-NEXT: sw a2, 0(a0) ; RV32I-NEXT: sw a4, 4(a0) ; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: abs128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a2, 12(a1) -; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a3, 12(a1) +; RV32ZBB-NEXT: lw a2, 0(a1) ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a1, 8(a1) -; RV32ZBB-NEXT: bgez a2, .LBB8_2 +; RV32ZBB-NEXT: bgez a3, .LBB8_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: neg a5, a1 ; RV32ZBB-NEXT: snez a6, a4 -; RV32ZBB-NEXT: snez a7, a3 -; RV32ZBB-NEXT: or a6, a7, a6 -; RV32ZBB-NEXT: sltu t0, a5, a6 +; RV32ZBB-NEXT: snez a7, a2 ; RV32ZBB-NEXT: snez a1, a1 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a2, a1, t0 -; RV32ZBB-NEXT: sub a1, a5, a6 ; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: or a6, a7, a6 +; RV32ZBB-NEXT: add a1, a3, a1 ; RV32ZBB-NEXT: sub a4, a4, a7 -; RV32ZBB-NEXT: neg a3, a3 +; RV32ZBB-NEXT: sltu a3, a5, a6 +; RV32ZBB-NEXT: neg a7, a1 +; RV32ZBB-NEXT: sub a1, a5, a6 +; RV32ZBB-NEXT: sub a3, a7, a3 +; RV32ZBB-NEXT: neg a2, a2 ; RV32ZBB-NEXT: .LBB8_2: -; RV32ZBB-NEXT: sw a3, 0(a0) +; RV32ZBB-NEXT: sw a2, 0(a0) ; RV32ZBB-NEXT: sw a4, 4(a0) ; RV32ZBB-NEXT: sw a1, 8(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: sw a3, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: abs128: @@ -383,58 +383,58 @@ define i128 @abs128(i128 %x) { define i128 @select_abs128(i128 %x) { ; RV32I-LABEL: select_abs128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a2, 12(a1) -; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a3, 12(a1) +; RV32I-NEXT: lw a2, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: bgez a2, .LBB9_2 +; RV32I-NEXT: bgez a3, .LBB9_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: neg a5, a1 ; RV32I-NEXT: snez a6, a4 -; RV32I-NEXT: snez a7, a3 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: sltu t0, a5, a6 +; RV32I-NEXT: snez a7, a2 ; RV32I-NEXT: snez a1, a1 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a2, a1, t0 -; RV32I-NEXT: sub a1, a5, a6 ; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: add a1, a3, a1 ; RV32I-NEXT: sub a4, a4, a7 -; RV32I-NEXT: neg a3, 
a3 +; RV32I-NEXT: sltu a3, a5, a6 +; RV32I-NEXT: neg a7, a1 +; RV32I-NEXT: sub a1, a5, a6 +; RV32I-NEXT: sub a3, a7, a3 +; RV32I-NEXT: neg a2, a2 ; RV32I-NEXT: .LBB9_2: -; RV32I-NEXT: sw a3, 0(a0) +; RV32I-NEXT: sw a2, 0(a0) ; RV32I-NEXT: sw a4, 4(a0) ; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: select_abs128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a2, 12(a1) -; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a3, 12(a1) +; RV32ZBB-NEXT: lw a2, 0(a1) ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a1, 8(a1) -; RV32ZBB-NEXT: bgez a2, .LBB9_2 +; RV32ZBB-NEXT: bgez a3, .LBB9_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: neg a5, a1 ; RV32ZBB-NEXT: snez a6, a4 -; RV32ZBB-NEXT: snez a7, a3 -; RV32ZBB-NEXT: or a6, a7, a6 -; RV32ZBB-NEXT: sltu t0, a5, a6 +; RV32ZBB-NEXT: snez a7, a2 ; RV32ZBB-NEXT: snez a1, a1 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a2, a1, t0 -; RV32ZBB-NEXT: sub a1, a5, a6 ; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: or a6, a7, a6 +; RV32ZBB-NEXT: add a1, a3, a1 ; RV32ZBB-NEXT: sub a4, a4, a7 -; RV32ZBB-NEXT: neg a3, a3 +; RV32ZBB-NEXT: sltu a3, a5, a6 +; RV32ZBB-NEXT: neg a7, a1 +; RV32ZBB-NEXT: sub a1, a5, a6 +; RV32ZBB-NEXT: sub a3, a7, a3 +; RV32ZBB-NEXT: neg a2, a2 ; RV32ZBB-NEXT: .LBB9_2: -; RV32ZBB-NEXT: sw a3, 0(a0) +; RV32ZBB-NEXT: sw a2, 0(a0) ; RV32ZBB-NEXT: sw a4, 4(a0) ; RV32ZBB-NEXT: sw a1, 8(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: sw a3, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: select_abs128: diff --git a/llvm/test/CodeGen/RISCV/imm.ll b/llvm/test/CodeGen/RISCV/imm.ll index 70bcb066fe4f0..830f381b659d1 100644 --- a/llvm/test/CodeGen/RISCV/imm.ll +++ b/llvm/test/CodeGen/RISCV/imm.ll @@ -888,8 +888,8 @@ define i64 @imm64_8() nounwind { ; RV32I-LABEL: imm64_8: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, 633806 -; RV32I-NEXT: addi a0, a0, -272 ; RV32I-NEXT: lui a1, 74565 +; RV32I-NEXT: addi a0, a0, -272 ; RV32I-NEXT: addi a1, a1, 1656 ; RV32I-NEXT: ret ; @@ -1190,8 +1190,8 @@ define i64 @imm_right_shifted_lui_1() nounwind { ; RV32I-LABEL: imm_right_shifted_lui_1: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, 1048575 -; RV32I-NEXT: addi a0, a0, 1 ; RV32I-NEXT: lui a1, 16 +; RV32I-NEXT: addi a0, a0, 1 ; RV32I-NEXT: addi a1, a1, -1 ; RV32I-NEXT: ret ; @@ -1427,8 +1427,8 @@ define i64 @imm_end_2addi_1() nounwind { ; RV32I-LABEL: imm_end_2addi_1: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, 1048575 -; RV32I-NEXT: addi a0, a0, 2047 ; RV32I-NEXT: lui a1, 1048512 +; RV32I-NEXT: addi a0, a0, 2047 ; RV32I-NEXT: addi a1, a1, 127 ; RV32I-NEXT: ret ; @@ -2446,8 +2446,8 @@ define i64 @imm_neg_8798043653189() { ; RV32I-LABEL: imm_neg_8798043653189: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, 572348 -; RV32I-NEXT: addi a0, a0, -1093 ; RV32I-NEXT: lui a1, 1048575 +; RV32I-NEXT: addi a0, a0, -1093 ; RV32I-NEXT: addi a1, a1, 2047 ; RV32I-NEXT: ret ; @@ -2512,8 +2512,8 @@ define i64 @imm_9223372034904144827() { ; RV32I-LABEL: imm_9223372034904144827: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, 572348 -; RV32I-NEXT: addi a0, a0, -1093 ; RV32I-NEXT: lui a1, 524288 +; RV32I-NEXT: addi a0, a0, -1093 ; RV32I-NEXT: addi a1, a1, -1 ; RV32I-NEXT: ret ; @@ -2578,8 +2578,8 @@ define i64 @imm_neg_9223354442718100411() { ; RV32I-LABEL: imm_neg_9223354442718100411: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, 572348 -; RV32I-NEXT: addi a0, a0, -1093 ; RV32I-NEXT: lui a1, 524287 +; RV32I-NEXT: addi a0, a0, -1093 ; RV32I-NEXT: addi a1, a1, -1 ; RV32I-NEXT: ret ; @@ -2895,8 +2895,8 @@ define i64 
@imm_12900924131259() { ; RV32I-LABEL: imm_12900924131259: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, 765952 -; RV32I-NEXT: addi a0, a0, 1979 ; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a0, a0, 1979 ; RV32I-NEXT: addi a1, a1, -1093 ; RV32I-NEXT: ret ; @@ -3017,8 +3017,8 @@ define i64 @imm_12900936431479() { ; RV32I-LABEL: imm_12900936431479: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, 768955 -; RV32I-NEXT: addi a0, a0, 1911 ; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a0, a0, 1911 ; RV32I-NEXT: addi a1, a1, -1093 ; RV32I-NEXT: ret ; @@ -3089,8 +3089,8 @@ define i64 @imm_12900918536874() { ; RV32I-LABEL: imm_12900918536874: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, 764587 -; RV32I-NEXT: addi a0, a0, -1366 ; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a0, a0, -1366 ; RV32I-NEXT: addi a1, a1, -1093 ; RV32I-NEXT: ret ; @@ -3161,8 +3161,8 @@ define i64 @imm_12900925247761() { ; RV32I-LABEL: imm_12900925247761: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, 766225 -; RV32I-NEXT: addi a0, a0, 273 ; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a0, a0, 273 ; RV32I-NEXT: addi a1, a1, -1093 ; RV32I-NEXT: ret ; @@ -4165,8 +4165,8 @@ define i64 @imm64_0xFF7FFFFF7FFFFFFE() { ; RV32I-LABEL: imm64_0xFF7FFFFF7FFFFFFE: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, 524288 -; RV32I-NEXT: addi a0, a0, -1 ; RV32I-NEXT: lui a1, 1046528 +; RV32I-NEXT: addi a0, a0, -1 ; RV32I-NEXT: addi a1, a1, -1 ; RV32I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll index b1afdded62d69..d58e6fe7675da 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll @@ -12,8 +12,8 @@ define double @constraint_f_double(double %a) nounwind { ; RV32F-NEXT: addi sp, sp, -16 ; RV32F-NEXT: sw a0, 8(sp) ; RV32F-NEXT: sw a1, 12(sp) -; RV32F-NEXT: fld fa5, 8(sp) ; RV32F-NEXT: lui a0, %hi(gd) +; RV32F-NEXT: fld fa5, 8(sp) ; RV32F-NEXT: fld fa4, %lo(gd)(a0) ; RV32F-NEXT: #APP ; RV32F-NEXT: fadd.d fa5, fa5, fa4 @@ -45,8 +45,8 @@ define double @constraint_cf_double(double %a) nounwind { ; RV32F-NEXT: addi sp, sp, -16 ; RV32F-NEXT: sw a0, 8(sp) ; RV32F-NEXT: sw a1, 12(sp) -; RV32F-NEXT: fld fa5, 8(sp) ; RV32F-NEXT: lui a0, %hi(gd) +; RV32F-NEXT: fld fa5, 8(sp) ; RV32F-NEXT: fld fa4, %lo(gd)(a0) ; RV32F-NEXT: #APP ; RV32F-NEXT: fadd.d fa5, fa5, fa4 @@ -78,8 +78,8 @@ define double @constraint_f_double_abi_name(double %a) nounwind { ; RV32F-NEXT: addi sp, sp, -16 ; RV32F-NEXT: sw a0, 8(sp) ; RV32F-NEXT: sw a1, 12(sp) -; RV32F-NEXT: fld fa1, 8(sp) ; RV32F-NEXT: lui a0, %hi(gd) +; RV32F-NEXT: fld fa1, 8(sp) ; RV32F-NEXT: fld fs0, %lo(gd)(a0) ; RV32F-NEXT: #APP ; RV32F-NEXT: fadd.d ft0, fa1, fs0 diff --git a/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll index 581cf8e3bf3c9..238a0fa0b6fd7 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll @@ -15,8 +15,8 @@ define double @constraint_f_double(double %a) nounwind { ; RV32F-NEXT: addi sp, sp, -16 ; RV32F-NEXT: sw a0, 8(sp) ; RV32F-NEXT: sw a1, 12(sp) -; RV32F-NEXT: fld fa5, 8(sp) ; RV32F-NEXT: lui a0, %hi(gd) +; RV32F-NEXT: fld fa5, 8(sp) ; RV32F-NEXT: fld fa4, %lo(gd)(a0) ; RV32F-NEXT: #APP ; RV32F-NEXT: .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20) @@ -48,8 +48,8 @@ define double @constraint_cf_double(double %a) nounwind { ; RV32F-NEXT: addi sp, sp, -16 ; RV32F-NEXT: sw a0, 8(sp) ; RV32F-NEXT: sw a1, 12(sp) -; RV32F-NEXT: fld fa5, 8(sp) ; 
RV32F-NEXT: lui a0, %hi(gd) +; RV32F-NEXT: fld fa5, 8(sp) ; RV32F-NEXT: fld fa4, %lo(gd)(a0) ; RV32F-NEXT: #APP ; RV32F-NEXT: .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20) @@ -81,8 +81,8 @@ define double @constraint_f_double_abi_name(double %a) nounwind { ; RV32F-NEXT: addi sp, sp, -16 ; RV32F-NEXT: sw a0, 8(sp) ; RV32F-NEXT: sw a1, 12(sp) -; RV32F-NEXT: fld fa1, 8(sp) ; RV32F-NEXT: lui a0, %hi(gd) +; RV32F-NEXT: fld fa1, 8(sp) ; RV32F-NEXT: fld fs0, %lo(gd)(a0) ; RV32F-NEXT: #APP ; RV32F-NEXT: .insn 0x4, 0x02000053 | (0 << 7) | (11 << 15) | (8 << 20) diff --git a/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll b/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll index fa6ac96b57b1e..973eb9f41f4fe 100644 --- a/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll +++ b/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll @@ -29,8 +29,8 @@ define void @foo_i32() nounwind #0 { ; CHECK-RV32-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: sw a1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, %hi(a) -; CHECK-RV32-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32-NEXT: lui a1, %hi(b) +; CHECK-RV32-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32-NEXT: lw a1, %lo(b)(a1) ; CHECK-RV32-NEXT: add a0, a1, a0 ; CHECK-RV32-NEXT: lui a1, %hi(c) @@ -46,8 +46,8 @@ define void @foo_i32() nounwind #0 { ; CHECK-RV32IF-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; CHECK-RV32IF-NEXT: sw a1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32IF-NEXT: lui a0, %hi(a) -; CHECK-RV32IF-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32IF-NEXT: lui a1, %hi(b) +; CHECK-RV32IF-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32IF-NEXT: lw a1, %lo(b)(a1) ; CHECK-RV32IF-NEXT: add a0, a1, a0 ; CHECK-RV32IF-NEXT: lui a1, %hi(c) @@ -63,8 +63,8 @@ define void @foo_i32() nounwind #0 { ; CHECK-RV32IFD-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; CHECK-RV32IFD-NEXT: sw a1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32IFD-NEXT: lui a0, %hi(a) -; CHECK-RV32IFD-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32IFD-NEXT: lui a1, %hi(b) +; CHECK-RV32IFD-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32IFD-NEXT: lw a1, %lo(b)(a1) ; CHECK-RV32IFD-NEXT: add a0, a1, a0 ; CHECK-RV32IFD-NEXT: lui a1, %hi(c) @@ -94,8 +94,8 @@ define void @foo_fp_i32() nounwind #1 { ; CHECK-RV32-NEXT: sw a1, 0(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: addi s0, sp, 16 ; CHECK-RV32-NEXT: lui a0, %hi(a) -; CHECK-RV32-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32-NEXT: lui a1, %hi(b) +; CHECK-RV32-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32-NEXT: lw a1, %lo(b)(a1) ; CHECK-RV32-NEXT: add a0, a1, a0 ; CHECK-RV32-NEXT: lui a1, %hi(c) @@ -116,8 +116,8 @@ define void @foo_fp_i32() nounwind #1 { ; CHECK-RV32IF-NEXT: sw a1, 0(sp) # 4-byte Folded Spill ; CHECK-RV32IF-NEXT: addi s0, sp, 16 ; CHECK-RV32IF-NEXT: lui a0, %hi(a) -; CHECK-RV32IF-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32IF-NEXT: lui a1, %hi(b) +; CHECK-RV32IF-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32IF-NEXT: lw a1, %lo(b)(a1) ; CHECK-RV32IF-NEXT: add a0, a1, a0 ; CHECK-RV32IF-NEXT: lui a1, %hi(c) @@ -138,8 +138,8 @@ define void @foo_fp_i32() nounwind #1 { ; CHECK-RV32IFD-NEXT: sw a1, 0(sp) # 4-byte Folded Spill ; CHECK-RV32IFD-NEXT: addi s0, sp, 16 ; CHECK-RV32IFD-NEXT: lui a0, %hi(a) -; CHECK-RV32IFD-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32IFD-NEXT: lui a1, %hi(b) +; CHECK-RV32IFD-NEXT: lw a0, %lo(a)(a0) ; CHECK-RV32IFD-NEXT: lw a1, %lo(b)(a1) ; CHECK-RV32IFD-NEXT: add a0, a1, a0 ; CHECK-RV32IFD-NEXT: lui a1, %hi(c) @@ -182,8 +182,8 @@ define void @foo_float() nounwind #0 { ; CHECK-RV32-NEXT: sw t5, 4(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: sw t6, 0(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 
%hi(e) -; CHECK-RV32-NEXT: lw a0, %lo(e)(a0) ; CHECK-RV32-NEXT: lui a1, %hi(f) +; CHECK-RV32-NEXT: lw a0, %lo(e)(a0) ; CHECK-RV32-NEXT: lw a1, %lo(f)(a1) ; CHECK-RV32-NEXT: call __addsf3 ; CHECK-RV32-NEXT: lui a1, %hi(d) @@ -277,8 +277,8 @@ define void @foo_fp_float() nounwind #1 { ; CHECK-RV32-NEXT: sw t6, 12(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: addi s0, sp, 80 ; CHECK-RV32-NEXT: lui a0, %hi(e) -; CHECK-RV32-NEXT: lw a0, %lo(e)(a0) ; CHECK-RV32-NEXT: lui a1, %hi(f) +; CHECK-RV32-NEXT: lw a0, %lo(e)(a0) ; CHECK-RV32-NEXT: lw a1, %lo(f)(a1) ; CHECK-RV32-NEXT: call __addsf3 ; CHECK-RV32-NEXT: lui a1, %hi(d) @@ -382,9 +382,9 @@ define void @foo_double() nounwind #0 { ; CHECK-RV32-NEXT: sw t5, 4(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: sw t6, 0(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, %hi(h) +; CHECK-RV32-NEXT: lui a3, %hi(i) ; CHECK-RV32-NEXT: lw a0, %lo(h)(a1) ; CHECK-RV32-NEXT: lw a1, %lo(h+4)(a1) -; CHECK-RV32-NEXT: lui a3, %hi(i) ; CHECK-RV32-NEXT: lw a2, %lo(i)(a3) ; CHECK-RV32-NEXT: lw a3, %lo(i+4)(a3) ; CHECK-RV32-NEXT: call __adddf3 @@ -450,9 +450,9 @@ define void @foo_double() nounwind #0 { ; CHECK-RV32IF-NEXT: fsw ft10, 4(sp) # 4-byte Folded Spill ; CHECK-RV32IF-NEXT: fsw ft11, 0(sp) # 4-byte Folded Spill ; CHECK-RV32IF-NEXT: lui a1, %hi(h) +; CHECK-RV32IF-NEXT: lui a3, %hi(i) ; CHECK-RV32IF-NEXT: lw a0, %lo(h)(a1) ; CHECK-RV32IF-NEXT: lw a1, %lo(h+4)(a1) -; CHECK-RV32IF-NEXT: lui a3, %hi(i) ; CHECK-RV32IF-NEXT: lw a2, %lo(i)(a3) ; CHECK-RV32IF-NEXT: lw a3, %lo(i+4)(a3) ; CHECK-RV32IF-NEXT: call __adddf3 @@ -549,9 +549,9 @@ define void @foo_fp_double() nounwind #1 { ; CHECK-RV32-NEXT: sw t6, 12(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: addi s0, sp, 80 ; CHECK-RV32-NEXT: lui a1, %hi(h) +; CHECK-RV32-NEXT: lui a3, %hi(i) ; CHECK-RV32-NEXT: lw a0, %lo(h)(a1) ; CHECK-RV32-NEXT: lw a1, %lo(h+4)(a1) -; CHECK-RV32-NEXT: lui a3, %hi(i) ; CHECK-RV32-NEXT: lw a2, %lo(i)(a3) ; CHECK-RV32-NEXT: lw a3, %lo(i+4)(a3) ; CHECK-RV32-NEXT: call __adddf3 @@ -620,9 +620,9 @@ define void @foo_fp_double() nounwind #1 { ; CHECK-RV32IF-NEXT: fsw ft11, 12(sp) # 4-byte Folded Spill ; CHECK-RV32IF-NEXT: addi s0, sp, 160 ; CHECK-RV32IF-NEXT: lui a1, %hi(h) +; CHECK-RV32IF-NEXT: lui a3, %hi(i) ; CHECK-RV32IF-NEXT: lw a0, %lo(h)(a1) ; CHECK-RV32IF-NEXT: lw a1, %lo(h+4)(a1) -; CHECK-RV32IF-NEXT: lui a3, %hi(i) ; CHECK-RV32IF-NEXT: lw a2, %lo(i)(a3) ; CHECK-RV32IF-NEXT: lw a3, %lo(i+4)(a3) ; CHECK-RV32IF-NEXT: call __adddf3 diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll index 19f40dddeaec2..111b3e2bf82ce 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll @@ -8,17 +8,17 @@ define i32 @ctz_nxv4i32( %a) #0 { ; RV32-LABEL: ctz_nxv4i32: ; RV32: # %bb.0: ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: srli a0, a0, 1 ; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vid.v v11 +; RV32-NEXT: vid.v v10 ; RV32-NEXT: li a1, -1 -; RV32-NEXT: vmadd.vx v11, a1, v10 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32-NEXT: vmsne.vi v0, v8, 0 +; RV32-NEXT: srli a0, a0, 1 ; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: vmadd.vx v10, a1, v8 ; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vvm v8, v8, v11, v0 +; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: sub a0, a0, a1 @@ -29,17 +29,17 @@ define i32 
@ctz_nxv4i32( %a) #0 { ; RV64-LABEL: ctz_nxv4i32: ; RV64: # %bb.0: ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; RV64-NEXT: vmv.v.x v10, a0 -; RV64-NEXT: vid.v v11 +; RV64-NEXT: vid.v v10 ; RV64-NEXT: li a1, -1 -; RV64-NEXT: vmadd.vx v11, a1, v10 ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64-NEXT: vmsne.vi v0, v8, 0 +; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vmadd.vx v10, a1, v8 ; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vvm v8, v8, v11, v0 +; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: subw a0, a0, a1 @@ -75,28 +75,28 @@ define i64 @ctz_nxv8i1_no_range( %a) { ; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: vsetvli a3, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a2), zero -; RV32-NEXT: vid.v v16 +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vid.v v8 ; RV32-NEXT: li a2, -1 -; RV32-NEXT: vmadd.vx v16, a2, v8 -; RV32-NEXT: addi a2, sp, 32 -; RV32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vl2r.v v24, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vmsne.vi v0, v8, 0 +; RV32-NEXT: vmsne.vi v0, v24, 0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vim v8, v8, -1, v0 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vmadd.vx v8, a2, v16 +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vmerge.vim v16, v16, -1, v0 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a2, v8 -; RV32-NEXT: sltu a3, a0, a2 -; RV32-NEXT: li a4, 32 +; RV32-NEXT: vmv.x.s a3, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a4 +; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: sltu a2, a0, a3 ; RV32-NEXT: vmv.x.s a4, v8 ; RV32-NEXT: sub a1, a1, a4 -; RV32-NEXT: sub a1, a1, a3 -; RV32-NEXT: sub a0, a0, a2 +; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: sub a0, a0, a3 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 1 ; RV32-NEXT: add sp, sp, a2 @@ -111,15 +111,15 @@ define i64 @ctz_nxv8i1_no_range( %a) { ; RV64: # %bb.0: ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vmv.v.x v16, a0 -; RV64-NEXT: vid.v v24 +; RV64-NEXT: vid.v v16 ; RV64-NEXT: li a1, -1 -; RV64-NEXT: vmadd.vx v24, a1, v16 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: vmsne.vi v0, v8, 0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vmadd.vx v16, a1, v8 ; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: sub a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll b/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll index c7ba0e501fa44..4a338ce5bd1f7 100644 --- a/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll +++ b/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll @@ -25,8 +25,8 @@ define i1 @shifts_necmp_i16_i8(i16 %x) nounwind { ; RV32I-LABEL: shifts_necmp_i16_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: srli a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srli a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 8 ; RV32I-NEXT: srli a0, a0, 16 ; RV32I-NEXT: xor a0, a0, 
a1 @@ -36,8 +36,8 @@ define i1 @shifts_necmp_i16_i8(i16 %x) nounwind { ; RV64I-LABEL: shifts_necmp_i16_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 48 -; RV64I-NEXT: srli a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srli a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 8 ; RV64I-NEXT: srli a0, a0, 48 ; RV64I-NEXT: xor a0, a0, a1 @@ -638,10 +638,10 @@ define i1 @add_ugecmp_i64_i8(i64 %x) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: addi a2, a0, 128 ; RV32I-NEXT: sltu a0, a2, a0 +; RV32I-NEXT: sltiu a2, a2, 256 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: snez a0, a0 -; RV32I-NEXT: sltiu a1, a2, 256 -; RV32I-NEXT: xori a1, a1, 1 +; RV32I-NEXT: xori a1, a2, 1 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret ; @@ -754,9 +754,9 @@ define i1 @add_ugecmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind { ; RV32I-LABEL: add_ugecmp_bad_i16_i8_cmp: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a0, a0, 128 ; RV32I-NEXT: addi a2, a2, -1 ; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: addi a0, a0, 128 ; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: sltu a0, a0, a1 ; RV32I-NEXT: xori a0, a0, 1 @@ -765,9 +765,9 @@ define i1 @add_ugecmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind { ; RV64I-LABEL: add_ugecmp_bad_i16_i8_cmp: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addi a0, a0, 128 ; RV64I-NEXT: addiw a2, a2, -1 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: addi a0, a0, 128 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: sltu a0, a0, a1 ; RV64I-NEXT: xori a0, a0, 1 diff --git a/llvm/test/CodeGen/RISCV/llvm.exp10.ll b/llvm/test/CodeGen/RISCV/llvm.exp10.ll index a1f8bd4ab12be..15a123400fd4f 100644 --- a/llvm/test/CodeGen/RISCV/llvm.exp10.ll +++ b/llvm/test/CodeGen/RISCV/llvm.exp10.ll @@ -187,12 +187,12 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) { ; RV32IFD-NEXT: .cfi_offset fs0, -24 ; RV32IFD-NEXT: .cfi_offset fs1, -32 ; RV32IFD-NEXT: .cfi_offset fs2, -40 -; RV32IFD-NEXT: lhu a2, 8(a1) -; RV32IFD-NEXT: lhu a3, 0(a1) -; RV32IFD-NEXT: lhu a1, 4(a1) ; RV32IFD-NEXT: mv s0, a0 -; RV32IFD-NEXT: fmv.w.x fs0, a2 -; RV32IFD-NEXT: fmv.w.x fs1, a3 +; RV32IFD-NEXT: lhu a0, 8(a1) +; RV32IFD-NEXT: lhu a2, 0(a1) +; RV32IFD-NEXT: lhu a1, 4(a1) +; RV32IFD-NEXT: fmv.w.x fs0, a0 +; RV32IFD-NEXT: fmv.w.x fs1, a2 ; RV32IFD-NEXT: fmv.w.x fa0, a1 ; RV32IFD-NEXT: call __extendhfsf2 ; RV32IFD-NEXT: call exp10f diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll index e85a7118f5ff8..74dec76a02e89 100644 --- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll +++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll @@ -61,8 +61,8 @@ define { half, i32 } @test_frexp_f16_i32(half %a) nounwind { ; RV32IZFINXZDINX-NEXT: addi a1, sp, 8 ; RV32IZFINXZDINX-NEXT: call frexpf ; RV32IZFINXZDINX-NEXT: call __truncsfhf2 -; RV32IZFINXZDINX-NEXT: lw a1, 8(sp) ; RV32IZFINXZDINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 +; RV32IZFINXZDINX-NEXT: lw a1, 8(sp) ; RV32IZFINXZDINX-NEXT: lui a2, 1048560 ; RV32IZFINXZDINX-NEXT: or a0, a0, a2 ; RV32IZFINXZDINX-NEXT: # kill: def $x10_w killed $x10_w killed $x10 @@ -78,8 +78,8 @@ define { half, i32 } @test_frexp_f16_i32(half %a) nounwind { ; RV64IZFINXZDINX-NEXT: mv a1, sp ; RV64IZFINXZDINX-NEXT: call frexpf ; RV64IZFINXZDINX-NEXT: call __truncsfhf2 -; RV64IZFINXZDINX-NEXT: ld a1, 0(sp) ; RV64IZFINXZDINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 +; RV64IZFINXZDINX-NEXT: ld a1, 0(sp) ; RV64IZFINXZDINX-NEXT: lui a2, 1048560 ; RV64IZFINXZDINX-NEXT: or a0, a0, a2 ; RV64IZFINXZDINX-NEXT: # kill: def $x10_w killed $x10_w killed $x10 diff --git 
a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll index 8693283e83712..43719a452c236 100644 --- a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll +++ b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll @@ -32,19 +32,19 @@ define void @test(i32 signext %row, i32 signext %N.in) nounwind { ; RV32: # %bb.0: # %entry ; RV32-NEXT: blez a1, .LBB0_3 ; RV32-NEXT: # %bb.1: # %cond_true.preheader -; RV32-NEXT: slli a0, a0, 6 -; RV32-NEXT: lui a2, %hi(A) -; RV32-NEXT: addi a2, a2, %lo(A) -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: li a2, 4 +; RV32-NEXT: slli a2, a0, 6 +; RV32-NEXT: lui a3, %hi(A) +; RV32-NEXT: addi a3, a3, %lo(A) +; RV32-NEXT: li a0, 4 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: addi a2, a2, 8 ; RV32-NEXT: li a3, 5 ; RV32-NEXT: .LBB0_2: # %cond_true ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: sw a2, -4(a0) -; RV32-NEXT: sw a3, 0(a0) +; RV32-NEXT: sw a0, -4(a2) +; RV32-NEXT: sw a3, 0(a2) ; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: addi a0, a0, 4 +; RV32-NEXT: addi a2, a2, 4 ; RV32-NEXT: bnez a1, .LBB0_2 ; RV32-NEXT: .LBB0_3: # %return ; RV32-NEXT: ret @@ -53,24 +53,24 @@ define void @test(i32 signext %row, i32 signext %N.in) nounwind { ; RV64: # %bb.0: # %entry ; RV64-NEXT: blez a1, .LBB0_3 ; RV64-NEXT: # %bb.1: # %cond_true.preheader -; RV64-NEXT: slli a0, a0, 6 -; RV64-NEXT: lui a2, %hi(A) -; RV64-NEXT: addi a2, a2, %lo(A) -; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: addi a2, a0, 4 +; RV64-NEXT: slli a3, a0, 6 +; RV64-NEXT: lui a4, %hi(A) +; RV64-NEXT: addi a4, a4, %lo(A) ; RV64-NEXT: addiw a1, a1, 2 -; RV64-NEXT: li a3, 2 -; RV64-NEXT: li a4, 4 +; RV64-NEXT: li a0, 2 +; RV64-NEXT: li a2, 4 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: addi a4, a3, 4 ; RV64-NEXT: li a5, 5 ; RV64-NEXT: .LBB0_2: # %cond_true ; RV64-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64-NEXT: sw a4, 0(a2) -; RV64-NEXT: slli a6, a3, 2 -; RV64-NEXT: add a6, a0, a6 +; RV64-NEXT: sw a2, 0(a4) +; RV64-NEXT: slli a6, a0, 2 +; RV64-NEXT: addiw a0, a0, 1 +; RV64-NEXT: add a6, a3, a6 ; RV64-NEXT: sw a5, 0(a6) -; RV64-NEXT: addiw a3, a3, 1 -; RV64-NEXT: addi a2, a2, 4 -; RV64-NEXT: bne a3, a1, .LBB0_2 +; RV64-NEXT: addi a4, a4, 4 +; RV64-NEXT: bne a0, a1, .LBB0_2 ; RV64-NEXT: .LBB0_3: # %return ; RV64-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll b/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll index 2c8839683d816..b2ccbd821eb8e 100644 --- a/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll +++ b/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll @@ -20,10 +20,10 @@ define i32 @main() nounwind { ; RV32I-NEXT: .LBB0_1: # %for.body ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-NEXT: addi a4, a0, -2048 -; RV32I-NEXT: sw a4, 0(a2) ; RV32I-NEXT: sw a0, 0(a1) ; RV32I-NEXT: addi a0, a0, 1 ; RV32I-NEXT: addi a1, a1, 4 +; RV32I-NEXT: sw a4, 0(a2) ; RV32I-NEXT: addi a2, a2, 4 ; RV32I-NEXT: bne a0, a3, .LBB0_1 ; RV32I-NEXT: # %bb.2: # %for.end diff --git a/llvm/test/CodeGen/RISCV/machine-combiner.ll b/llvm/test/CodeGen/RISCV/machine-combiner.ll index ebf232cc458ba..a18f5d6902dca 100644 --- a/llvm/test/CodeGen/RISCV/machine-combiner.ll +++ b/llvm/test/CodeGen/RISCV/machine-combiner.ll @@ -10,9 +10,9 @@ define double @test_reassoc_fadd1(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fadd1: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fadd.d fa4, fa2, fa3 -; CHECK-NEXT: fadd.d fa0, fa5, fa4 +; CHECK-NEXT: fadd.d fa5, fa2, fa3 +; 
CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fadd.d fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fadd nsz reassoc double %t0, %a2 @@ -23,9 +23,9 @@ define double @test_reassoc_fadd1(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fadd2(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fadd2: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fadd.d fa4, fa2, fa3 -; CHECK-NEXT: fadd.d fa0, fa4, fa5 +; CHECK-NEXT: fadd.d fa5, fa2, fa3 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fadd.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fadd nsz reassoc double %a2, %t0 @@ -36,9 +36,9 @@ define double @test_reassoc_fadd2(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fadd3(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fadd3: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fadd.d fa4, fa3, fa2 -; CHECK-NEXT: fadd.d fa0, fa4, fa5 +; CHECK-NEXT: fadd.d fa5, fa3, fa2 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fadd.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fadd nsz reassoc double %t0, %a2 @@ -49,9 +49,9 @@ define double @test_reassoc_fadd3(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fadd4(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fadd4: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fadd.d fa4, fa3, fa2 -; CHECK-NEXT: fadd.d fa0, fa4, fa5 +; CHECK-NEXT: fadd.d fa5, fa3, fa2 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fadd.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fadd nsz reassoc double %a2, %t0 @@ -62,9 +62,9 @@ define double @test_reassoc_fadd4(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fmul1(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fmul1: ; CHECK: # %bb.0: -; CHECK-NEXT: fmul.d fa5, fa0, fa1 -; CHECK-NEXT: fmul.d fa4, fa2, fa3 -; CHECK-NEXT: fmul.d fa0, fa5, fa4 +; CHECK-NEXT: fmul.d fa5, fa2, fa3 +; CHECK-NEXT: fmul.d fa4, fa0, fa1 +; CHECK-NEXT: fmul.d fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = fmul nsz reassoc double %a0, %a1 %t1 = fmul nsz reassoc double %t0, %a2 @@ -75,9 +75,9 @@ define double @test_reassoc_fmul1(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fmul2(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fmul2: ; CHECK: # %bb.0: -; CHECK-NEXT: fmul.d fa5, fa0, fa1 -; CHECK-NEXT: fmul.d fa4, fa2, fa3 -; CHECK-NEXT: fmul.d fa0, fa4, fa5 +; CHECK-NEXT: fmul.d fa5, fa2, fa3 +; CHECK-NEXT: fmul.d fa4, fa0, fa1 +; CHECK-NEXT: fmul.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fmul nsz reassoc double %a0, %a1 %t1 = fmul nsz reassoc double %a2, %t0 @@ -88,9 +88,9 @@ define double @test_reassoc_fmul2(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fmul3(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fmul3: ; CHECK: # %bb.0: -; CHECK-NEXT: fmul.d fa5, fa0, fa1 -; CHECK-NEXT: fmul.d fa4, fa3, fa2 -; CHECK-NEXT: fmul.d fa0, fa4, fa5 +; CHECK-NEXT: fmul.d fa5, fa3, fa2 +; CHECK-NEXT: fmul.d fa4, fa0, fa1 +; CHECK-NEXT: fmul.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fmul nsz reassoc double %a0, %a1 %t1 = fmul nsz reassoc double %t0, %a2 @@ -101,9 +101,9 @@ define double @test_reassoc_fmul3(double %a0, double %a1, double %a2, double %a3 
define double @test_reassoc_fmul4(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fmul4: ; CHECK: # %bb.0: -; CHECK-NEXT: fmul.d fa5, fa0, fa1 -; CHECK-NEXT: fmul.d fa4, fa3, fa2 -; CHECK-NEXT: fmul.d fa0, fa4, fa5 +; CHECK-NEXT: fmul.d fa5, fa3, fa2 +; CHECK-NEXT: fmul.d fa4, fa0, fa1 +; CHECK-NEXT: fmul.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fmul nsz reassoc double %a0, %a1 %t1 = fmul nsz reassoc double %a2, %t0 @@ -114,11 +114,11 @@ define double @test_reassoc_fmul4(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_big1(double %a0, double %a1, double %a2, double %a3, double %a4, double %a5, double %a6) { ; CHECK-LABEL: test_reassoc_big1: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa1, fa0, fa1 -; CHECK-NEXT: fadd.d fa3, fa2, fa3 ; CHECK-NEXT: fadd.d fa5, fa4, fa5 -; CHECK-NEXT: fadd.d fa4, fa1, fa3 +; CHECK-NEXT: fadd.d fa4, fa2, fa3 +; CHECK-NEXT: fadd.d fa3, fa0, fa1 ; CHECK-NEXT: fadd.d fa5, fa5, fa6 +; CHECK-NEXT: fadd.d fa4, fa3, fa4 ; CHECK-NEXT: fadd.d fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 @@ -133,18 +133,18 @@ define double @test_reassoc_big1(double %a0, double %a1, double %a2, double %a3, define double @test_reassoc_big2(double %a0, double %a1, i32 %a2, double %a3, i32 %a4, double %a5) { ; CHECK-LABEL: test_reassoc_big2: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 ; CHECK-NEXT: fsub.d fa4, fa3, fa2 -; CHECK-NEXT: fadd.d fa3, fa2, fa1 -; CHECK-NEXT: fcvt.d.w ft0, a0 -; CHECK-NEXT: fcvt.d.w ft1, a1 -; CHECK-NEXT: fmul.d fa2, fa2, ft0 -; CHECK-NEXT: fmul.d fa1, ft1, fa1 -; CHECK-NEXT: fsub.d fa5, fa4, fa5 -; CHECK-NEXT: fmul.d fa4, fa0, fa3 -; CHECK-NEXT: fmul.d fa3, fa1, fa2 -; CHECK-NEXT: fmul.d fa5, fa5, fa4 -; CHECK-NEXT: fmul.d fa0, fa5, fa3 +; CHECK-NEXT: fadd.d fa3, fa0, fa1 +; CHECK-NEXT: fadd.d ft0, fa2, fa1 +; CHECK-NEXT: fcvt.d.w fa5, a1 +; CHECK-NEXT: fcvt.d.w ft1, a0 +; CHECK-NEXT: fmul.d fa5, fa5, fa1 +; CHECK-NEXT: fmul.d fa2, fa2, ft1 +; CHECK-NEXT: fsub.d fa4, fa4, fa3 +; CHECK-NEXT: fmul.d fa3, fa0, ft0 +; CHECK-NEXT: fmul.d fa5, fa5, fa2 +; CHECK-NEXT: fmul.d fa4, fa4, fa3 +; CHECK-NEXT: fmul.d fa0, fa4, fa5 ; CHECK-NEXT: ret %cvt1 = sitofp i32 %a2 to double %cvt2 = sitofp i32 %a4 to double @@ -245,9 +245,9 @@ define double @test_fnmsub(double %a0, double %a1, double %a2) { define double @test_reassoc_fsub1(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fsub1: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fsub.d fa4, fa2, fa3 -; CHECK-NEXT: fadd.d fa0, fa5, fa4 +; CHECK-NEXT: fsub.d fa5, fa2, fa3 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fadd.d fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fadd nsz reassoc double %t0, %a2 @@ -258,9 +258,9 @@ define double @test_reassoc_fsub1(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fsub2(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fsub2: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fsub.d fa4, fa2, fa3 -; CHECK-NEXT: fsub.d fa0, fa5, fa4 +; CHECK-NEXT: fsub.d fa5, fa2, fa3 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fsub.d fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fsub nsz reassoc double %t0, %a2 @@ -271,9 +271,9 @@ define double @test_reassoc_fsub2(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fsub3(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: 
test_reassoc_fsub3: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fadd.d fa4, fa2, fa3 -; CHECK-NEXT: fsub.d fa0, fa5, fa4 +; CHECK-NEXT: fadd.d fa5, fa2, fa3 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fsub.d fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fsub nsz reassoc double %t0, %a2 @@ -284,9 +284,9 @@ define double @test_reassoc_fsub3(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fsub4(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fsub4: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fsub.d fa4, fa2, fa3 -; CHECK-NEXT: fadd.d fa0, fa4, fa5 +; CHECK-NEXT: fsub.d fa5, fa2, fa3 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fadd.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fadd nsz reassoc double %a2, %t0 @@ -297,9 +297,9 @@ define double @test_reassoc_fsub4(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fsub5(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fsub5: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fadd.d fa4, fa2, fa3 -; CHECK-NEXT: fsub.d fa0, fa4, fa5 +; CHECK-NEXT: fadd.d fa5, fa2, fa3 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fsub.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fsub nsz reassoc double %a2, %t0 @@ -310,9 +310,9 @@ define double @test_reassoc_fsub5(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fsub6(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fsub6: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fsub.d fa4, fa2, fa3 -; CHECK-NEXT: fsub.d fa0, fa4, fa5 +; CHECK-NEXT: fsub.d fa5, fa2, fa3 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fsub.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fsub nsz reassoc double %a2, %t0 @@ -323,9 +323,9 @@ define double @test_reassoc_fsub6(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fsub7(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fsub7: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fsub.d fa4, fa3, fa2 -; CHECK-NEXT: fsub.d fa0, fa4, fa5 +; CHECK-NEXT: fsub.d fa5, fa3, fa2 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fsub.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fadd nsz reassoc double %t0, %a2 @@ -336,9 +336,9 @@ define double @test_reassoc_fsub7(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fsub8(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fsub8: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fsub.d fa4, fa3, fa2 -; CHECK-NEXT: fadd.d fa0, fa4, fa5 +; CHECK-NEXT: fsub.d fa5, fa3, fa2 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fadd.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fsub nsz reassoc double %t0, %a2 @@ -349,9 +349,9 @@ define double @test_reassoc_fsub8(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fsub9(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fsub9: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fadd.d fa4, fa3, fa2 -; CHECK-NEXT: fsub.d fa0, fa4, fa5 +; CHECK-NEXT: fadd.d fa5, fa3, fa2 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fsub.d fa0, fa5, fa4 
; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fsub nsz reassoc double %t0, %a2 @@ -362,9 +362,9 @@ define double @test_reassoc_fsub9(double %a0, double %a1, double %a2, double %a3 define double @test_reassoc_fsub10(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fsub10: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fsub.d fa4, fa3, fa2 -; CHECK-NEXT: fsub.d fa0, fa4, fa5 +; CHECK-NEXT: fsub.d fa5, fa3, fa2 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fsub.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fadd nsz reassoc double %a2, %t0 @@ -375,9 +375,9 @@ define double @test_reassoc_fsub10(double %a0, double %a1, double %a2, double %a define double @test_reassoc_fsub11(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fsub11: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fadd.d fa4, fa3, fa2 -; CHECK-NEXT: fsub.d fa0, fa4, fa5 +; CHECK-NEXT: fadd.d fa5, fa3, fa2 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fsub.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fsub nsz reassoc double %a2, %t0 @@ -388,9 +388,9 @@ define double @test_reassoc_fsub11(double %a0, double %a1, double %a2, double %a define double @test_reassoc_fsub12(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_reassoc_fsub12: ; CHECK: # %bb.0: -; CHECK-NEXT: fadd.d fa5, fa0, fa1 -; CHECK-NEXT: fsub.d fa4, fa3, fa2 -; CHECK-NEXT: fadd.d fa0, fa4, fa5 +; CHECK-NEXT: fsub.d fa5, fa3, fa2 +; CHECK-NEXT: fadd.d fa4, fa0, fa1 +; CHECK-NEXT: fadd.d fa0, fa5, fa4 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fsub nsz reassoc double %a2, %t0 @@ -687,9 +687,9 @@ define i64 @test_reassoc_xor_i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) { define i8 @test_reassoc_mul_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) { ; CHECK-LABEL: test_reassoc_mul_i8: ; CHECK: # %bb.0: +; CHECK-NEXT: mul a2, a2, a3 ; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: mul a1, a2, a3 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: mul a0, a0, a2 ; CHECK-NEXT: ret %t0 = mul i8 %a0, %a1 %t1 = mul i8 %t0, %a2 @@ -700,9 +700,9 @@ define i8 @test_reassoc_mul_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) { define i16 @test_reassoc_mul_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) { ; CHECK-LABEL: test_reassoc_mul_i16: ; CHECK: # %bb.0: +; CHECK-NEXT: mul a2, a2, a3 ; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: mul a1, a2, a3 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: mul a0, a0, a2 ; CHECK-NEXT: ret %t0 = mul i16 %a0, %a1 %t1 = mul i16 %t0, %a2 @@ -713,9 +713,9 @@ define i16 @test_reassoc_mul_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) { define i32 @test_reassoc_mul_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK-LABEL: test_reassoc_mul_i32: ; CHECK: # %bb.0: +; CHECK-NEXT: mul a2, a2, a3 ; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: mul a1, a2, a3 -; CHECK-NEXT: mulw a0, a0, a1 +; CHECK-NEXT: mulw a0, a0, a2 ; CHECK-NEXT: ret %t0 = mul i32 %a0, %a1 %t1 = mul i32 %t0, %a2 @@ -726,9 +726,9 @@ define i32 @test_reassoc_mul_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { define i64 @test_reassoc_mul_i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) { ; CHECK-LABEL: test_reassoc_mul_i64: ; CHECK: # %bb.0: +; CHECK-NEXT: mul a2, a2, a3 ; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: mul a1, a2, a3 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: mul a0, a0, a2 ; CHECK-NEXT: ret %t0 = mul i64 %a0, %a1 %t1 = mul i64 %t0, %a2 @@ -995,9 +995,9 @@ define i64 @test_reassoc_max_i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) { 
define half @test_fmin_f16(half %a0, half %a1, half %a2, half %a3) { ; CHECK-LABEL: test_fmin_f16: ; CHECK: # %bb.0: -; CHECK-NEXT: fmin.h fa5, fa0, fa1 -; CHECK-NEXT: fmin.h fa4, fa2, fa3 -; CHECK-NEXT: fmin.h fa0, fa5, fa4 +; CHECK-NEXT: fmin.h fa5, fa2, fa3 +; CHECK-NEXT: fmin.h fa4, fa0, fa1 +; CHECK-NEXT: fmin.h fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = call half @llvm.minnum.f16(half %a0, half %a1) %t1 = call half @llvm.minnum.f16(half %t0, half %a2) @@ -1008,9 +1008,9 @@ define half @test_fmin_f16(half %a0, half %a1, half %a2, half %a3) { define float @test_fmin_f32(float %a0, float %a1, float %a2, float %a3) { ; CHECK-LABEL: test_fmin_f32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmin.s fa5, fa0, fa1 -; CHECK-NEXT: fmin.s fa4, fa2, fa3 -; CHECK-NEXT: fmin.s fa0, fa5, fa4 +; CHECK-NEXT: fmin.s fa5, fa2, fa3 +; CHECK-NEXT: fmin.s fa4, fa0, fa1 +; CHECK-NEXT: fmin.s fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = call float @llvm.minnum.f32(float %a0, float %a1) %t1 = call float @llvm.minnum.f32(float %t0, float %a2) @@ -1021,9 +1021,9 @@ define float @test_fmin_f32(float %a0, float %a1, float %a2, float %a3) { define double @test_fmin_f64(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_fmin_f64: ; CHECK: # %bb.0: -; CHECK-NEXT: fmin.d fa5, fa0, fa1 -; CHECK-NEXT: fmin.d fa4, fa2, fa3 -; CHECK-NEXT: fmin.d fa0, fa5, fa4 +; CHECK-NEXT: fmin.d fa5, fa2, fa3 +; CHECK-NEXT: fmin.d fa4, fa0, fa1 +; CHECK-NEXT: fmin.d fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = call double @llvm.minnum.f64(double %a0, double %a1) %t1 = call double @llvm.minnum.f64(double %t0, double %a2) @@ -1034,9 +1034,9 @@ define double @test_fmin_f64(double %a0, double %a1, double %a2, double %a3) { define half @test_fmax_f16(half %a0, half %a1, half %a2, half %a3) { ; CHECK-LABEL: test_fmax_f16: ; CHECK: # %bb.0: -; CHECK-NEXT: fmax.h fa5, fa0, fa1 -; CHECK-NEXT: fmax.h fa4, fa2, fa3 -; CHECK-NEXT: fmax.h fa0, fa5, fa4 +; CHECK-NEXT: fmax.h fa5, fa2, fa3 +; CHECK-NEXT: fmax.h fa4, fa0, fa1 +; CHECK-NEXT: fmax.h fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = call half @llvm.maxnum.f16(half %a0, half %a1) %t1 = call half @llvm.maxnum.f16(half %t0, half %a2) @@ -1047,9 +1047,9 @@ define half @test_fmax_f16(half %a0, half %a1, half %a2, half %a3) { define float @test_fmax_f32(float %a0, float %a1, float %a2, float %a3) { ; CHECK-LABEL: test_fmax_f32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmax.s fa5, fa0, fa1 -; CHECK-NEXT: fmax.s fa4, fa2, fa3 -; CHECK-NEXT: fmax.s fa0, fa5, fa4 +; CHECK-NEXT: fmax.s fa5, fa2, fa3 +; CHECK-NEXT: fmax.s fa4, fa0, fa1 +; CHECK-NEXT: fmax.s fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = call float @llvm.maxnum.f32(float %a0, float %a1) %t1 = call float @llvm.maxnum.f32(float %t0, float %a2) @@ -1060,9 +1060,9 @@ define float @test_fmax_f32(float %a0, float %a1, float %a2, float %a3) { define double @test_fmax_f64(double %a0, double %a1, double %a2, double %a3) { ; CHECK-LABEL: test_fmax_f64: ; CHECK: # %bb.0: -; CHECK-NEXT: fmax.d fa5, fa0, fa1 -; CHECK-NEXT: fmax.d fa4, fa2, fa3 -; CHECK-NEXT: fmax.d fa0, fa5, fa4 +; CHECK-NEXT: fmax.d fa5, fa2, fa3 +; CHECK-NEXT: fmax.d fa4, fa0, fa1 +; CHECK-NEXT: fmax.d fa0, fa4, fa5 ; CHECK-NEXT: ret %t0 = call double @llvm.maxnum.f64(double %a0, double %a1) %t1 = call double @llvm.maxnum.f64(double %t0, double %a2) diff --git a/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll index 27297c9787183..af8105644b57d 100644 --- a/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll +++ 
b/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll @@ -97,10 +97,10 @@ ret: define void @test_la_tls_ie(i32 signext %n) { ; RV32I-LABEL: test_la_tls_ie: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: .Lpcrel_hi2: -; RV32I-NEXT: auipc a1, %tls_ie_pcrel_hi(ie) -; RV32I-NEXT: lw a2, %pcrel_lo(.Lpcrel_hi2)(a1) ; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: .Lpcrel_hi2: +; RV32I-NEXT: auipc a2, %tls_ie_pcrel_hi(ie) +; RV32I-NEXT: lw a2, %pcrel_lo(.Lpcrel_hi2)(a2) ; RV32I-NEXT: add a2, a2, tp ; RV32I-NEXT: .LBB2_1: # %loop ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 @@ -112,10 +112,10 @@ define void @test_la_tls_ie(i32 signext %n) { ; ; RV64I-LABEL: test_la_tls_ie: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: .Lpcrel_hi2: -; RV64I-NEXT: auipc a1, %tls_ie_pcrel_hi(ie) -; RV64I-NEXT: ld a2, %pcrel_lo(.Lpcrel_hi2)(a1) ; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: .Lpcrel_hi2: +; RV64I-NEXT: auipc a2, %tls_ie_pcrel_hi(ie) +; RV64I-NEXT: ld a2, %pcrel_lo(.Lpcrel_hi2)(a2) ; RV64I-NEXT: add a2, a2, tp ; RV64I-NEXT: .LBB2_1: # %loop ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll index e30bdfb939471..83e9bf661ab1c 100644 --- a/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll +++ b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll @@ -13,10 +13,10 @@ define i32 @test(ptr %a, i64 %n) { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v9, (a0) ; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: vredsum.vs v9, v9, v8 ; CHECK-NEXT: vmv.x.s a3, v9 ; CHECK-NEXT: addw a3, a3, a3 -; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: addi a0, a0, 8 ; CHECK-NEXT: bnez a1, .LBB0_1 ; CHECK-NEXT: # %bb.2: # %exit diff --git a/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll b/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll index b45365e7a8b63..c2882fd46c17d 100644 --- a/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll +++ b/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll @@ -18,16 +18,16 @@ define void @foo(i32 signext %0, i32 signext %1) { ; ; FUSION-LABEL: foo: ; FUSION: # %bb.0: -; FUSION-NEXT: fcvt.s.w fa0, a1 ; FUSION-NEXT: lui a0, %hi(.L.str) ; FUSION-NEXT: addi a0, a0, %lo(.L.str) +; FUSION-NEXT: fcvt.s.w fa0, a1 ; FUSION-NEXT: tail bar ; ; FUSION-POSTRA-LABEL: foo: ; FUSION-POSTRA: # %bb.0: -; FUSION-POSTRA-NEXT: fcvt.s.w fa0, a1 ; FUSION-POSTRA-NEXT: lui a0, %hi(.L.str) ; FUSION-POSTRA-NEXT: addi a0, a0, %lo(.L.str) +; FUSION-POSTRA-NEXT: fcvt.s.w fa0, a1 ; FUSION-POSTRA-NEXT: tail bar %3 = sitofp i32 %1 to float tail call void @bar(ptr @.str, float %3) diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll index e852579c724f8..d529ae6ecd0ab 100644 --- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll +++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll @@ -2283,8 +2283,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a0, 0(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a0, a0, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a0, a0, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a1, a1, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret @@ -2294,8 +2294,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh 
a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret @@ -2305,8 +2305,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a0, 0(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a0, a0, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a0, a0, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a1, a1, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret @@ -2316,8 +2316,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret @@ -2452,8 +2452,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a2, 0(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a3, a3, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB24_2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 @@ -2472,8 +2472,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB24_2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 @@ -2492,8 +2492,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a2, 0(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a3, a3, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB24_2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 @@ -2512,8 +2512,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB24_2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 @@ -2668,8 +2668,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { ; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 @@ -2692,8 +2692,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 @@ -2848,8 +2848,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB26_2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 @@ -2886,8 +2886,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB26_2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 @@ -3037,8 +3037,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a3, a3, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: @@ -3055,16 +3055,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a1, 4(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: @@ -3087,8 +3087,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { 
; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a3, a3, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: @@ -3105,16 +3105,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a1, 4(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: @@ -3278,16 +3278,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB28_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 3(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB28_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: @@ -3326,16 +3326,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB28_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 3(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB28_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: @@ -4449,25 +4449,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-NEXT: lbu a2, 1(a1) -; 
CHECK-ALIGNED-RV32-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-NEXT: lbu a2, 2(a0) -; CHECK-ALIGNED-RV32-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a5, 1(a0) ; CHECK-ALIGNED-RV32-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV32-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV32-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV32-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV32-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-NEXT: ret @@ -4475,25 +4475,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-NEXT: lbu a2, 2(a0) -; CHECK-ALIGNED-RV64-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV64-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV64-NEXT: lbu a5, 1(a0) ; CHECK-ALIGNED-RV64-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV64-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV64-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV64-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-NEXT: ret @@ -4501,25 +4501,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 1(a1) -; 
CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 2(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a5, 1(a0) ; CHECK-ALIGNED-RV32-ZBB-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV32-ZBB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-ZBB-NEXT: ret @@ -4527,25 +4527,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 2(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a5, 1(a0) ; CHECK-ALIGNED-RV64-ZBB-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-ZBB-NEXT: xor a0, a0, a1 ; 
CHECK-ALIGNED-RV64-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-ZBB-NEXT: ret @@ -4556,16 +4556,16 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a3, 1(a1) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a4, 2(a1) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a1, 3(a1) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a5, 0(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a6, 1(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a7, 2(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a5, 1(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a6, 2(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a7, 3(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a0, 0(a0) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a1, a4, a1 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a2, a2, a3 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a3, a6, a7 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a0, a0, a5 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a1, a2, a1 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a0, a7, a0 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a2, a5, a6 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a0, a0, a3 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret @@ -4576,20 +4576,20 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 1(a1) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a4, 2(a1) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a5, 0(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a6, 1(a0) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a2, a2, a3 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 2(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a5, a5, a6 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a1, a1, 24 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a5, 2(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a2, a3, a4 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a5, a5, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a3, a3, 16 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a1, a1, a2 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a5 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret @@ -4597,25 +4597,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-V-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-V-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 2(a0) -; 
CHECK-ALIGNED-RV32-V-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-V-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 1(a0) ; CHECK-ALIGNED-RV32-V-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-V-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV32-V-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV32-V-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV32-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-V-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-V-NEXT: ret @@ -4623,25 +4623,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-V-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-V-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 2(a0) -; CHECK-ALIGNED-RV64-V-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 1(a0) ; CHECK-ALIGNED-RV64-V-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-V-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV64-V-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV64-V-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-V-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-V-NEXT: ret @@ -4784,8 +4784,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret @@ -4804,8 +4804,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 -; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret @@ -4962,8 +4962,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret @@ -4982,8 +4982,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index f0c14ccb0d5f2..860c3a94abc0a 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -1410,20 +1410,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-NEXT: lw t0, 8(a1) ; CHECK-UNALIGNED-RV32-NEXT: lw t1, 12(a1) ; CHECK-UNALIGNED-RV32-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV32-NEXT: lw a6, 16(a0) +; CHECK-UNALIGNED-RV32-NEXT: lw t2, 20(a0) +; CHECK-UNALIGNED-RV32-NEXT: lw t3, 24(a0) +; CHECK-UNALIGNED-RV32-NEXT: lw a0, 27(a0) ; CHECK-UNALIGNED-RV32-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV32-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV32-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV32-NEXT: lw a6, 16(a0) -; CHECK-UNALIGNED-RV32-NEXT: lw a7, 20(a0) -; CHECK-UNALIGNED-RV32-NEXT: lw t0, 24(a0) -; CHECK-UNALIGNED-RV32-NEXT: lw a0, 27(a0) -; CHECK-UNALIGNED-RV32-NEXT: lw t1, 16(a1) -; CHECK-UNALIGNED-RV32-NEXT: lw t2, 20(a1) -; CHECK-UNALIGNED-RV32-NEXT: lw t3, 24(a1) +; CHECK-UNALIGNED-RV32-NEXT: lw a7, 16(a1) +; CHECK-UNALIGNED-RV32-NEXT: lw t0, 20(a1) +; CHECK-UNALIGNED-RV32-NEXT: lw t1, 24(a1) ; CHECK-UNALIGNED-RV32-NEXT: lw a1, 27(a1) -; CHECK-UNALIGNED-RV32-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV32-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV32-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV32-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV32-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV32-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV32-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV32-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV32-NEXT: or a4, a4, a5 @@ -1466,20 +1466,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t0, 8(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t1, 12(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a6, 16(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t2, 20(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t3, 24(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 27(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a6, 16(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a7, 20(a0) -; 
CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t0, 24(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 27(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t1, 16(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t2, 20(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t3, 24(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a7, 16(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t0, 20(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t1, 24(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 27(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a4, a4, a5 @@ -1522,20 +1522,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t0, 8(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t1, 12(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a6, 16(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t2, 20(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t3, 24(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 27(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a6, 16(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a7, 20(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t0, 24(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 27(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t1, 16(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t2, 20(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t3, 24(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a7, 16(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t0, 20(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t1, 24(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 27(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: or a4, a4, a5 @@ -1578,20 +1578,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-V-NEXT: lw t0, 8(a1) ; CHECK-UNALIGNED-RV32-V-NEXT: lw t1, 12(a1) ; CHECK-UNALIGNED-RV32-V-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV32-V-NEXT: lw a6, 16(a0) +; CHECK-UNALIGNED-RV32-V-NEXT: lw t2, 20(a0) +; CHECK-UNALIGNED-RV32-V-NEXT: lw t3, 24(a0) +; CHECK-UNALIGNED-RV32-V-NEXT: lw a0, 27(a0) ; CHECK-UNALIGNED-RV32-V-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV32-V-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV32-V-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV32-V-NEXT: lw a6, 16(a0) -; CHECK-UNALIGNED-RV32-V-NEXT: lw a7, 20(a0) -; CHECK-UNALIGNED-RV32-V-NEXT: lw t0, 24(a0) -; CHECK-UNALIGNED-RV32-V-NEXT: lw a0, 27(a0) -; CHECK-UNALIGNED-RV32-V-NEXT: lw t1, 16(a1) -; CHECK-UNALIGNED-RV32-V-NEXT: lw t2, 20(a1) -; CHECK-UNALIGNED-RV32-V-NEXT: lw t3, 24(a1) +; CHECK-UNALIGNED-RV32-V-NEXT: lw a7, 16(a1) +; CHECK-UNALIGNED-RV32-V-NEXT: lw t0, 20(a1) +; CHECK-UNALIGNED-RV32-V-NEXT: lw t1, 24(a1) ; CHECK-UNALIGNED-RV32-V-NEXT: lw a1, 27(a1) -; CHECK-UNALIGNED-RV32-V-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV32-V-NEXT: xor a7, a7, t2 -; 
CHECK-UNALIGNED-RV32-V-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV32-V-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV32-V-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV32-V-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV32-V-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV32-V-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV32-V-NEXT: or a4, a4, a5 @@ -1719,20 +1719,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-NEXT: lw t0, 8(a1) ; CHECK-UNALIGNED-RV32-NEXT: lw t1, 12(a1) ; CHECK-UNALIGNED-RV32-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV32-NEXT: lw a6, 16(a0) +; CHECK-UNALIGNED-RV32-NEXT: lw t2, 20(a0) +; CHECK-UNALIGNED-RV32-NEXT: lw t3, 24(a0) +; CHECK-UNALIGNED-RV32-NEXT: lw a0, 28(a0) ; CHECK-UNALIGNED-RV32-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV32-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV32-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV32-NEXT: lw a6, 16(a0) -; CHECK-UNALIGNED-RV32-NEXT: lw a7, 20(a0) -; CHECK-UNALIGNED-RV32-NEXT: lw t0, 24(a0) -; CHECK-UNALIGNED-RV32-NEXT: lw a0, 28(a0) -; CHECK-UNALIGNED-RV32-NEXT: lw t1, 16(a1) -; CHECK-UNALIGNED-RV32-NEXT: lw t2, 20(a1) -; CHECK-UNALIGNED-RV32-NEXT: lw t3, 24(a1) +; CHECK-UNALIGNED-RV32-NEXT: lw a7, 16(a1) +; CHECK-UNALIGNED-RV32-NEXT: lw t0, 20(a1) +; CHECK-UNALIGNED-RV32-NEXT: lw t1, 24(a1) ; CHECK-UNALIGNED-RV32-NEXT: lw a1, 28(a1) -; CHECK-UNALIGNED-RV32-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV32-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV32-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV32-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV32-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV32-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV32-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV32-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV32-NEXT: or a4, a4, a5 @@ -1775,20 +1775,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t0, 8(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t1, 12(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a6, 16(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t2, 20(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t3, 24(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 28(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a6, 16(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a7, 20(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t0, 24(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a0, 28(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t1, 16(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t2, 20(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t3, 24(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a7, 16(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t0, 20(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw t1, 24(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lw a1, 28(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a4, a4, a5 @@ -1831,20 +1831,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t0, 8(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t1, 12(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a6, 16(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t2, 
20(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t3, 24(a0) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 28(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a6, 16(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a7, 20(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t0, 24(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a0, 28(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t1, 16(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t2, 20(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t3, 24(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a7, 16(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t0, 20(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw t1, 24(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lw a1, 28(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: or a4, a4, a5 @@ -1887,20 +1887,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-V-NEXT: lw t0, 8(a1) ; CHECK-UNALIGNED-RV32-V-NEXT: lw t1, 12(a1) ; CHECK-UNALIGNED-RV32-V-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV32-V-NEXT: lw a6, 16(a0) +; CHECK-UNALIGNED-RV32-V-NEXT: lw t2, 20(a0) +; CHECK-UNALIGNED-RV32-V-NEXT: lw t3, 24(a0) +; CHECK-UNALIGNED-RV32-V-NEXT: lw a0, 28(a0) ; CHECK-UNALIGNED-RV32-V-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV32-V-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV32-V-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV32-V-NEXT: lw a6, 16(a0) -; CHECK-UNALIGNED-RV32-V-NEXT: lw a7, 20(a0) -; CHECK-UNALIGNED-RV32-V-NEXT: lw t0, 24(a0) -; CHECK-UNALIGNED-RV32-V-NEXT: lw a0, 28(a0) -; CHECK-UNALIGNED-RV32-V-NEXT: lw t1, 16(a1) -; CHECK-UNALIGNED-RV32-V-NEXT: lw t2, 20(a1) -; CHECK-UNALIGNED-RV32-V-NEXT: lw t3, 24(a1) +; CHECK-UNALIGNED-RV32-V-NEXT: lw a7, 16(a1) +; CHECK-UNALIGNED-RV32-V-NEXT: lw t0, 20(a1) +; CHECK-UNALIGNED-RV32-V-NEXT: lw t1, 24(a1) ; CHECK-UNALIGNED-RV32-V-NEXT: lw a1, 28(a1) -; CHECK-UNALIGNED-RV32-V-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV32-V-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV32-V-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV32-V-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV32-V-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV32-V-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV32-V-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV32-V-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV32-V-NEXT: or a4, a4, a5 @@ -1998,20 +1998,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-NEXT: ld t0, 16(a1) ; CHECK-UNALIGNED-RV64-NEXT: ld t1, 24(a1) ; CHECK-UNALIGNED-RV64-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV64-NEXT: ld a6, 32(a0) +; CHECK-UNALIGNED-RV64-NEXT: ld t2, 40(a0) +; CHECK-UNALIGNED-RV64-NEXT: ld t3, 48(a0) +; CHECK-UNALIGNED-RV64-NEXT: ld a0, 55(a0) ; CHECK-UNALIGNED-RV64-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV64-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV64-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV64-NEXT: ld a6, 32(a0) -; CHECK-UNALIGNED-RV64-NEXT: ld a7, 40(a0) -; CHECK-UNALIGNED-RV64-NEXT: ld t0, 48(a0) -; CHECK-UNALIGNED-RV64-NEXT: ld a0, 55(a0) -; CHECK-UNALIGNED-RV64-NEXT: ld t1, 32(a1) -; CHECK-UNALIGNED-RV64-NEXT: ld t2, 40(a1) -; CHECK-UNALIGNED-RV64-NEXT: ld t3, 48(a1) +; CHECK-UNALIGNED-RV64-NEXT: ld a7, 
32(a1) +; CHECK-UNALIGNED-RV64-NEXT: ld t0, 40(a1) +; CHECK-UNALIGNED-RV64-NEXT: ld t1, 48(a1) ; CHECK-UNALIGNED-RV64-NEXT: ld a1, 55(a1) -; CHECK-UNALIGNED-RV64-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV64-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV64-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV64-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV64-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV64-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV64-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV64-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV64-NEXT: or a4, a4, a5 @@ -2034,20 +2034,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t0, 16(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t1, 24(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a6, 32(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t2, 40(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t3, 48(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 55(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a6, 32(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a7, 40(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t0, 48(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 55(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t1, 32(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t2, 40(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t3, 48(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a7, 32(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t0, 40(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t1, 48(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 55(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a4, a4, a5 @@ -2070,20 +2070,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t0, 16(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t1, 24(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a6, 32(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t2, 40(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t3, 48(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 55(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a6, 32(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a7, 40(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t0, 48(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 55(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t1, 32(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t2, 40(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t3, 48(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a7, 32(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t0, 40(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t1, 48(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 55(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a2, a2, 
a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a4, a4, a5 @@ -2106,20 +2106,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-V-NEXT: ld t0, 16(a1) ; CHECK-UNALIGNED-RV64-V-NEXT: ld t1, 24(a1) ; CHECK-UNALIGNED-RV64-V-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV64-V-NEXT: ld a6, 32(a0) +; CHECK-UNALIGNED-RV64-V-NEXT: ld t2, 40(a0) +; CHECK-UNALIGNED-RV64-V-NEXT: ld t3, 48(a0) +; CHECK-UNALIGNED-RV64-V-NEXT: ld a0, 55(a0) ; CHECK-UNALIGNED-RV64-V-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV64-V-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV64-V-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV64-V-NEXT: ld a6, 32(a0) -; CHECK-UNALIGNED-RV64-V-NEXT: ld a7, 40(a0) -; CHECK-UNALIGNED-RV64-V-NEXT: ld t0, 48(a0) -; CHECK-UNALIGNED-RV64-V-NEXT: ld a0, 55(a0) -; CHECK-UNALIGNED-RV64-V-NEXT: ld t1, 32(a1) -; CHECK-UNALIGNED-RV64-V-NEXT: ld t2, 40(a1) -; CHECK-UNALIGNED-RV64-V-NEXT: ld t3, 48(a1) +; CHECK-UNALIGNED-RV64-V-NEXT: ld a7, 32(a1) +; CHECK-UNALIGNED-RV64-V-NEXT: ld t0, 40(a1) +; CHECK-UNALIGNED-RV64-V-NEXT: ld t1, 48(a1) ; CHECK-UNALIGNED-RV64-V-NEXT: ld a1, 55(a1) -; CHECK-UNALIGNED-RV64-V-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV64-V-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV64-V-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV64-V-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV64-V-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV64-V-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV64-V-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV64-V-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV64-V-NEXT: or a4, a4, a5 @@ -2197,20 +2197,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-NEXT: ld t0, 16(a1) ; CHECK-UNALIGNED-RV64-NEXT: ld t1, 24(a1) ; CHECK-UNALIGNED-RV64-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV64-NEXT: ld a6, 32(a0) +; CHECK-UNALIGNED-RV64-NEXT: ld t2, 40(a0) +; CHECK-UNALIGNED-RV64-NEXT: ld t3, 48(a0) +; CHECK-UNALIGNED-RV64-NEXT: ld a0, 56(a0) ; CHECK-UNALIGNED-RV64-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV64-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV64-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV64-NEXT: ld a6, 32(a0) -; CHECK-UNALIGNED-RV64-NEXT: ld a7, 40(a0) -; CHECK-UNALIGNED-RV64-NEXT: ld t0, 48(a0) -; CHECK-UNALIGNED-RV64-NEXT: ld a0, 56(a0) -; CHECK-UNALIGNED-RV64-NEXT: ld t1, 32(a1) -; CHECK-UNALIGNED-RV64-NEXT: ld t2, 40(a1) -; CHECK-UNALIGNED-RV64-NEXT: ld t3, 48(a1) +; CHECK-UNALIGNED-RV64-NEXT: ld a7, 32(a1) +; CHECK-UNALIGNED-RV64-NEXT: ld t0, 40(a1) +; CHECK-UNALIGNED-RV64-NEXT: ld t1, 48(a1) ; CHECK-UNALIGNED-RV64-NEXT: ld a1, 56(a1) -; CHECK-UNALIGNED-RV64-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV64-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV64-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV64-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV64-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV64-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV64-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV64-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV64-NEXT: or a4, a4, a5 @@ -2233,20 +2233,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t0, 16(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t1, 24(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a6, 32(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t2, 40(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t3, 48(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 56(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a6, 32(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a7, 
40(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t0, 48(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a0, 56(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t1, 32(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t2, 40(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t3, 48(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a7, 32(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t0, 40(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld t1, 48(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ld a1, 56(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a4, a4, a5 @@ -2269,20 +2269,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t0, 16(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t1, 24(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a6, 32(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t2, 40(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t3, 48(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 56(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a6, 32(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a7, 40(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t0, 48(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a0, 56(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t1, 32(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t2, 40(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t3, 48(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a7, 32(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t0, 40(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld t1, 48(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ld a1, 56(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a7, a7, t2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a4, a4, a5 @@ -2305,20 +2305,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-V-NEXT: ld t0, 16(a1) ; CHECK-UNALIGNED-RV64-V-NEXT: ld t1, 24(a1) ; CHECK-UNALIGNED-RV64-V-NEXT: xor a2, a2, a6 +; CHECK-UNALIGNED-RV64-V-NEXT: ld a6, 32(a0) +; CHECK-UNALIGNED-RV64-V-NEXT: ld t2, 40(a0) +; CHECK-UNALIGNED-RV64-V-NEXT: ld t3, 48(a0) +; CHECK-UNALIGNED-RV64-V-NEXT: ld a0, 56(a0) ; CHECK-UNALIGNED-RV64-V-NEXT: xor a3, a3, a7 ; CHECK-UNALIGNED-RV64-V-NEXT: xor a4, a4, t0 ; CHECK-UNALIGNED-RV64-V-NEXT: xor a5, a5, t1 -; CHECK-UNALIGNED-RV64-V-NEXT: ld a6, 32(a0) -; CHECK-UNALIGNED-RV64-V-NEXT: ld a7, 40(a0) -; CHECK-UNALIGNED-RV64-V-NEXT: ld t0, 48(a0) -; CHECK-UNALIGNED-RV64-V-NEXT: ld a0, 56(a0) -; CHECK-UNALIGNED-RV64-V-NEXT: ld t1, 32(a1) -; CHECK-UNALIGNED-RV64-V-NEXT: ld t2, 40(a1) -; CHECK-UNALIGNED-RV64-V-NEXT: ld t3, 48(a1) +; CHECK-UNALIGNED-RV64-V-NEXT: ld a7, 32(a1) +; CHECK-UNALIGNED-RV64-V-NEXT: ld t0, 40(a1) +; CHECK-UNALIGNED-RV64-V-NEXT: ld t1, 48(a1) ; CHECK-UNALIGNED-RV64-V-NEXT: ld a1, 56(a1) -; CHECK-UNALIGNED-RV64-V-NEXT: xor a6, a6, t1 -; CHECK-UNALIGNED-RV64-V-NEXT: xor a7, a7, 
t2 -; CHECK-UNALIGNED-RV64-V-NEXT: xor t0, t0, t3 +; CHECK-UNALIGNED-RV64-V-NEXT: xor a6, a6, a7 +; CHECK-UNALIGNED-RV64-V-NEXT: xor a7, t2, t0 +; CHECK-UNALIGNED-RV64-V-NEXT: xor t0, t3, t1 ; CHECK-UNALIGNED-RV64-V-NEXT: xor a0, a0, a1 ; CHECK-UNALIGNED-RV64-V-NEXT: or a2, a2, a3 ; CHECK-UNALIGNED-RV64-V-NEXT: or a4, a4, a5 @@ -2979,8 +2979,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a0, 0(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a0, a0, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a0, a0, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a1, a1, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: ret @@ -2990,8 +2990,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret @@ -3001,8 +3001,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a0, 0(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a1, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a0, a0, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a0, a0, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a1, a1, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: ret @@ -3012,8 +3012,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sub a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret @@ -3148,8 +3148,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a2, 0(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a3, a3, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB24_2 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.1: # %loadbb1 @@ -3168,8 +3168,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB24_2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 @@ -3188,8 +3188,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a2, 0(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a3, 0(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: 
rev8 a2, a2 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a3, a3, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB24_2 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.1: # %loadbb1 @@ -3208,8 +3208,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB24_2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 @@ -3364,8 +3364,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a1, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 @@ -3388,8 +3388,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a1, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 @@ -3544,8 +3544,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB26_2 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 @@ -3582,8 +3582,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB26_2 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 @@ -3733,8 +3733,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: lh a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: srli a3, a3, 16 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: # %bb.2: @@ -3751,16 +3751,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) ; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lh a1, 4(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 48 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: @@ -3783,8 +3783,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lh a1, 4(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a2, a2, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: srli a3, a3, 16 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: # %bb.2: @@ -3801,16 +3801,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a0, 4(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lh a1, 4(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 48 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB27_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: @@ -3974,16 +3974,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB28_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 3(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: bne a2, a3, .LBB28_3 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: # %bb.2: @@ -4022,16 +4022,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a3 +; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB28_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.1: # %loadbb1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 3(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 3(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a2, a2, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a3, a3, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: bne a2, a3, .LBB28_3 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: # %bb.2: @@ -5809,25 +5809,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-NEXT: lbu a2, 2(a0) -; CHECK-ALIGNED-RV32-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a5, 1(a0) ; CHECK-ALIGNED-RV32-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV32-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV32-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV32-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV32-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-NEXT: ret @@ -5835,25 +5835,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-NEXT: lbu a2, 2(a0) -; CHECK-ALIGNED-RV64-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV64-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV64-NEXT: lbu a5, 1(a0) ; 
CHECK-ALIGNED-RV64-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV64-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV64-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV64-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-NEXT: ret @@ -5861,25 +5861,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 2(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a5, 1(a0) ; CHECK-ALIGNED-RV32-ZBB-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV32-ZBB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-ZBB-NEXT: ret @@ -5887,25 +5887,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 2(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 8 +; 
CHECK-ALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a5, 1(a0) ; CHECK-ALIGNED-RV64-ZBB-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-ZBB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-ZBB-NEXT: ret @@ -5916,16 +5916,16 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a3, 1(a1) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a4, 2(a1) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a1, 3(a1) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a5, 0(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a6, 1(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a7, 2(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a5, 1(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a6, 2(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a7, 3(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a0, 0(a0) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a1, a4, a1 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a2, a2, a3 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a3, a6, a7 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a0, a0, a5 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a1, a2, a1 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a0, a7, a0 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a2, a5, a6 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a0, a2, a0 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a0, a0, a3 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret @@ -5936,20 +5936,20 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 1(a1) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a4, 2(a1) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a5, 0(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a6, 1(a0) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a2, a2, a3 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 2(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a5, a5, a6 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a1, a1, 24 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a5, 2(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a2, a3, a4 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a5, a5, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a3, a3, 16 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a1, a1, a2 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a5 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret @@ -5957,25 +5957,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; 
CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-V-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-V-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 2(a0) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-V-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 1(a0) ; CHECK-ALIGNED-RV32-V-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-V-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV32-V-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV32-V-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV32-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-V-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-V-NEXT: ret @@ -5983,25 +5983,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-V-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-V-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 1(a0) -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 2(a0) -; CHECK-ALIGNED-RV64-V-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 0(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 1(a0) ; CHECK-ALIGNED-RV64-V-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-V-NEXT: slli a2, a2, 16 +; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 8 +; CHECK-ALIGNED-RV64-V-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV64-V-NEXT: slli a0, a0, 24 +; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a4 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a3, a1 ; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a2 -; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-V-NEXT: seqz a0, 
a0 ; CHECK-ALIGNED-RV64-V-NEXT: ret @@ -6144,8 +6144,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret @@ -6164,8 +6164,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a0, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret @@ -6322,8 +6322,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a0, a1, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: ret @@ -6342,8 +6342,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a0, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a1, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a1, a1, 32 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a0, a1, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll index 5b8955ee0e0a0..1ab3722080f70 100644 --- a/llvm/test/CodeGen/RISCV/memcpy.ll +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -26,8 +26,8 @@ define i32 @t0() { ; RV32-NEXT: lui a0, %hi(src) ; RV32-NEXT: lw a1, %lo(src)(a0) ; RV32-NEXT: lui a2, %hi(dst) -; RV32-NEXT: sw a1, %lo(dst)(a2) ; RV32-NEXT: addi a0, a0, %lo(src) +; RV32-NEXT: sw a1, %lo(dst)(a2) ; RV32-NEXT: lw a1, 4(a0) ; RV32-NEXT: lh a3, 8(a0) ; RV32-NEXT: lbu a0, 10(a0) @@ -41,13 +41,13 @@ define i32 @t0() { ; RV64-LABEL: t0: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lui a0, %hi(src) -; RV64-NEXT: ld a1, %lo(src)(a0) -; RV64-NEXT: lui a2, %hi(dst) +; RV64-NEXT: lui a1, %hi(dst) +; RV64-NEXT: ld a2, %lo(src)(a0) ; RV64-NEXT: addi a0, a0, %lo(src) ; RV64-NEXT: lh a3, 8(a0) ; RV64-NEXT: lbu a0, 10(a0) -; RV64-NEXT: sd a1, %lo(dst)(a2) -; RV64-NEXT: addi a1, a2, %lo(dst) +; RV64-NEXT: sd a2, %lo(dst)(a1) +; RV64-NEXT: addi a1, a1, %lo(dst) ; RV64-NEXT: sh a3, 8(a1) ; RV64-NEXT: sb a0, 10(a1) ; RV64-NEXT: li a0, 0 @@ -103,29 +103,29 @@ define void @t1(ptr nocapture %C) nounwind { ; RV32-FAST-LABEL: t1: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lui a1, 1141 +; RV32-FAST-NEXT: lui a2, 300325 +; RV32-FAST-NEXT: lui a3, 132181 +; RV32-FAST-NEXT: lui a4, 340483 +; RV32-FAST-NEXT: lui a5, 267556 +; RV32-FAST-NEXT: lui a6, 337154 ; RV32-FAST-NEXT: addi a1, a1, -439 ; RV32-FAST-NEXT: sw a1, 27(a0) -; RV32-FAST-NEXT: lui a1, 300325 +; RV32-FAST-NEXT: lui a1, 320757 
+; RV32-FAST-NEXT: addi a2, a2, 1107 +; RV32-FAST-NEXT: addi a3, a3, -689 +; RV32-FAST-NEXT: addi a4, a4, -947 +; RV32-FAST-NEXT: sw a4, 16(a0) +; RV32-FAST-NEXT: sw a3, 20(a0) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lui a2, 365861 +; RV32-FAST-NEXT: addi a3, a5, 1871 +; RV32-FAST-NEXT: addi a4, a6, 69 ; RV32-FAST-NEXT: addi a1, a1, 1107 -; RV32-FAST-NEXT: lui a2, 132181 -; RV32-FAST-NEXT: addi a2, a2, -689 -; RV32-FAST-NEXT: lui a3, 340483 -; RV32-FAST-NEXT: addi a3, a3, -947 -; RV32-FAST-NEXT: sw a3, 16(a0) -; RV32-FAST-NEXT: sw a2, 20(a0) -; RV32-FAST-NEXT: sw a1, 24(a0) -; RV32-FAST-NEXT: lui a1, 267556 -; RV32-FAST-NEXT: addi a1, a1, 1871 -; RV32-FAST-NEXT: lui a2, 337154 -; RV32-FAST-NEXT: addi a2, a2, 69 -; RV32-FAST-NEXT: lui a3, 320757 -; RV32-FAST-NEXT: addi a3, a3, 1107 -; RV32-FAST-NEXT: lui a4, 365861 -; RV32-FAST-NEXT: addi a4, a4, -1980 -; RV32-FAST-NEXT: sw a4, 0(a0) -; RV32-FAST-NEXT: sw a3, 4(a0) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: sw a1, 12(a0) +; RV32-FAST-NEXT: addi a2, a2, -1980 +; RV32-FAST-NEXT: sw a2, 0(a0) +; RV32-FAST-NEXT: sw a1, 4(a0) +; RV32-FAST-NEXT: sw a4, 8(a0) +; RV32-FAST-NEXT: sw a3, 12(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: t1: @@ -164,16 +164,16 @@ define void @t2(ptr nocapture %C) nounwind { ; RV64-FAST-LABEL: t2: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: lui a1, %hi(.L.str2) -; RV64-FAST-NEXT: ld a2, %lo(.L.str2)(a1) -; RV64-FAST-NEXT: lui a3, 1156 -; RV64-FAST-NEXT: addi a3, a3, 332 -; RV64-FAST-NEXT: sw a3, 32(a0) +; RV64-FAST-NEXT: lui a2, 1156 +; RV64-FAST-NEXT: ld a3, %lo(.L.str2)(a1) +; RV64-FAST-NEXT: addi a2, a2, 332 ; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-FAST-NEXT: ld a3, 8(a1) +; RV64-FAST-NEXT: sw a2, 32(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) ; RV64-FAST-NEXT: ld a4, 16(a1) ; RV64-FAST-NEXT: ld a1, 24(a1) -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a3, 8(a0) +; RV64-FAST-NEXT: sd a3, 0(a0) +; RV64-FAST-NEXT: sd a2, 8(a0) ; RV64-FAST-NEXT: sd a4, 16(a0) ; RV64-FAST-NEXT: sd a1, 24(a0) ; RV64-FAST-NEXT: ret @@ -200,23 +200,23 @@ define void @t3(ptr nocapture %C) nounwind { ; RV32-FAST-LABEL: t3: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lui a1, 1109 -; RV32-FAST-NEXT: addi a1, a1, -689 ; RV32-FAST-NEXT: lui a2, 340483 +; RV32-FAST-NEXT: lui a3, 267556 +; RV32-FAST-NEXT: lui a4, 337154 +; RV32-FAST-NEXT: lui a5, 320757 +; RV32-FAST-NEXT: addi a1, a1, -689 ; RV32-FAST-NEXT: addi a2, a2, -947 ; RV32-FAST-NEXT: sw a2, 16(a0) ; RV32-FAST-NEXT: sw a1, 20(a0) -; RV32-FAST-NEXT: lui a1, 267556 -; RV32-FAST-NEXT: addi a1, a1, 1871 -; RV32-FAST-NEXT: lui a2, 337154 -; RV32-FAST-NEXT: addi a2, a2, 69 -; RV32-FAST-NEXT: lui a3, 320757 -; RV32-FAST-NEXT: addi a3, a3, 1107 -; RV32-FAST-NEXT: lui a4, 365861 -; RV32-FAST-NEXT: addi a4, a4, -1980 -; RV32-FAST-NEXT: sw a4, 0(a0) -; RV32-FAST-NEXT: sw a3, 4(a0) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: sw a1, 12(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a2, a3, 1871 +; RV32-FAST-NEXT: addi a3, a4, 69 +; RV32-FAST-NEXT: addi a4, a5, 1107 +; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a2, 12(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: t3: @@ -253,19 +253,19 @@ define void @t4(ptr nocapture %C) nounwind { ; RV32-FAST-LABEL: t4: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: li a1, 32 +; RV32-FAST-NEXT: lui a2, 132388 +; RV32-FAST-NEXT: lui a3, 337154 +; RV32-FAST-NEXT: lui a4, 320757 ; RV32-FAST-NEXT: sh 
a1, 16(a0) -; RV32-FAST-NEXT: lui a1, 132388 -; RV32-FAST-NEXT: addi a1, a1, 1871 -; RV32-FAST-NEXT: lui a2, 337154 -; RV32-FAST-NEXT: addi a2, a2, 69 -; RV32-FAST-NEXT: lui a3, 320757 -; RV32-FAST-NEXT: addi a3, a3, 1107 -; RV32-FAST-NEXT: lui a4, 365861 -; RV32-FAST-NEXT: addi a4, a4, -1980 -; RV32-FAST-NEXT: sw a4, 0(a0) -; RV32-FAST-NEXT: sw a3, 4(a0) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: sw a1, 12(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a2, a2, 1871 +; RV32-FAST-NEXT: addi a3, a3, 69 +; RV32-FAST-NEXT: addi a4, a4, 1107 +; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a2, 12(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: t4: @@ -289,34 +289,34 @@ define void @t5(ptr nocapture %C) nounwind { ; RV32: # %bb.0: # %entry ; RV32-NEXT: li a1, 84 ; RV32-NEXT: li a2, 83 +; RV32-NEXT: li a3, 89 +; RV32-NEXT: li a4, 82 +; RV32-NEXT: li a5, 72 +; RV32-NEXT: li a6, 68 ; RV32-NEXT: sb a2, 4(a0) ; RV32-NEXT: sb a1, 5(a0) ; RV32-NEXT: sb zero, 6(a0) -; RV32-NEXT: li a1, 89 -; RV32-NEXT: li a2, 82 -; RV32-NEXT: li a3, 72 -; RV32-NEXT: li a4, 68 -; RV32-NEXT: sb a4, 0(a0) -; RV32-NEXT: sb a3, 1(a0) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: sb a1, 3(a0) +; RV32-NEXT: sb a6, 0(a0) +; RV32-NEXT: sb a5, 1(a0) +; RV32-NEXT: sb a4, 2(a0) +; RV32-NEXT: sb a3, 3(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: t5: ; RV64: # %bb.0: # %entry ; RV64-NEXT: li a1, 84 ; RV64-NEXT: li a2, 83 +; RV64-NEXT: li a3, 89 +; RV64-NEXT: li a4, 82 +; RV64-NEXT: li a5, 72 +; RV64-NEXT: li a6, 68 ; RV64-NEXT: sb a2, 4(a0) ; RV64-NEXT: sb a1, 5(a0) ; RV64-NEXT: sb zero, 6(a0) -; RV64-NEXT: li a1, 89 -; RV64-NEXT: li a2, 82 -; RV64-NEXT: li a3, 72 -; RV64-NEXT: li a4, 68 -; RV64-NEXT: sb a4, 0(a0) -; RV64-NEXT: sb a3, 1(a0) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: sb a1, 3(a0) +; RV64-NEXT: sb a6, 0(a0) +; RV64-NEXT: sb a5, 1(a0) +; RV64-NEXT: sb a4, 2(a0) +; RV64-NEXT: sb a3, 3(a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: t5: diff --git a/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir b/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir index 08df378f27189..21398d315ec93 100644 --- a/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir +++ b/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir @@ -1,15 +1,15 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=riscv64 -x mir -mcpu=sifive-p470 -verify-misched -enable-post-misched=false \ # RUN: -riscv-postmisched-load-store-clustering=false -debug-only=machine-scheduler \ -# RUN: -start-before=machine-scheduler -stop-after=postmisched -o - 2>&1 < %s \ +# RUN: -start-before=machine-scheduler -stop-after=postmisched -misched-regpressure=false -o - 2>&1 < %s \ # RUN: | FileCheck -check-prefix=NOPOSTMISCHED %s # RUN: llc -mtriple=riscv64 -x mir -mcpu=sifive-p470 -mattr=+use-postra-scheduler -verify-misched -enable-post-misched=true \ # RUN: -riscv-postmisched-load-store-clustering=false -debug-only=machine-scheduler \ -# RUN: -start-before=machine-scheduler -stop-after=postmisched -o - 2>&1 < %s \ +# RUN: -start-before=machine-scheduler -stop-after=postmisched -misched-regpressure=false -o - 2>&1 < %s \ # RUN: | FileCheck -check-prefix=NOCLUSTER %s # RUN: llc -mtriple=riscv64 -x mir -mcpu=sifive-p470 -mattr=+use-postra-scheduler -verify-misched -enable-post-misched=true \ # RUN: -debug-only=machine-scheduler \ -# RUN: -start-before=machine-scheduler -stop-after=postmisched -o - 2>&1 < %s \ 
+# RUN: -start-before=machine-scheduler -stop-after=postmisched -misched-regpressure=false -o - 2>&1 < %s \ # RUN: | FileCheck -check-prefix=MEMCLUSTER %s # REQUIRES: asserts diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index e9b84b3cd97ed..548c7e1c6ea8c 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -142,8 +142,8 @@ define i64 @mul64(i64 %a, i64 %b) nounwind { ; RV32IM: # %bb.0: ; RV32IM-NEXT: mul a3, a0, a3 ; RV32IM-NEXT: mulhu a4, a0, a2 -; RV32IM-NEXT: add a3, a4, a3 ; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: add a3, a4, a3 ; RV32IM-NEXT: add a1, a3, a1 ; RV32IM-NEXT: mul a0, a0, a2 ; RV32IM-NEXT: ret @@ -163,26 +163,25 @@ define i64 @mul64(i64 %a, i64 %b) nounwind { define i64 @mul64_constant(i64 %a) nounwind { ; RV32I-LABEL: mul64_constant: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a3, a0, 2 -; RV32I-NEXT: add a2, a3, a0 -; RV32I-NEXT: sltu a3, a2, a3 -; RV32I-NEXT: srli a0, a0, 30 +; RV32I-NEXT: slli a2, a0, 2 +; RV32I-NEXT: srli a3, a0, 30 ; RV32I-NEXT: slli a4, a1, 2 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: add a1, a0, a3 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: mul64_constant: ; RV32IM: # %bb.0: ; RV32IM-NEXT: li a2, 5 -; RV32IM-NEXT: mulhu a2, a0, a2 ; RV32IM-NEXT: slli a3, a1, 2 ; RV32IM-NEXT: add a1, a3, a1 +; RV32IM-NEXT: slli a3, a0, 2 +; RV32IM-NEXT: mulhu a2, a0, a2 ; RV32IM-NEXT: add a1, a2, a1 -; RV32IM-NEXT: slli a2, a0, 2 -; RV32IM-NEXT: add a0, a2, a0 +; RV32IM-NEXT: add a0, a3, a0 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: mul64_constant: @@ -251,13 +250,13 @@ define i32 @mulhs_positive_constant(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srai a1, a0, 31 ; RV32I-NEXT: slli a2, a0, 2 -; RV32I-NEXT: add a3, a2, a0 -; RV32I-NEXT: sltu a2, a3, a2 -; RV32I-NEXT: srli a0, a0, 30 -; RV32I-NEXT: slli a3, a1, 2 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: srli a3, a0, 30 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: sltu a0, a0, a2 +; RV32I-NEXT: slli a2, a1, 2 +; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: mulhs_positive_constant: @@ -293,15 +292,15 @@ define i32 @mulhs_negative_constant(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srai a1, a0, 31 ; RV32I-NEXT: slli a2, a0, 2 -; RV32I-NEXT: add a3, a2, a0 -; RV32I-NEXT: sltu a2, a3, a2 -; RV32I-NEXT: srli a0, a0, 30 +; RV32I-NEXT: srli a3, a0, 30 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: slli a4, a1, 2 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: snez a1, a3 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: ret ; @@ -704,26 +703,25 @@ define i32 @muli32_p63(i32 %a) nounwind { define i64 @muli64_p65(i64 %a) nounwind { ; RV32I-LABEL: muli64_p65: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a3, a0, 6 -; RV32I-NEXT: add a2, a3, a0 -; RV32I-NEXT: sltu a3, a2, a3 -; RV32I-NEXT: srli a0, a0, 26 +; RV32I-NEXT: slli a2, a0, 6 +; RV32I-NEXT: srli a3, a0, 26 ; RV32I-NEXT: slli a4, a1, 6 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: add a1, a0, a3 -; 
RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli64_p65: ; RV32IM: # %bb.0: ; RV32IM-NEXT: li a2, 65 -; RV32IM-NEXT: mulhu a2, a0, a2 ; RV32IM-NEXT: slli a3, a1, 6 ; RV32IM-NEXT: add a1, a3, a1 +; RV32IM-NEXT: slli a3, a0, 6 +; RV32IM-NEXT: mulhu a2, a0, a2 ; RV32IM-NEXT: add a1, a2, a1 -; RV32IM-NEXT: slli a2, a0, 6 -; RV32IM-NEXT: add a0, a2, a0 +; RV32IM-NEXT: add a0, a3, a0 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: muli64_p65: @@ -745,24 +743,24 @@ define i64 @muli64_p63(i64 %a) nounwind { ; RV32I-LABEL: muli64_p63: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a0, 6 -; RV32I-NEXT: sltu a3, a2, a0 -; RV32I-NEXT: srli a4, a0, 26 -; RV32I-NEXT: slli a5, a1, 6 -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: sub a1, a4, a1 -; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: srli a3, a0, 26 +; RV32I-NEXT: slli a4, a1, 6 +; RV32I-NEXT: sltu a5, a2, a0 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sub a1, a1, a5 ; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli64_p63: ; RV32IM: # %bb.0: ; RV32IM-NEXT: li a2, 63 -; RV32IM-NEXT: mulhu a2, a0, a2 ; RV32IM-NEXT: slli a3, a1, 6 ; RV32IM-NEXT: sub a1, a3, a1 +; RV32IM-NEXT: slli a3, a0, 6 +; RV32IM-NEXT: mulhu a2, a0, a2 ; RV32IM-NEXT: add a1, a2, a1 -; RV32IM-NEXT: slli a2, a0, 6 -; RV32IM-NEXT: sub a0, a2, a0 +; RV32IM-NEXT: sub a0, a3, a0 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: muli64_p63: @@ -846,12 +844,12 @@ define i64 @muli64_m63(i64 %a) nounwind { ; RV32I-LABEL: muli64_m63: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a0, 6 -; RV32I-NEXT: sltu a3, a0, a2 -; RV32I-NEXT: srli a4, a0, 26 -; RV32I-NEXT: slli a5, a1, 6 -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: srli a3, a0, 26 +; RV32I-NEXT: slli a4, a1, 6 +; RV32I-NEXT: sltu a5, a0, a2 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a1, a1, a5 ; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret ; @@ -886,17 +884,17 @@ define i64 @muli64_m65(i64 %a) nounwind { ; RV32I-LABEL: muli64_m65: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a0, 6 -; RV32I-NEXT: add a3, a2, a0 -; RV32I-NEXT: sltu a2, a3, a2 -; RV32I-NEXT: srli a0, a0, 26 +; RV32I-NEXT: srli a3, a0, 26 ; RV32I-NEXT: slli a4, a1, 6 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: snez a1, a3 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a1, a1, a0 -; RV32I-NEXT: neg a0, a3 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: snez a3, a0 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: neg a2, a3 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli64_m65: @@ -1129,11 +1127,11 @@ define i64 @muli64_p4352(i64 %a) nounwind { ; RV32I-NEXT: srli a3, a0, 20 ; RV32I-NEXT: slli a1, a1, 12 ; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: slli a3, a0, 8 +; RV32I-NEXT: slli a4, a0, 12 +; RV32I-NEXT: add a0, a4, a3 ; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: slli a2, a0, 8 -; RV32I-NEXT: slli a3, a0, 12 -; RV32I-NEXT: add a0, a3, a2 -; RV32I-NEXT: sltu a2, a0, a3 +; RV32I-NEXT: sltu a2, a0, a4 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; @@ -1173,12 +1171,12 @@ define i64 @muli64_p3840(i64 %a) nounwind { ; RV32I-NEXT: srli a3, a0, 20 ; RV32I-NEXT: slli a1, a1, 12 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: slli a2, a0, 8 +; 
RV32I-NEXT: slli a3, a0, 8 ; RV32I-NEXT: slli a0, a0, 12 -; RV32I-NEXT: sltu a3, a0, a2 -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sltu a2, a0, a3 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, a0, a3 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli64_p3840: @@ -1261,12 +1259,12 @@ define i64 @muli64_m3840(i64 %a) nounwind { ; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: slli a2, a0, 12 +; RV32I-NEXT: slli a3, a0, 12 ; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: sltu a3, a0, a2 -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sltu a2, a0, a3 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, a0, a3 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli64_m3840: @@ -1300,105 +1298,103 @@ define i64 @muli64_m3840(i64 %a) nounwind { define i128 @muli128_m3840(i128 %a) nounwind { ; RV32I-LABEL: muli128_m3840: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a3, 8(a1) -; RV32I-NEXT: lw a6, 0(a1) -; RV32I-NEXT: lw a5, 12(a1) -; RV32I-NEXT: srli a1, a4, 20 -; RV32I-NEXT: slli a2, a3, 12 -; RV32I-NEXT: or a1, a2, a1 -; RV32I-NEXT: srli a2, a4, 24 -; RV32I-NEXT: slli a7, a3, 8 -; RV32I-NEXT: or a2, a7, a2 -; RV32I-NEXT: sltu t0, a2, a1 -; RV32I-NEXT: srli a7, a3, 20 -; RV32I-NEXT: slli t1, a5, 12 -; RV32I-NEXT: or a7, t1, a7 -; RV32I-NEXT: srli a3, a3, 24 -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: sub t1, a3, a7 -; RV32I-NEXT: srli a3, a6, 20 -; RV32I-NEXT: slli a5, a4, 12 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: srli a5, a6, 24 +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a2, 8(a1) +; RV32I-NEXT: lw a5, 0(a1) +; RV32I-NEXT: lw a4, 12(a1) +; RV32I-NEXT: srli a1, a3, 20 +; RV32I-NEXT: slli a6, a2, 12 +; RV32I-NEXT: srli a7, a3, 24 +; RV32I-NEXT: slli t0, a2, 8 +; RV32I-NEXT: srli t1, a2, 20 +; RV32I-NEXT: or a1, a6, a1 +; RV32I-NEXT: slli a6, a4, 12 +; RV32I-NEXT: srli t2, a2, 24 ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a5, a4, a5 -; RV32I-NEXT: slli a4, a6, 12 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: sltu a7, a6, a4 -; RV32I-NEXT: sub t0, t1, t0 +; RV32I-NEXT: or a2, t0, a7 +; RV32I-NEXT: srli a7, a5, 20 +; RV32I-NEXT: or a6, a6, t1 +; RV32I-NEXT: slli t0, a3, 12 +; RV32I-NEXT: or t1, a4, t2 +; RV32I-NEXT: srli t2, a5, 24 +; RV32I-NEXT: slli t3, a3, 8 +; RV32I-NEXT: or a3, t0, a7 +; RV32I-NEXT: slli a4, a5, 12 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or t0, t3, t2 +; RV32I-NEXT: sltu t2, a2, a1 +; RV32I-NEXT: sub a6, t1, a6 +; RV32I-NEXT: sltu a7, a5, a4 +; RV32I-NEXT: sub a6, a6, t2 ; RV32I-NEXT: mv t1, a7 -; RV32I-NEXT: beq a5, a3, .LBB36_2 +; RV32I-NEXT: beq t0, a3, .LBB36_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a5, a3 +; RV32I-NEXT: sltu t1, t0, a3 ; RV32I-NEXT: .LBB36_2: ; RV32I-NEXT: sub a2, a2, a1 -; RV32I-NEXT: sltu a1, a2, t1 -; RV32I-NEXT: sub a1, t0, a1 +; RV32I-NEXT: sub a1, t0, a3 +; RV32I-NEXT: sub a5, a5, a4 +; RV32I-NEXT: sltu a3, a2, t1 ; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sub a5, a5, a3 -; RV32I-NEXT: sub a3, a5, a7 -; RV32I-NEXT: sub a4, a6, a4 -; RV32I-NEXT: sw a4, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sub a1, a1, a7 +; RV32I-NEXT: sub a3, a6, a3 +; RV32I-NEXT: sw a5, 0(a0) +; RV32I-NEXT: sw a1, 4(a0) ; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a3, 12(a0) ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli128_m3840: ; RV32IM: # %bb.0: ; RV32IM-NEXT: addi sp, sp, -16 ; RV32IM-NEXT: sw s0, 
12(sp) # 4-byte Folded Spill -; RV32IM-NEXT: sw s1, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw a2, 0(a1) -; RV32IM-NEXT: lw a3, 4(a1) +; RV32IM-NEXT: lw a3, 0(a1) +; RV32IM-NEXT: lw a2, 4(a1) ; RV32IM-NEXT: lw a4, 8(a1) ; RV32IM-NEXT: lw a1, 12(a1) ; RV32IM-NEXT: li a5, -15 +; RV32IM-NEXT: li a6, -1 ; RV32IM-NEXT: slli a5, a5, 8 -; RV32IM-NEXT: mulhu a6, a2, a5 -; RV32IM-NEXT: mul a7, a3, a5 -; RV32IM-NEXT: add a6, a7, a6 -; RV32IM-NEXT: sltu a7, a6, a7 -; RV32IM-NEXT: mulhu t0, a3, a5 -; RV32IM-NEXT: add a7, t0, a7 -; RV32IM-NEXT: sub a6, a6, a2 -; RV32IM-NEXT: neg t0, a2 -; RV32IM-NEXT: sltu t1, a6, t0 -; RV32IM-NEXT: li t2, -1 -; RV32IM-NEXT: mulhu t3, a2, t2 -; RV32IM-NEXT: add t1, t3, t1 -; RV32IM-NEXT: add t1, a7, t1 -; RV32IM-NEXT: sub t4, t1, a3 -; RV32IM-NEXT: mul t5, a4, a5 -; RV32IM-NEXT: sub t5, t5, a2 -; RV32IM-NEXT: add t6, t4, t5 -; RV32IM-NEXT: sltu s0, t6, t4 -; RV32IM-NEXT: neg s1, a3 -; RV32IM-NEXT: sltu t4, t4, s1 -; RV32IM-NEXT: sltu a7, t1, a7 -; RV32IM-NEXT: mulhu t1, a3, t2 -; RV32IM-NEXT: add a7, t1, a7 -; RV32IM-NEXT: add a7, a7, t4 -; RV32IM-NEXT: sltu t0, t5, t0 +; RV32IM-NEXT: mulhu a7, a3, a5 +; RV32IM-NEXT: mul t0, a2, a5 +; RV32IM-NEXT: mulhu t1, a2, a5 +; RV32IM-NEXT: neg t2, a3 +; RV32IM-NEXT: mulhu t3, a3, a6 +; RV32IM-NEXT: mul t4, a4, a5 +; RV32IM-NEXT: neg t5, a2 +; RV32IM-NEXT: mulhu a6, a2, a6 ; RV32IM-NEXT: mul a1, a1, a5 -; RV32IM-NEXT: mulhu t1, a4, a5 -; RV32IM-NEXT: sub a4, t1, a4 +; RV32IM-NEXT: mulhu t6, a4, a5 +; RV32IM-NEXT: add s0, a3, a2 +; RV32IM-NEXT: mul a5, a3, a5 +; RV32IM-NEXT: add a7, t0, a7 +; RV32IM-NEXT: sub t4, t4, a3 +; RV32IM-NEXT: sub a4, t6, a4 +; RV32IM-NEXT: sub t6, t3, s0 +; RV32IM-NEXT: sltu t0, a7, t0 +; RV32IM-NEXT: sub a3, a7, a3 +; RV32IM-NEXT: sltu a7, t4, t2 ; RV32IM-NEXT: add a1, a4, a1 -; RV32IM-NEXT: add a3, a2, a3 -; RV32IM-NEXT: sub a3, t3, a3 -; RV32IM-NEXT: add a1, a3, a1 -; RV32IM-NEXT: add a1, a1, t0 -; RV32IM-NEXT: add a1, a7, a1 -; RV32IM-NEXT: add a1, a1, s0 -; RV32IM-NEXT: mul a2, a2, a5 -; RV32IM-NEXT: sw a2, 0(a0) -; RV32IM-NEXT: sw a6, 4(a0) -; RV32IM-NEXT: sw t6, 8(a0) +; RV32IM-NEXT: add t0, t1, t0 +; RV32IM-NEXT: sltu a4, a3, t2 +; RV32IM-NEXT: add a1, t6, a1 +; RV32IM-NEXT: add a4, t3, a4 +; RV32IM-NEXT: add a1, a1, a7 +; RV32IM-NEXT: add a4, t0, a4 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: sltu a4, a4, t0 +; RV32IM-NEXT: add t4, a2, t4 +; RV32IM-NEXT: sltu a7, a2, t5 +; RV32IM-NEXT: add a4, a6, a4 +; RV32IM-NEXT: sltu a2, t4, a2 +; RV32IM-NEXT: add a4, a4, a7 +; RV32IM-NEXT: add a1, a4, a1 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: sw a5, 0(a0) +; RV32IM-NEXT: sw a3, 4(a0) +; RV32IM-NEXT: sw t4, 8(a0) ; RV32IM-NEXT: sw a1, 12(a0) ; RV32IM-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32IM-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32IM-NEXT: addi sp, sp, 16 ; RV32IM-NEXT: ret ; @@ -1410,12 +1406,12 @@ define i128 @muli128_m3840(i128 %a) nounwind { ; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: slli a1, a1, 8 ; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: sub a1, a1, a2 -; RV64I-NEXT: slli a2, a0, 12 +; RV64I-NEXT: slli a3, a0, 12 ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: sltu a3, a0, a2 -; RV64I-NEXT: sub a1, a1, a3 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: sltu a2, a0, a3 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a3 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: muli128_m3840: @@ -1435,40 +1431,40 @@ define i128 @muli128_m3840(i128 %a) nounwind { define i128 @muli128_m63(i128 %a) nounwind { ; RV32I-LABEL: muli128_m63: ; RV32I: # %bb.0: -; 
RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw a5, 12(a1) -; RV32I-NEXT: slli a1, a2, 6 -; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: srli a7, a2, 26 -; RV32I-NEXT: slli t0, a3, 6 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: mv t0, a4 -; RV32I-NEXT: beq a3, a7, .LBB37_2 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a4, 4(a1) +; RV32I-NEXT: lw a2, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: slli a6, a3, 6 +; RV32I-NEXT: srli a5, a3, 26 +; RV32I-NEXT: slli t0, a4, 6 +; RV32I-NEXT: sltu a7, a3, a6 +; RV32I-NEXT: or t0, t0, a5 +; RV32I-NEXT: mv a5, a7 +; RV32I-NEXT: beq a4, t0, .LBB37_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t0, a3, a7 +; RV32I-NEXT: sltu a5, a4, t0 ; RV32I-NEXT: .LBB37_2: -; RV32I-NEXT: srli t1, a3, 26 -; RV32I-NEXT: slli t2, a6, 6 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: sub t2, a6, t1 -; RV32I-NEXT: sltu t3, t2, t0 -; RV32I-NEXT: sltu t1, a6, t1 -; RV32I-NEXT: srli a6, a6, 26 -; RV32I-NEXT: slli t4, a5, 6 -; RV32I-NEXT: or a6, t4, a6 -; RV32I-NEXT: sub a5, a5, a6 -; RV32I-NEXT: sub a5, a5, t1 -; RV32I-NEXT: sub a5, a5, t3 -; RV32I-NEXT: sub a6, t2, t0 -; RV32I-NEXT: sub a3, a3, a7 -; RV32I-NEXT: sub a3, a3, a4 -; RV32I-NEXT: sub a2, a2, a1 -; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a6, 8(a0) -; RV32I-NEXT: sw a5, 12(a0) +; RV32I-NEXT: srli t1, a4, 26 +; RV32I-NEXT: slli t2, a2, 6 +; RV32I-NEXT: srli t3, a2, 26 +; RV32I-NEXT: slli t4, a1, 6 +; RV32I-NEXT: sub a4, a4, t0 +; RV32I-NEXT: sub a3, a3, a6 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or t0, t4, t3 +; RV32I-NEXT: sub a4, a4, a7 +; RV32I-NEXT: sub a7, a2, a6 +; RV32I-NEXT: sltu a2, a2, a6 +; RV32I-NEXT: sub a1, a1, t0 +; RV32I-NEXT: sltu a6, a7, a5 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a2, a7, a5 +; RV32I-NEXT: sub a1, a1, a6 +; RV32I-NEXT: sw a3, 0(a0) +; RV32I-NEXT: sw a4, 4(a0) +; RV32I-NEXT: sw a2, 8(a0) +; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli128_m63: @@ -1476,54 +1472,54 @@ define i128 @muli128_m63(i128 %a) nounwind { ; RV32IM-NEXT: addi sp, sp, -16 ; RV32IM-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s1, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw a2, 0(a1) -; RV32IM-NEXT: lw a3, 4(a1) +; RV32IM-NEXT: lw a3, 0(a1) +; RV32IM-NEXT: lw a2, 4(a1) ; RV32IM-NEXT: lw a4, 8(a1) ; RV32IM-NEXT: lw a1, 12(a1) ; RV32IM-NEXT: li a5, -63 -; RV32IM-NEXT: mulhu a6, a2, a5 -; RV32IM-NEXT: slli a7, a3, 6 -; RV32IM-NEXT: sub a7, a3, a7 -; RV32IM-NEXT: add a6, a7, a6 -; RV32IM-NEXT: sltu a7, a6, a7 -; RV32IM-NEXT: mulhu t0, a3, a5 -; RV32IM-NEXT: add a7, t0, a7 -; RV32IM-NEXT: sub a6, a6, a2 -; RV32IM-NEXT: neg t0, a2 -; RV32IM-NEXT: sltu t1, a6, t0 -; RV32IM-NEXT: li t2, -1 -; RV32IM-NEXT: mulhu t3, a2, t2 -; RV32IM-NEXT: add t1, t3, t1 -; RV32IM-NEXT: add t1, a7, t1 -; RV32IM-NEXT: sub t4, t1, a3 -; RV32IM-NEXT: slli t5, a4, 6 -; RV32IM-NEXT: sub t6, a4, a2 -; RV32IM-NEXT: sub t5, t6, t5 -; RV32IM-NEXT: add t6, t4, t5 -; RV32IM-NEXT: sltu s0, t6, t4 -; RV32IM-NEXT: neg s1, a3 -; RV32IM-NEXT: sltu t4, t4, s1 -; RV32IM-NEXT: sltu a7, t1, a7 -; RV32IM-NEXT: mulhu t1, a3, t2 -; RV32IM-NEXT: add a7, t1, a7 -; RV32IM-NEXT: add a7, a7, t4 -; RV32IM-NEXT: sltu t0, t5, t0 -; RV32IM-NEXT: slli t1, a1, 6 -; RV32IM-NEXT: sub a1, a1, t1 +; RV32IM-NEXT: li a6, -1 +; RV32IM-NEXT: mulhu a7, a3, a5 +; RV32IM-NEXT: slli t0, a2, 6 +; RV32IM-NEXT: mulhu t1, a2, a5 +; RV32IM-NEXT: neg t2, a3 +; RV32IM-NEXT: mulhu t3, a3, a6 +; RV32IM-NEXT: slli t4, a4, 6 +; RV32IM-NEXT: sub t5, a4, a3 +; 
RV32IM-NEXT: neg t6, a2 +; RV32IM-NEXT: mulhu a6, a2, a6 +; RV32IM-NEXT: slli s0, a1, 6 ; RV32IM-NEXT: mulhu a5, a4, a5 +; RV32IM-NEXT: add s1, a3, a2 +; RV32IM-NEXT: sub t4, t5, t4 +; RV32IM-NEXT: slli t5, a3, 6 +; RV32IM-NEXT: sub t0, a2, t0 +; RV32IM-NEXT: sub a1, a1, s0 ; RV32IM-NEXT: sub a5, a5, a4 +; RV32IM-NEXT: sub a4, t3, s1 +; RV32IM-NEXT: sub t5, a3, t5 +; RV32IM-NEXT: add a7, t0, a7 +; RV32IM-NEXT: sltu s0, t4, t2 ; RV32IM-NEXT: add a1, a5, a1 -; RV32IM-NEXT: add a3, a2, a3 -; RV32IM-NEXT: sub a3, t3, a3 -; RV32IM-NEXT: add a1, a3, a1 -; RV32IM-NEXT: add a1, a1, t0 -; RV32IM-NEXT: add a1, a7, a1 +; RV32IM-NEXT: sltu a5, a7, t0 +; RV32IM-NEXT: sub a3, a7, a3 +; RV32IM-NEXT: add a1, a4, a1 +; RV32IM-NEXT: add a5, t1, a5 +; RV32IM-NEXT: sltu a4, a3, t2 ; RV32IM-NEXT: add a1, a1, s0 -; RV32IM-NEXT: slli a3, a2, 6 -; RV32IM-NEXT: sub a2, a2, a3 -; RV32IM-NEXT: sw a2, 0(a0) -; RV32IM-NEXT: sw a6, 4(a0) -; RV32IM-NEXT: sw t6, 8(a0) +; RV32IM-NEXT: add a4, t3, a4 +; RV32IM-NEXT: add a4, a5, a4 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: sltu a4, a4, a5 +; RV32IM-NEXT: add t4, a2, t4 +; RV32IM-NEXT: sltu a5, a2, t6 +; RV32IM-NEXT: add a4, a6, a4 +; RV32IM-NEXT: sltu a2, t4, a2 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: add a1, a4, a1 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: sw t5, 0(a0) +; RV32IM-NEXT: sw a3, 4(a0) +; RV32IM-NEXT: sw t4, 8(a0) ; RV32IM-NEXT: sw a1, 12(a0) ; RV32IM-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -1533,12 +1529,12 @@ define i128 @muli128_m63(i128 %a) nounwind { ; RV64I-LABEL: muli128_m63: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 6 -; RV64I-NEXT: sltu a3, a0, a2 -; RV64I-NEXT: srli a4, a0, 58 -; RV64I-NEXT: slli a5, a1, 6 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: srli a3, a0, 58 +; RV64I-NEXT: slli a4, a1, 6 +; RV64I-NEXT: sltu a5, a0, a2 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: sub a1, a1, a3 +; RV64I-NEXT: sub a1, a1, a5 ; RV64I-NEXT: sub a0, a0, a2 ; RV64I-NEXT: ret ; @@ -1619,17 +1615,17 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: call __muldi3 ; RV32I-NEXT: add s2, a0, s2 -; RV32I-NEXT: add a2, s9, s2 -; RV32I-NEXT: sltu a3, a2, s9 -; RV32I-NEXT: sltu a4, s9, s5 -; RV32I-NEXT: sltu a5, s8, s7 -; RV32I-NEXT: add a5, s6, a5 -; RV32I-NEXT: add a4, a5, a4 +; RV32I-NEXT: sltu a3, s9, s5 +; RV32I-NEXT: sltu a4, s8, s7 ; RV32I-NEXT: add a1, a1, s3 +; RV32I-NEXT: add a2, s9, s2 +; RV32I-NEXT: add a4, s6, a4 ; RV32I-NEXT: sltu a0, s2, a0 +; RV32I-NEXT: sltu a5, a2, s9 +; RV32I-NEXT: add a3, a4, a3 ; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: add a0, a4, a0 -; RV32I-NEXT: add a1, a0, a3 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: add a1, a0, a5 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload @@ -1650,33 +1646,32 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind { ; RV32IM-NEXT: srai a4, a3, 31 ; RV32IM-NEXT: mulhu a5, a0, a2 ; RV32IM-NEXT: mul a6, a1, a2 -; RV32IM-NEXT: add a5, a6, a5 -; RV32IM-NEXT: sltu a6, a5, a6 ; RV32IM-NEXT: mulhu a2, a1, a2 -; RV32IM-NEXT: add a6, a2, a6 -; RV32IM-NEXT: mul a2, a0, a3 -; RV32IM-NEXT: add a5, a2, a5 -; RV32IM-NEXT: sltu a2, a5, a2 -; RV32IM-NEXT: mulhu a5, a0, a3 -; RV32IM-NEXT: add a2, a5, a2 -; RV32IM-NEXT: add a5, a6, a2 -; RV32IM-NEXT: mul a7, a1, a3 -; RV32IM-NEXT: add t0, a7, a5 -; RV32IM-NEXT: mul t1, a4, a0 -; RV32IM-NEXT: add a2, t0, t1 -; RV32IM-NEXT: sltu t2, a2, t0 -; RV32IM-NEXT: sltu 
a7, t0, a7 -; RV32IM-NEXT: sltu a5, a5, a6 +; RV32IM-NEXT: mul a7, a0, a3 +; RV32IM-NEXT: mulhu t0, a0, a3 +; RV32IM-NEXT: mul t1, a1, a3 ; RV32IM-NEXT: mulhu a3, a1, a3 -; RV32IM-NEXT: add a3, a3, a5 -; RV32IM-NEXT: add a3, a3, a7 +; RV32IM-NEXT: add a5, a6, a5 +; RV32IM-NEXT: mul t2, a4, a0 ; RV32IM-NEXT: mul a1, a4, a1 ; RV32IM-NEXT: mulhu a0, a4, a0 -; RV32IM-NEXT: add a0, a0, a1 -; RV32IM-NEXT: add a0, a0, t1 -; RV32IM-NEXT: add a0, a3, a0 -; RV32IM-NEXT: add a1, a0, t2 -; RV32IM-NEXT: mv a0, a2 +; RV32IM-NEXT: sltu a4, a5, a6 +; RV32IM-NEXT: add a5, a7, a5 +; RV32IM-NEXT: add a1, a0, a1 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: sltu a0, a5, a7 +; RV32IM-NEXT: add a0, t0, a0 +; RV32IM-NEXT: add a0, a2, a0 +; RV32IM-NEXT: add a4, t1, a0 +; RV32IM-NEXT: sltu a2, a0, a2 +; RV32IM-NEXT: add a0, a4, t2 +; RV32IM-NEXT: sltu a5, a4, t1 +; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: sltu a3, a0, a4 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: add a1, a1, t2 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: add a1, a1, a3 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: mulhsu_i64: diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll index 676b413446116..7d6a6d7ed4ce6 100644 --- a/llvm/test/CodeGen/RISCV/neg-abs.ll +++ b/llvm/test/CodeGen/RISCV/neg-abs.ll @@ -81,8 +81,8 @@ define i64 @neg_abs64(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: srai a2, a1, 31 ; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: sltu a3, a2, a0 ; RV32I-NEXT: xor a1, a1, a2 +; RV32I-NEXT: sltu a3, a2, a0 ; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a0, a2, a0 @@ -92,8 +92,8 @@ define i64 @neg_abs64(i64 %x) { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: srai a2, a1, 31 ; RV32ZBB-NEXT: xor a0, a0, a2 -; RV32ZBB-NEXT: sltu a3, a2, a0 ; RV32ZBB-NEXT: xor a1, a1, a2 +; RV32ZBB-NEXT: sltu a3, a2, a0 ; RV32ZBB-NEXT: sub a1, a2, a1 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a0, a2, a0 @@ -121,8 +121,8 @@ define i64 @select_neg_abs64(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: srai a2, a1, 31 ; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: sltu a3, a2, a0 ; RV32I-NEXT: xor a1, a1, a2 +; RV32I-NEXT: sltu a3, a2, a0 ; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a0, a2, a0 @@ -132,8 +132,8 @@ define i64 @select_neg_abs64(i64 %x) { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: srai a2, a1, 31 ; RV32ZBB-NEXT: xor a0, a0, a2 -; RV32ZBB-NEXT: sltu a3, a2, a0 ; RV32ZBB-NEXT: xor a1, a1, a2 +; RV32ZBB-NEXT: sltu a3, a2, a0 ; RV32ZBB-NEXT: sub a1, a2, a1 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a0, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/or-is-add.ll b/llvm/test/CodeGen/RISCV/or-is-add.ll index 36a201d277675..73561675b17ec 100644 --- a/llvm/test/CodeGen/RISCV/or-is-add.ll +++ b/llvm/test/CodeGen/RISCV/or-is-add.ll @@ -58,8 +58,8 @@ define i64 @test4(i64 %x) { ; RV32: # %bb.0: ; RV32-NEXT: srli a2, a0, 28 ; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: or a1, a1, a2 ; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: or a1, a1, a2 ; RV32-NEXT: addi a0, a0, 13 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index 95b106f4d35ba..5a01d43fea56b 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -693,13 +693,12 @@ define i1 @uaddo_i64_decrement_alt(i64 %x, ptr %p) { ; RV32-LABEL: uaddo_i64_decrement_alt: ; RV32: # %bb.0: ; RV32-NEXT: or a3, a0, a1 -; RV32-NEXT: snez a3, a3 ; RV32-NEXT: seqz a4, a0 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: snez 
a0, a3 ; RV32-NEXT: sub a1, a1, a4 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: sw a0, 0(a2) +; RV32-NEXT: sw a5, 0(a2) ; RV32-NEXT: sw a1, 4(a2) -; RV32-NEXT: mv a0, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: uaddo_i64_decrement_alt: @@ -721,13 +720,12 @@ define i1 @uaddo_i64_decrement_alt_dom(i64 %x, ptr %p) { ; RV32-LABEL: uaddo_i64_decrement_alt_dom: ; RV32: # %bb.0: ; RV32-NEXT: or a3, a0, a1 -; RV32-NEXT: snez a3, a3 ; RV32-NEXT: seqz a4, a0 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: snez a0, a3 ; RV32-NEXT: sub a1, a1, a4 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: sw a0, 0(a2) +; RV32-NEXT: sw a5, 0(a2) ; RV32-NEXT: sw a1, 4(a2) -; RV32-NEXT: mv a0, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: uaddo_i64_decrement_alt_dom: @@ -800,10 +798,10 @@ define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) { ; RV32-NEXT: mv a5, a0 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub a6, a1, a3 -; RV32-NEXT: sub a6, a6, a0 ; RV32-NEXT: sub a5, a5, a2 +; RV32-NEXT: sub a2, a6, a0 ; RV32-NEXT: sw a5, 0(a4) -; RV32-NEXT: sw a6, 4(a4) +; RV32-NEXT: sw a2, 4(a4) ; RV32-NEXT: beq a1, a3, .LBB23_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: sltu a0, a1, a3 @@ -882,8 +880,8 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) { ; RV32-LABEL: usubo_ult_constant_op0_i16: ; RV32: # %bb.0: ; RV32-NEXT: slli a2, a0, 16 -; RV32-NEXT: srli a2, a2, 16 ; RV32-NEXT: li a3, 43 +; RV32-NEXT: srli a2, a2, 16 ; RV32-NEXT: sub a3, a3, a0 ; RV32-NEXT: sltiu a0, a2, 44 ; RV32-NEXT: xori a0, a0, 1 @@ -893,8 +891,8 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) { ; RV64-LABEL: usubo_ult_constant_op0_i16: ; RV64: # %bb.0: ; RV64-NEXT: slli a2, a0, 48 -; RV64-NEXT: srli a2, a2, 48 ; RV64-NEXT: li a3, 43 +; RV64-NEXT: srli a2, a2, 48 ; RV64-NEXT: subw a3, a3, a0 ; RV64-NEXT: sltiu a0, a2, 44 ; RV64-NEXT: xori a0, a0, 1 @@ -1015,10 +1013,10 @@ define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: mv a7, a0 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub t0, a1, a3 -; RV32-NEXT: sub t0, t0, a0 ; RV32-NEXT: sub a2, a7, a2 +; RV32-NEXT: sub a7, t0, a0 ; RV32-NEXT: sw a2, 0(a4) -; RV32-NEXT: sw t0, 4(a4) +; RV32-NEXT: sw a7, 4(a4) ; RV32-NEXT: beqz a6, .LBB31_5 ; RV32-NEXT: # %bb.2: # %end ; RV32-NEXT: beq a1, a3, .LBB31_4 @@ -1081,18 +1079,18 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: .cfi_offset s4, -24 ; RV32-NEXT: .cfi_offset s5, -28 ; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: mv s2, a5 -; RV32-NEXT: andi a5, a5, 1 -; RV32-NEXT: beqz a5, .LBB32_8 +; RV32-NEXT: mv s5, a5 +; RV32-NEXT: mv s3, a1 +; RV32-NEXT: andi a1, a5, 1 +; RV32-NEXT: beqz a1, .LBB32_8 ; RV32-NEXT: # %bb.1: # %t ; RV32-NEXT: mv s0, a4 -; RV32-NEXT: mv s3, a3 +; RV32-NEXT: mv s2, a3 ; RV32-NEXT: mv s1, a2 -; RV32-NEXT: mv s5, a1 ; RV32-NEXT: mv s4, a0 -; RV32-NEXT: beq a1, a3, .LBB32_3 +; RV32-NEXT: beq s3, a3, .LBB32_3 ; RV32-NEXT: # %bb.2: # %t -; RV32-NEXT: sltu s6, s5, s3 +; RV32-NEXT: sltu s6, s3, s2 ; RV32-NEXT: j .LBB32_4 ; RV32-NEXT: .LBB32_3: ; RV32-NEXT: sltu s6, s4, s1 @@ -1103,18 +1101,18 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: # %bb.5: # %end ; RV32-NEXT: sltu a1, s4, s1 ; RV32-NEXT: mv a0, a1 -; RV32-NEXT: beq s5, s3, .LBB32_7 +; RV32-NEXT: beq s3, s2, .LBB32_7 ; RV32-NEXT: # %bb.6: # %end -; RV32-NEXT: sltu a0, s5, s3 +; RV32-NEXT: sltu a0, s3, s2 ; RV32-NEXT: .LBB32_7: # %end -; RV32-NEXT: sub a2, s5, s3 +; RV32-NEXT: sub a2, s3, s2 +; RV32-NEXT: sub a3, s4, s1 ; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: sub a1, 
s4, s1 -; RV32-NEXT: sw a1, 0(s0) +; RV32-NEXT: sw a3, 0(s0) ; RV32-NEXT: sw a2, 4(s0) ; RV32-NEXT: j .LBB32_9 ; RV32-NEXT: .LBB32_8: # %f -; RV32-NEXT: mv a0, s2 +; RV32-NEXT: mv a0, s5 ; RV32-NEXT: .LBB32_9: # %f ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -1153,13 +1151,13 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV64-NEXT: .cfi_offset s3, -40 ; RV64-NEXT: .cfi_offset s4, -48 ; RV64-NEXT: mv s0, a3 -; RV64-NEXT: andi a3, a3, 1 -; RV64-NEXT: beqz a3, .LBB32_3 +; RV64-NEXT: mv s2, a1 +; RV64-NEXT: andi a1, a3, 1 +; RV64-NEXT: beqz a1, .LBB32_3 ; RV64-NEXT: # %bb.1: # %t ; RV64-NEXT: mv s1, a2 -; RV64-NEXT: mv s2, a1 ; RV64-NEXT: mv s3, a0 -; RV64-NEXT: sltu s4, a0, a1 +; RV64-NEXT: sltu s4, a0, s2 ; RV64-NEXT: mv a0, s4 ; RV64-NEXT: call call ; RV64-NEXT: bgeu s3, s2, .LBB32_3 @@ -1275,8 +1273,8 @@ define void @PR41129(ptr %p64) { ; RV32-NEXT: ret ; RV32-NEXT: .LBB37_2: # %true ; RV32-NEXT: seqz a3, a1 -; RV32-NEXT: sub a2, a2, a3 ; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: sub a2, a2, a3 ; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: sw a2, 4(a0) ; RV32-NEXT: ret @@ -1316,9 +1314,9 @@ define i16 @overflow_not_used(i16 %a, i16 %b, ptr %res) { ; RV32-LABEL: overflow_not_used: ; RV32: # %bb.0: ; RV32-NEXT: lui a3, 16 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a4, a1, a3 -; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: and a3, a0, a3 ; RV32-NEXT: bltu a3, a4, .LBB38_2 ; RV32-NEXT: # %bb.1: @@ -1331,9 +1329,9 @@ define i16 @overflow_not_used(i16 %a, i16 %b, ptr %res) { ; RV64-LABEL: overflow_not_used: ; RV64: # %bb.0: ; RV64-NEXT: lui a3, 16 +; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: addiw a3, a3, -1 ; RV64-NEXT: and a4, a1, a3 -; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: and a3, a0, a3 ; RV64-NEXT: bltu a3, a4, .LBB38_2 ; RV64-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll index 8aa145f6ac5ef..8e858bdd29762 100644 --- a/llvm/test/CodeGen/RISCV/pr51206.ll +++ b/llvm/test/CodeGen/RISCV/pr51206.ll @@ -12,21 +12,21 @@ define signext i32 @wobble() nounwind { ; CHECK-LABEL: wobble: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: lui a0, %hi(global) +; CHECK-NEXT: lui a1, %hi(global.1) ; CHECK-NEXT: lbu a0, %lo(global)(a0) -; CHECK-NEXT: lui a1, %hi(global.2) -; CHECK-NEXT: lbu a1, %lo(global.2)(a1) +; CHECK-NEXT: lui a2, %hi(global.2) +; CHECK-NEXT: lui a3, 52429 +; CHECK-NEXT: lbu a2, %lo(global.2)(a2) ; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: lui a2, %hi(global.1) -; CHECK-NEXT: sw a0, %lo(global.1)(a2) -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: slli a1, a0, 48 -; CHECK-NEXT: lui a2, 52429 -; CHECK-NEXT: slli a2, a2, 4 -; CHECK-NEXT: mulhu a1, a1, a2 -; CHECK-NEXT: srli a1, a1, 18 -; CHECK-NEXT: lui a2, %hi(global.3) +; CHECK-NEXT: sw a0, %lo(global.1)(a1) +; CHECK-NEXT: lui a1, %hi(global.3) +; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: mul a0, a0, a2 +; CHECK-NEXT: slli a2, a0, 48 +; CHECK-NEXT: mulhu a2, a2, a3 +; CHECK-NEXT: srli a2, a2, 18 ; CHECK-NEXT: li a3, 5 -; CHECK-NEXT: sw a1, %lo(global.3)(a2) +; CHECK-NEXT: sw a2, %lo(global.3)(a1) ; CHECK-NEXT: bgeu a0, a3, .LBB0_2 ; CHECK-NEXT: # %bb.1: # %bb12 ; CHECK-NEXT: li a0, 0 diff --git a/llvm/test/CodeGen/RISCV/pr56457.ll b/llvm/test/CodeGen/RISCV/pr56457.ll index ba08aa838bf99..cf518b31a190b 100644 --- a/llvm/test/CodeGen/RISCV/pr56457.ll +++ b/llvm/test/CodeGen/RISCV/pr56457.ll @@ -10,41 +10,41 @@ define i15 @foo(i15 %x) nounwind { ; CHECK-NEXT: beqz a1, .LBB0_2 ; CHECK-NEXT: # 
%bb.1: # %cond.false ; CHECK-NEXT: srli a1, a1, 50 +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: lui a3, 209715 +; CHECK-NEXT: lui a4, 61681 ; CHECK-NEXT: or a0, a0, a1 -; CHECK-NEXT: slli a1, a0, 49 -; CHECK-NEXT: srli a1, a1, 51 -; CHECK-NEXT: or a0, a0, a1 -; CHECK-NEXT: slli a1, a0, 49 -; CHECK-NEXT: srli a1, a1, 53 -; CHECK-NEXT: or a0, a0, a1 -; CHECK-NEXT: slli a1, a0, 49 -; CHECK-NEXT: srli a1, a1, 57 -; CHECK-NEXT: or a0, a0, a1 +; CHECK-NEXT: addiw a1, a2, 1365 +; CHECK-NEXT: addiw a2, a3, 819 +; CHECK-NEXT: addiw a3, a4, -241 +; CHECK-NEXT: slli a4, a2, 32 +; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: slli a4, a3, 32 +; CHECK-NEXT: add a3, a3, a4 +; CHECK-NEXT: slli a4, a0, 49 +; CHECK-NEXT: srli a4, a4, 51 +; CHECK-NEXT: or a0, a0, a4 +; CHECK-NEXT: slli a4, a0, 49 +; CHECK-NEXT: srli a4, a4, 53 +; CHECK-NEXT: or a0, a0, a4 +; CHECK-NEXT: slli a4, a0, 49 +; CHECK-NEXT: srli a4, a4, 57 +; CHECK-NEXT: or a0, a0, a4 ; CHECK-NEXT: not a0, a0 -; CHECK-NEXT: srli a1, a0, 1 -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: addiw a2, a2, 1365 -; CHECK-NEXT: and a1, a1, a2 +; CHECK-NEXT: srli a4, a0, 1 +; CHECK-NEXT: and a1, a4, a1 ; CHECK-NEXT: slli a0, a0, 49 ; CHECK-NEXT: srli a0, a0, 49 ; CHECK-NEXT: sub a0, a0, a1 -; CHECK-NEXT: lui a1, 209715 -; CHECK-NEXT: addiw a1, a1, 819 -; CHECK-NEXT: slli a2, a1, 32 -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: and a2, a0, a1 +; CHECK-NEXT: and a1, a0, a2 ; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: srli a1, a0, 4 ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: lui a1, 61681 -; CHECK-NEXT: addiw a1, a1, -241 -; CHECK-NEXT: slli a2, a1, 32 -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: lui a1, 4112 ; CHECK-NEXT: addiw a1, a1, 257 +; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: slli a2, a1, 32 ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: mul a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/pr58511.ll b/llvm/test/CodeGen/RISCV/pr58511.ll index df02d77f61329..e5cba679729fa 100644 --- a/llvm/test/CodeGen/RISCV/pr58511.ll +++ b/llvm/test/CodeGen/RISCV/pr58511.ll @@ -5,8 +5,8 @@ define i32 @f(i1 %0, i32 %1, ptr %2) { ; CHECK-LABEL: f: ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: slli a0, a0, 63 -; CHECK-NEXT: srai a0, a0, 63 ; CHECK-NEXT: lui a3, 4097 +; CHECK-NEXT: srai a0, a0, 63 ; CHECK-NEXT: addiw a3, a3, -2047 ; CHECK-NEXT: or a0, a0, a3 ; CHECK-NEXT: mul a1, a1, a3 @@ -24,8 +24,8 @@ define i32 @g(i1 %0, i32 %1, ptr %2) { ; CHECK-LABEL: g: ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: andi a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: lui a3, 4097 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: addiw a3, a3, -2047 ; CHECK-NEXT: or a0, a0, a3 ; CHECK-NEXT: mul a1, a1, a3 @@ -43,10 +43,10 @@ define i32 @h(i1 %0, i32 %1, ptr %2) { ; CHECK-LABEL: h: ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: lui a3, 4097 -; CHECK-NEXT: addiw a3, a3, -2047 -; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: slli a0, a0, 63 +; CHECK-NEXT: addiw a3, a3, -2047 ; CHECK-NEXT: srai a0, a0, 63 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: sw a1, 0(a2) ; CHECK-NEXT: ret @@ -64,8 +64,8 @@ define i32 @i(i1 %0, i32 %1, ptr %2) { ; CHECK-NEXT: andi a0, a0, 1 ; CHECK-NEXT: lui a3, 4097 ; CHECK-NEXT: addiw a3, a3, -2047 -; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: sw a1, 0(a2) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/pr65025.ll 
b/llvm/test/CodeGen/RISCV/pr65025.ll index dcd71edc460b8..c6770b05da555 100644 --- a/llvm/test/CodeGen/RISCV/pr65025.ll +++ b/llvm/test/CodeGen/RISCV/pr65025.ll @@ -7,10 +7,10 @@ define ptr @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %va ; CHECK-NEXT: andi a3, a0, -4 ; CHECK-NEXT: slli a4, a0, 3 ; CHECK-NEXT: li a5, 255 -; CHECK-NEXT: sllw a5, a5, a4 ; CHECK-NEXT: andi a1, a1, 255 -; CHECK-NEXT: sllw a1, a1, a4 ; CHECK-NEXT: andi a2, a2, 255 +; CHECK-NEXT: sllw a5, a5, a4 +; CHECK-NEXT: sllw a1, a1, a4 ; CHECK-NEXT: sllw a2, a2, a4 ; CHECK-NEXT: .LBB0_3: # %do_cmpxchg ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/RISCV/pr68855.ll b/llvm/test/CodeGen/RISCV/pr68855.ll index e9d1f6c2d1b2c..8031bf4f30411 100644 --- a/llvm/test/CodeGen/RISCV/pr68855.ll +++ b/llvm/test/CodeGen/RISCV/pr68855.ll @@ -6,10 +6,10 @@ define i16 @narrow_load(ptr %p1, ptr %p2) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lhu a2, 0(a0) ; CHECK-NEXT: lui a3, 2 -; CHECK-NEXT: addiw a3, a3, -1 -; CHECK-NEXT: xor a2, a2, a3 ; CHECK-NEXT: lui a4, 16 +; CHECK-NEXT: addiw a3, a3, -1 ; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: xor a2, a2, a3 ; CHECK-NEXT: xor a4, a3, a4 ; CHECK-NEXT: or a2, a2, a4 ; CHECK-NEXT: sw a2, 0(a1) diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll index 69746e3e70bfc..9fc9a3c42867e 100644 --- a/llvm/test/CodeGen/RISCV/pr69586.ll +++ b/llvm/test/CodeGen/RISCV/pr69586.ll @@ -7,21 +7,21 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-LABEL: test: ; NOREMAT: # %bb.0: -; NOREMAT-NEXT: addi sp, sp, -400 -; NOREMAT-NEXT: .cfi_def_cfa_offset 400 -; NOREMAT-NEXT: sd ra, 392(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s0, 384(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s1, 376(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s2, 368(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s3, 360(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s4, 352(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s5, 344(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s6, 336(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s7, 328(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s8, 320(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s9, 312(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s10, 304(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s11, 296(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addi sp, sp, -752 +; NOREMAT-NEXT: .cfi_def_cfa_offset 752 +; NOREMAT-NEXT: sd ra, 744(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s0, 736(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s1, 728(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s2, 720(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s3, 712(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s4, 704(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s5, 696(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s6, 688(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s7, 680(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s8, 672(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s9, 664(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s10, 656(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s11, 648(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: .cfi_offset ra, -8 ; NOREMAT-NEXT: .cfi_offset s0, -16 ; NOREMAT-NEXT: .cfi_offset s1, -24 @@ -36,747 +36,845 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: .cfi_offset s10, -96 ; NOREMAT-NEXT: .cfi_offset s11, -104 ; NOREMAT-NEXT: csrr a2, vlenb -; NOREMAT-NEXT: li a3, 6 -; NOREMAT-NEXT: mul a2, a2, a3 +; NOREMAT-NEXT: slli a2, a2, 1 ; NOREMAT-NEXT: sub sp, sp, a2 -; NOREMAT-NEXT: 
.cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x03, 0x22, 0x11, 0x06, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 400 + 6 * vlenb -; NOREMAT-NEXT: li a2, 32 -; NOREMAT-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; NOREMAT-NEXT: vle32.v v8, (a0) -; NOREMAT-NEXT: addi a2, a0, 512 -; NOREMAT-NEXT: vle32.v v10, (a2) -; NOREMAT-NEXT: addi a2, a0, 1024 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: addi a2, a0, 1536 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) -; NOREMAT-NEXT: li a2, 1 -; NOREMAT-NEXT: slli a2, a2, 11 -; NOREMAT-NEXT: sd a2, 272(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: li a5, 5 -; NOREMAT-NEXT: slli a2, a5, 9 -; NOREMAT-NEXT: sd a2, 264(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) -; NOREMAT-NEXT: li a2, 3 -; NOREMAT-NEXT: slli a3, a2, 10 -; NOREMAT-NEXT: sd a3, 256(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 -; NOREMAT-NEXT: vle32.v v12, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a3) -; NOREMAT-NEXT: li a4, 7 -; NOREMAT-NEXT: slli a3, a4, 9 -; NOREMAT-NEXT: sd a3, 248(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 -; NOREMAT-NEXT: vle32.v v14, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a3) -; NOREMAT-NEXT: lui a3, 1 -; NOREMAT-NEXT: add a3, a0, a3 -; NOREMAT-NEXT: vle32.v v12, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a3) -; NOREMAT-NEXT: li a3, 9 -; NOREMAT-NEXT: slli a6, a3, 9 -; NOREMAT-NEXT: sd a6, 240(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a6, a0, a6 -; NOREMAT-NEXT: vle32.v v14, (a6) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a6) -; NOREMAT-NEXT: slli a6, a5, 10 -; NOREMAT-NEXT: sd a6, 232(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a6, a0, a6 -; NOREMAT-NEXT: vle32.v v12, (a6) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a6) -; NOREMAT-NEXT: li s8, 11 -; NOREMAT-NEXT: slli a6, s8, 9 -; NOREMAT-NEXT: sd a6, 224(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a6, a0, a6 -; NOREMAT-NEXT: vle32.v v14, (a6) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a6) -; NOREMAT-NEXT: slli a2, a2, 11 -; NOREMAT-NEXT: sd a2, 216(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: li s2, 13 -; NOREMAT-NEXT: slli a2, s2, 9 -; NOREMAT-NEXT: sd a2, 208(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) -; NOREMAT-NEXT: slli a2, a4, 10 -; NOREMAT-NEXT: sd a2, 200(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: li a2, 15 -; NOREMAT-NEXT: slli a6, a2, 9 -; NOREMAT-NEXT: sd a6, 192(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a6, a0, a6 -; NOREMAT-NEXT: vle32.v v26, (a6) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v16, (a6) -; NOREMAT-NEXT: lui a6, 2 -; NOREMAT-NEXT: add a6, a0, a6 
-; NOREMAT-NEXT: vle32.v v28, (a6) -; NOREMAT-NEXT: vle32.v v10, (a6) -; NOREMAT-NEXT: li a6, 17 -; NOREMAT-NEXT: slli a6, a6, 9 -; NOREMAT-NEXT: sd a6, 184(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: li t0, 17 -; NOREMAT-NEXT: add a6, a0, a6 -; NOREMAT-NEXT: vle32.v v30, (a6) -; NOREMAT-NEXT: vle32.v v18, (a6) -; NOREMAT-NEXT: slli a6, a3, 10 -; NOREMAT-NEXT: sd a6, 176(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a6, a0, a6 -; NOREMAT-NEXT: vle32.v v6, (a6) -; NOREMAT-NEXT: vle32.v v20, (a6) -; NOREMAT-NEXT: li a6, 19 -; NOREMAT-NEXT: slli a6, a6, 9 -; NOREMAT-NEXT: sd a6, 168(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: li a7, 19 -; NOREMAT-NEXT: add a6, a0, a6 -; NOREMAT-NEXT: vle32.v v4, (a6) -; NOREMAT-NEXT: vle32.v v22, (a6) -; NOREMAT-NEXT: slli a5, a5, 11 -; NOREMAT-NEXT: sd a5, 160(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v2, (a5) -; NOREMAT-NEXT: vle32.v v12, (a5) -; NOREMAT-NEXT: li s10, 21 -; NOREMAT-NEXT: slli a5, s10, 9 -; NOREMAT-NEXT: sd a5, 152(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v24, (a5) -; NOREMAT-NEXT: vle32.v v14, (a5) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v26 -; NOREMAT-NEXT: slli a5, s8, 10 -; NOREMAT-NEXT: sd a5, 144(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v26, (a5) +; NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb +; NOREMAT-NEXT: mv a7, a0 +; NOREMAT-NEXT: li a0, 32 +; NOREMAT-NEXT: addi a5, a7, 512 +; NOREMAT-NEXT: addi a4, a7, 1024 +; NOREMAT-NEXT: addi a6, a7, 1536 +; NOREMAT-NEXT: li t4, 1 +; NOREMAT-NEXT: li a2, 5 +; NOREMAT-NEXT: li t1, 3 +; NOREMAT-NEXT: li t0, 7 +; NOREMAT-NEXT: lui t5, 1 +; NOREMAT-NEXT: li s4, 9 +; NOREMAT-NEXT: li s6, 11 +; NOREMAT-NEXT: li s9, 13 +; NOREMAT-NEXT: li ra, 15 +; NOREMAT-NEXT: lui t2, 2 +; NOREMAT-NEXT: lui s1, 3 +; NOREMAT-NEXT: lui t3, 4 +; NOREMAT-NEXT: lui s0, 5 +; NOREMAT-NEXT: lui s3, 6 +; NOREMAT-NEXT: lui s7, 7 +; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOREMAT-NEXT: slli t4, t4, 11 +; NOREMAT-NEXT: sd t4, 512(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli a3, a2, 9 +; NOREMAT-NEXT: sd a3, 504(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli t6, t1, 10 +; NOREMAT-NEXT: slli s2, t0, 9 +; NOREMAT-NEXT: add a0, a7, t5 +; NOREMAT-NEXT: lui s11, 1 +; NOREMAT-NEXT: slli s4, s4, 9 +; NOREMAT-NEXT: slli s5, a2, 10 +; NOREMAT-NEXT: slli s6, s6, 9 +; NOREMAT-NEXT: slli s8, t1, 11 ; NOREMAT-NEXT: vle32.v v8, (a5) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v28 -; NOREMAT-NEXT: li s6, 23 -; NOREMAT-NEXT: slli a5, s6, 9 -; NOREMAT-NEXT: sd a5, 136(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v28, (a5) -; NOREMAT-NEXT: vle32.v v16, (a5) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v30 -; NOREMAT-NEXT: lui a5, 3 -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v30, (a5) -; NOREMAT-NEXT: vle32.v v10, (a5) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v18, v6 -; NOREMAT-NEXT: li s3, 25 -; NOREMAT-NEXT: slli a5, s3, 9 -; NOREMAT-NEXT: sd a5, 128(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v6, (a5) -; NOREMAT-NEXT: vle32.v v18, (a5) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v4 -; NOREMAT-NEXT: slli a5, s2, 10 -; NOREMAT-NEXT: sd a5, 120(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v4, (a5) -; NOREMAT-NEXT: vle32.v v20, (a5) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v2 -; NOREMAT-NEXT: li t5, 27 -; NOREMAT-NEXT: slli 
a5, t5, 9 -; NOREMAT-NEXT: sd a5, 112(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v2, (a5) -; NOREMAT-NEXT: vle32.v v22, (a5) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v24 -; NOREMAT-NEXT: slli a4, a4, 11 -; NOREMAT-NEXT: sd a4, 104(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 -; NOREMAT-NEXT: vle32.v v24, (a4) -; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v26 -; NOREMAT-NEXT: li t2, 29 -; NOREMAT-NEXT: slli a4, t2, 9 -; NOREMAT-NEXT: sd a4, 96(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 +; NOREMAT-NEXT: slli s9, s9, 9 +; NOREMAT-NEXT: li t5, 13 +; NOREMAT-NEXT: vle32.v v10, (a4) +; NOREMAT-NEXT: vle32.v v2, (a4) +; NOREMAT-NEXT: slli s10, t0, 10 +; NOREMAT-NEXT: vle32.v v0, (a6) +; NOREMAT-NEXT: vle32.v v12, (a6) +; NOREMAT-NEXT: slli ra, ra, 9 +; NOREMAT-NEXT: vle32.v v4, (a0) +; NOREMAT-NEXT: vle32.v v20, (a0) +; NOREMAT-NEXT: add a4, a7, t2 +; NOREMAT-NEXT: vle32.v v6, (a4) +; NOREMAT-NEXT: vle32.v v30, (a4) +; NOREMAT-NEXT: add a4, a7, s1 +; NOREMAT-NEXT: vle32.v v28, (a4) ; NOREMAT-NEXT: vle32.v v26, (a4) +; NOREMAT-NEXT: add a4, a7, t3 +; NOREMAT-NEXT: vle32.v v24, (a4) +; NOREMAT-NEXT: vle32.v v22, (a4) +; NOREMAT-NEXT: add a4, a7, s0 +; NOREMAT-NEXT: vle32.v v14, (a7) +; NOREMAT-NEXT: vle32.v v18, (a4) +; NOREMAT-NEXT: vle32.v v16, (a4) +; NOREMAT-NEXT: add a4, a7, s3 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8 ; NOREMAT-NEXT: vle32.v v14, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v28 -; NOREMAT-NEXT: slli a4, a2, 10 -; NOREMAT-NEXT: sd a4, 88(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 -; NOREMAT-NEXT: vle32.v v28, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: csrr a4, vlenb -; NOREMAT-NEXT: slli a4, a4, 2 -; NOREMAT-NEXT: add a4, sp, a4 -; NOREMAT-NEXT: addi a4, a4, 288 -; NOREMAT-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v30 -; NOREMAT-NEXT: li a5, 31 -; NOREMAT-NEXT: slli a4, a5, 9 -; NOREMAT-NEXT: sd a4, 80(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 -; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: vle32.v v16, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v6 -; NOREMAT-NEXT: lui a6, 4 -; NOREMAT-NEXT: add a4, a0, a6 -; NOREMAT-NEXT: vle32.v v6, (a4) +; NOREMAT-NEXT: addi a0, sp, 640 +; NOREMAT-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; NOREMAT-NEXT: add a4, a7, t4 +; NOREMAT-NEXT: vle32.v v10, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 +; NOREMAT-NEXT: vle32.v v2, (a4) +; NOREMAT-NEXT: add a4, a7, a3 +; NOREMAT-NEXT: vle32.v v0, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10 +; NOREMAT-NEXT: vle32.v v10, (a4) +; NOREMAT-NEXT: add a4, a7, t6 +; NOREMAT-NEXT: vle32.v v12, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 +; NOREMAT-NEXT: vle32.v v2, (a4) +; NOREMAT-NEXT: add a4, a7, s2 +; NOREMAT-NEXT: vle32.v v8, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 +; NOREMAT-NEXT: vle32.v v12, (a4) +; NOREMAT-NEXT: add a4, a7, s7 +; NOREMAT-NEXT: vle32.v v0, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8 +; NOREMAT-NEXT: vle32.v v10, (a4) +; NOREMAT-NEXT: add a4, a7, s4 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: csrr a4, vlenb -; NOREMAT-NEXT: slli a4, a4, 1 -; NOREMAT-NEXT: add a4, sp, a4 -; NOREMAT-NEXT: addi a4, a4, 288 -; NOREMAT-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v18, v4 -; NOREMAT-NEXT: addiw a4, a6, 512 -; NOREMAT-NEXT: sd a4, 72(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 +; NOREMAT-NEXT: sf.vc.vv 3, 0, 
v12, v4 +; NOREMAT-NEXT: vle32.v v12, (a4) +; NOREMAT-NEXT: add a4, a7, s5 ; NOREMAT-NEXT: vle32.v v4, (a4) -; NOREMAT-NEXT: vle32.v v18, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v2 -; NOREMAT-NEXT: slli a4, t0, 10 -; NOREMAT-NEXT: sd a4, 64(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 -; NOREMAT-NEXT: vle32.v v2, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8 +; NOREMAT-NEXT: vle32.v v8, (a4) +; NOREMAT-NEXT: add a4, a7, s6 ; NOREMAT-NEXT: vle32.v v20, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v24 -; NOREMAT-NEXT: addiw a4, a6, 1536 -; NOREMAT-NEXT: sd a4, 56(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 -; NOREMAT-NEXT: vle32.v v0, (a4) -; NOREMAT-NEXT: vle32.v v22, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v26 -; NOREMAT-NEXT: slli a3, a3, 11 -; NOREMAT-NEXT: sd a3, 48(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 +; NOREMAT-NEXT: vle32.v v12, (a4) +; NOREMAT-NEXT: add a4, a7, s8 +; NOREMAT-NEXT: vle32.v v4, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 +; NOREMAT-NEXT: vle32.v v8, (a4) +; NOREMAT-NEXT: add a4, a7, s9 +; NOREMAT-NEXT: vle32.v v20, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 +; NOREMAT-NEXT: vle32.v v12, (a4) +; NOREMAT-NEXT: add a4, a7, s10 +; NOREMAT-NEXT: vle32.v v4, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 +; NOREMAT-NEXT: vle32.v v8, (a4) +; NOREMAT-NEXT: add a4, a7, ra +; NOREMAT-NEXT: vle32.v v2, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 +; NOREMAT-NEXT: lui t4, 8 +; NOREMAT-NEXT: add a5, a7, t4 +; NOREMAT-NEXT: vle32.v v20, (a5) +; NOREMAT-NEXT: vle32.v v12, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2 +; NOREMAT-NEXT: li a4, 17 +; NOREMAT-NEXT: slli a4, a4, 9 +; NOREMAT-NEXT: li s1, 17 +; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: vle32.v v8, (a4) +; NOREMAT-NEXT: vle32.v v4, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6 +; NOREMAT-NEXT: li a5, 9 +; NOREMAT-NEXT: slli a4, a5, 10 +; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: vle32.v v12, (a4) +; NOREMAT-NEXT: vle32.v v6, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 +; NOREMAT-NEXT: li a4, 19 +; NOREMAT-NEXT: slli a4, a4, 9 +; NOREMAT-NEXT: li t2, 19 +; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: vle32.v v8, (a4) +; NOREMAT-NEXT: vle32.v v30, (a4) +; NOREMAT-NEXT: slli a3, a2, 11 +; NOREMAT-NEXT: sd a3, 600(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 +; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) +; NOREMAT-NEXT: vle32.v v4, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 +; NOREMAT-NEXT: li s7, 21 +; NOREMAT-NEXT: slli a3, s7, 9 +; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) -; NOREMAT-NEXT: addi a3, sp, 288 -; NOREMAT-NEXT: vs2r.v v8, (a3) # Unknown-size Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v28 -; NOREMAT-NEXT: lui s1, 5 -; NOREMAT-NEXT: addiw a3, s1, -1536 -; NOREMAT-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 -; NOREMAT-NEXT: vle32.v v8, (a3) -; NOREMAT-NEXT: vle32.v v24, (a3) -; NOREMAT-NEXT: csrr a3, vlenb -; NOREMAT-NEXT: slli a3, a3, 2 -; NOREMAT-NEXT: add a3, sp, a3 -; NOREMAT-NEXT: addi a3, a3, 288 -; NOREMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v30 -; NOREMAT-NEXT: slli a3, a7, 10 -; NOREMAT-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, 
a0, a3 -; NOREMAT-NEXT: vle32.v v10, (a3) -; NOREMAT-NEXT: vle32.v v14, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v6 -; NOREMAT-NEXT: addiw a3, s1, -512 -; NOREMAT-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v6, (a3) -; NOREMAT-NEXT: vle32.v v16, (a3) -; NOREMAT-NEXT: csrr a3, vlenb -; NOREMAT-NEXT: slli a3, a3, 1 -; NOREMAT-NEXT: add a3, sp, a3 -; NOREMAT-NEXT: addi a3, a3, 288 -; NOREMAT-NEXT: vl2r.v v26, (a3) # Unknown-size Folded Reload -; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v4 -; NOREMAT-NEXT: add a3, a0, s1 -; NOREMAT-NEXT: vle32.v v26, (a3) -; NOREMAT-NEXT: vle32.v v28, (a3) -; NOREMAT-NEXT: csrr a3, vlenb -; NOREMAT-NEXT: slli a3, a3, 2 -; NOREMAT-NEXT: add a3, sp, a3 -; NOREMAT-NEXT: addi a3, a3, 288 -; NOREMAT-NEXT: vs2r.v v28, (a3) # Unknown-size Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v18, v2 -; NOREMAT-NEXT: addiw ra, s1, 512 -; NOREMAT-NEXT: add a3, a0, ra -; NOREMAT-NEXT: vle32.v v28, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 +; NOREMAT-NEXT: li a6, 11 +; NOREMAT-NEXT: slli a3, a6, 10 +; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v0 -; NOREMAT-NEXT: slli s11, s10, 10 -; NOREMAT-NEXT: add a3, a0, s11 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 +; NOREMAT-NEXT: li s3, 23 +; NOREMAT-NEXT: slli a3, s3, 9 +; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) -; NOREMAT-NEXT: vle32.v v18, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v12 -; NOREMAT-NEXT: addiw s10, s1, 1536 -; NOREMAT-NEXT: add a3, a0, s10 -; NOREMAT-NEXT: vle32.v v2, (a3) -; NOREMAT-NEXT: vle32.v v20, (a3) -; NOREMAT-NEXT: addi a3, sp, 288 -; NOREMAT-NEXT: vl2r.v v12, (a3) # Unknown-size Folded Reload -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v8 -; NOREMAT-NEXT: slli s9, s8, 11 -; NOREMAT-NEXT: add a3, a0, s9 -; NOREMAT-NEXT: vle32.v v0, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 +; NOREMAT-NEXT: li s0, 25 +; NOREMAT-NEXT: slli a3, s0, 9 +; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v10 -; NOREMAT-NEXT: lui t0, 6 -; NOREMAT-NEXT: addiw s8, t0, -1536 -; NOREMAT-NEXT: add a3, a0, s8 -; NOREMAT-NEXT: vle32.v v8, (a3) -; NOREMAT-NEXT: vle32.v v22, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v6 -; NOREMAT-NEXT: slli s7, s6, 10 -; NOREMAT-NEXT: add a3, a0, s7 -; NOREMAT-NEXT: vle32.v v10, (a3) -; NOREMAT-NEXT: vle32.v v14, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v26 -; NOREMAT-NEXT: addiw s6, t0, -512 -; NOREMAT-NEXT: add a3, a0, s6 ; NOREMAT-NEXT: vle32.v v6, (a3) -; NOREMAT-NEXT: vle32.v v16, (a3) -; NOREMAT-NEXT: csrr a3, vlenb -; NOREMAT-NEXT: slli a3, a3, 2 -; NOREMAT-NEXT: add a3, sp, a3 -; NOREMAT-NEXT: addi a3, a3, 288 -; NOREMAT-NEXT: vl2r.v v24, (a3) # Unknown-size Folded Reload -; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v28 -; NOREMAT-NEXT: add a3, a0, t0 -; NOREMAT-NEXT: vle32.v v24, (a3) -; NOREMAT-NEXT: vle32.v v26, (a3) -; NOREMAT-NEXT: csrr a3, vlenb -; NOREMAT-NEXT: slli a3, a3, 2 -; NOREMAT-NEXT: add a3, sp, a3 -; NOREMAT-NEXT: addi a3, a3, 288 -; NOREMAT-NEXT: vs2r.v v26, (a3) # Unknown-size Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v4 -; NOREMAT-NEXT: addiw s5, t0, 512 -; NOREMAT-NEXT: add a3, a0, s5 -; NOREMAT-NEXT: vle32.v v26, (a3) -; NOREMAT-NEXT: vle32.v v28, (a3) -; NOREMAT-NEXT: csrr a3, vlenb 
-; NOREMAT-NEXT: slli a3, a3, 1 -; NOREMAT-NEXT: add a3, sp, a3 -; NOREMAT-NEXT: addi a3, a3, 288 -; NOREMAT-NEXT: vs2r.v v28, (a3) # Unknown-size Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v18, v2 -; NOREMAT-NEXT: slli s4, s3, 10 -; NOREMAT-NEXT: add a3, a0, s4 -; NOREMAT-NEXT: vle32.v v28, (a3) -; NOREMAT-NEXT: vle32.v v18, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v0 -; NOREMAT-NEXT: addiw s3, t0, 1536 -; NOREMAT-NEXT: add a3, a0, s3 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 +; NOREMAT-NEXT: slli a3, t5, 10 +; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) -; NOREMAT-NEXT: vle32.v v20, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v8 -; NOREMAT-NEXT: slli s2, s2, 11 -; NOREMAT-NEXT: add a3, a0, s2 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28 +; NOREMAT-NEXT: li t3, 27 +; NOREMAT-NEXT: slli a3, t3, 9 +; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: vle32.v v28, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) -; NOREMAT-NEXT: vle32.v v12, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v10 -; NOREMAT-NEXT: lui a3, 7 -; NOREMAT-NEXT: addiw s0, a3, -1536 -; NOREMAT-NEXT: add a4, a0, s0 -; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: vle32.v v22, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v6 -; NOREMAT-NEXT: slli t6, t5, 10 -; NOREMAT-NEXT: add a4, a0, t6 -; NOREMAT-NEXT: vle32.v v0, (a4) -; NOREMAT-NEXT: vle32.v v14, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24 -; NOREMAT-NEXT: addiw t5, a3, -512 -; NOREMAT-NEXT: add a4, a0, t5 -; NOREMAT-NEXT: vle32.v v6, (a4) -; NOREMAT-NEXT: vle32.v v16, (a4) -; NOREMAT-NEXT: csrr a4, vlenb -; NOREMAT-NEXT: slli a4, a4, 2 -; NOREMAT-NEXT: add a4, sp, a4 -; NOREMAT-NEXT: addi a4, a4, 288 -; NOREMAT-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v26 -; NOREMAT-NEXT: add a4, a0, a3 -; NOREMAT-NEXT: vle32.v v26, (a4) -; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: csrr a4, vlenb -; NOREMAT-NEXT: slli a4, a4, 1 -; NOREMAT-NEXT: add a4, sp, a4 -; NOREMAT-NEXT: addi a4, a4, 288 -; NOREMAT-NEXT: vl2r.v v10, (a4) # Unknown-size Folded Reload -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v28 -; NOREMAT-NEXT: addiw t4, a3, 512 -; NOREMAT-NEXT: add a4, a0, t4 -; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: vle32.v v24, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v18, v30 -; NOREMAT-NEXT: slli t3, t2, 10 -; NOREMAT-NEXT: add a4, a0, t3 -; NOREMAT-NEXT: vle32.v v18, (a4) -; NOREMAT-NEXT: vle32.v v28, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v4 -; NOREMAT-NEXT: addiw t2, a3, 1536 -; NOREMAT-NEXT: add a4, a0, t2 -; NOREMAT-NEXT: vle32.v v20, (a4) -; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v2 -; NOREMAT-NEXT: slli t1, a2, 11 -; NOREMAT-NEXT: add a2, a0, t1 +; NOREMAT-NEXT: slli a2, t0, 11 +; NOREMAT-NEXT: sd a2, 544(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v12, (a2) +; NOREMAT-NEXT: vle32.v v26, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 +; NOREMAT-NEXT: li t0, 29 +; NOREMAT-NEXT: slli a2, t0, 9 +; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v6, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28 +; NOREMAT-NEXT: li a3, 15 +; NOREMAT-NEXT: slli a2, a3, 10 +; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v28, (a2) +; NOREMAT-NEXT: vle32.v v30, 
(a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 +; NOREMAT-NEXT: li t1, 31 +; NOREMAT-NEXT: slli a2, t1, 9 +; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v0 -; NOREMAT-NEXT: lui a2, 8 -; NOREMAT-NEXT: addiw a7, a2, -1536 -; NOREMAT-NEXT: add a4, a0, a7 -; NOREMAT-NEXT: vle32.v v22, (a4) -; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v6 -; NOREMAT-NEXT: slli a6, a5, 10 -; NOREMAT-NEXT: add a4, a0, a6 -; NOREMAT-NEXT: vle32.v v14, (a4) -; NOREMAT-NEXT: vle32.v v6, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v26 -; NOREMAT-NEXT: addiw a5, a2, -512 -; NOREMAT-NEXT: add a4, a0, a5 -; NOREMAT-NEXT: vle32.v v16, (a4) -; NOREMAT-NEXT: vle32.v v26, (a4) -; NOREMAT-NEXT: add a0, a0, a2 -; NOREMAT-NEXT: vle32.v v0, (a0) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v18 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v20 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8 +; NOREMAT-NEXT: lui a4, 4 +; NOREMAT-NEXT: addiw a0, a4, 512 +; NOREMAT-NEXT: sd a0, 496(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a7, a0 +; NOREMAT-NEXT: vle32.v v8, (a0) +; NOREMAT-NEXT: vle32.v v26, (a0) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28 +; NOREMAT-NEXT: slli a2, s1, 10 +; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v28, (a2) +; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 +; NOREMAT-NEXT: addiw a2, a4, 1536 +; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v12, (a2) +; NOREMAT-NEXT: vle32.v v30, (a2) +; NOREMAT-NEXT: slli a2, a5, 11 +; NOREMAT-NEXT: sd a2, 472(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v24, (a2) +; NOREMAT-NEXT: vle32.v v4, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8 +; NOREMAT-NEXT: lui a5, 5 +; NOREMAT-NEXT: addiw a2, a5, -1536 +; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v22, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28 +; NOREMAT-NEXT: slli a2, t2, 10 +; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li t2, 19 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v26, (a2) +; NOREMAT-NEXT: vle32.v v28, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 +; NOREMAT-NEXT: addiw a2, a5, -512 +; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v12, (a2) +; NOREMAT-NEXT: vle32.v v6, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24 +; NOREMAT-NEXT: addiw a2, a5, 512 +; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v24, (a2) +; NOREMAT-NEXT: vle32.v v30, (a2) +; NOREMAT-NEXT: slli a2, s7, 10 +; NOREMAT-NEXT: sd a2, 432(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v4, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26 +; NOREMAT-NEXT: addiw a2, a5, 1536 +; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v22, (a2) +; NOREMAT-NEXT: vle32.v v26, (a2) +; NOREMAT-NEXT: slli a2, a6, 11 +; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12 +; NOREMAT-NEXT: add a2, a7, a2 +; 
NOREMAT-NEXT: vle32.v v12, (a2) +; NOREMAT-NEXT: vle32.v v28, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18 +; NOREMAT-NEXT: lui a6, 6 +; NOREMAT-NEXT: addiw a2, a6, -1536 +; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v18, (a2) +; NOREMAT-NEXT: vle32.v v6, (a2) +; NOREMAT-NEXT: slli a2, s3, 10 +; NOREMAT-NEXT: sd a2, 400(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v16, (a2) +; NOREMAT-NEXT: vle32.v v24, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 +; NOREMAT-NEXT: addiw a2, a6, -512 +; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v14 +; NOREMAT-NEXT: addiw a2, a6, 512 +; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v22, (a2) +; NOREMAT-NEXT: vle32.v v4, (a2) +; NOREMAT-NEXT: slli a2, s0, 10 +; NOREMAT-NEXT: sd a2, 376(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v26, (a2) +; NOREMAT-NEXT: vle32.v v2, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18 +; NOREMAT-NEXT: addiw a2, a6, 1536 +; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v18, (a2) +; NOREMAT-NEXT: vle32.v v28, (a2) +; NOREMAT-NEXT: slli a2, t5, 11 +; NOREMAT-NEXT: sd a2, 360(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v16 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v0 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v16, (a2) +; NOREMAT-NEXT: vle32.v v6, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8 +; NOREMAT-NEXT: lui s0, 7 +; NOREMAT-NEXT: addiw a2, s0, -1536 +; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v24, (a2) +; NOREMAT-NEXT: slli a2, t3, 10 +; NOREMAT-NEXT: sd a2, 344(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v14 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v14, (a2) +; NOREMAT-NEXT: vle32.v v30, (a2) +; NOREMAT-NEXT: addi a0, sp, 640 +; NOREMAT-NEXT: vl2r.v v12, (a0) # Unknown-size Folded Reload +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22 +; NOREMAT-NEXT: addiw a2, s0, -512 +; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v22, (a2) +; NOREMAT-NEXT: vle32.v v12, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26 +; NOREMAT-NEXT: addiw a2, s0, 512 +; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: lui t3, 7 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v26, (a2) +; NOREMAT-NEXT: vle32.v v4, (a2) +; NOREMAT-NEXT: slli a2, t0, 10 +; NOREMAT-NEXT: sd a2, 320(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v18 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v18, (a2) +; NOREMAT-NEXT: vle32.v v2, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16 +; NOREMAT-NEXT: addiw a2, t3, 1536 +; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v16, (a2) +; NOREMAT-NEXT: vle32.v v28, (a2) +; NOREMAT-NEXT: slli a2, a3, 11 +; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v6, (a2) +; 
NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14 +; NOREMAT-NEXT: addiw a2, t4, -1536 +; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v14, (a2) +; NOREMAT-NEXT: vle32.v v24, (a2) +; NOREMAT-NEXT: slli a2, t1, 10 +; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v22, (a2) +; NOREMAT-NEXT: vle32.v v30, (a2) +; NOREMAT-NEXT: addiw a0, t4, -512 +; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a7, a0 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 +; NOREMAT-NEXT: vle32.v v12, (a0) +; NOREMAT-NEXT: vle32.v v0, (a0) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v26 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v18 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v16 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v8 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v14 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v22 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v20 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: addi a0, a1, 1024 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: add s11, a1, s11 +; NOREMAT-NEXT: sd s11, 272(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: lui a0, 2 +; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: sd a0, 264(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: lui a0, 3 +; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: sd a0, 256(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a4, a1, a4 +; NOREMAT-NEXT: sd a4, 248(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a5, a1, a5 +; NOREMAT-NEXT: sd a5, 240(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a6, a1, a6 +; NOREMAT-NEXT: sd a6, 232(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add t3, a1, t3 +; NOREMAT-NEXT: sd t3, 224(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a1, t4 +; NOREMAT-NEXT: sd a0, 216(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addiw a0, t4, 512 +; NOREMAT-NEXT: sd a0, 192(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addiw a0, t4, 1024 +; NOREMAT-NEXT: sd a0, 176(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addiw a0, t4, 1536 +; NOREMAT-NEXT: sd a0, 160(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli s1, s1, 11 +; NOREMAT-NEXT: sd s1, 128(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: lui a0, 9 +; NOREMAT-NEXT: addiw a2, a0, -1536 +; NOREMAT-NEXT: sd a2, 88(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addiw a2, a0, -1024 +; NOREMAT-NEXT: sd a2, 72(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addiw a2, a0, -512 +; NOREMAT-NEXT: sd a2, 40(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a1, a0 +; NOREMAT-NEXT: sd a2, 208(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addiw s11, a0, 512 +; NOREMAT-NEXT: addiw s7, a0, 1024 +; NOREMAT-NEXT: addiw s3, a0, 1536 +; NOREMAT-NEXT: slli s1, t2, 11 +; NOREMAT-NEXT: lui a0, 10 +; NOREMAT-NEXT: addiw t2, a0, -1536 +; NOREMAT-NEXT: addiw a7, a0, -1024 +; NOREMAT-NEXT: addiw a4, a0, -512 +; NOREMAT-NEXT: add a2, a1, a0 +; NOREMAT-NEXT: sd a2, 200(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addiw a0, a0, 512 +; NOREMAT-NEXT: ld a2, 512(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add a2, a1, a2 +; NOREMAT-NEXT: ld a3, 504(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add a3, a1, a3 +; NOREMAT-NEXT: add a5, a1, t6 +; NOREMAT-NEXT: add a6, a1, s2 +; NOREMAT-NEXT: add t0, a1, s4 +; NOREMAT-NEXT: add t1, a1, s5 +; NOREMAT-NEXT: add t3, a1, s6 +; NOREMAT-NEXT: add t4, a1, s8 +; NOREMAT-NEXT: add t5, a1, s9 +; NOREMAT-NEXT: add t6, a1, s10 +; NOREMAT-NEXT: add s0, a1, ra +; NOREMAT-NEXT: ld s2, 624(sp) # 8-byte Folded 
Reload +; NOREMAT-NEXT: add s2, a1, s2 +; NOREMAT-NEXT: ld s4, 616(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add s4, a1, s4 +; NOREMAT-NEXT: ld s5, 608(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add s5, a1, s5 +; NOREMAT-NEXT: ld s6, 600(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add s6, a1, s6 +; NOREMAT-NEXT: ld s8, 592(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add s8, a1, s8 +; NOREMAT-NEXT: ld s9, 584(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add s9, a1, s9 +; NOREMAT-NEXT: ld s10, 576(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add s10, a1, s10 +; NOREMAT-NEXT: ld ra, 568(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 560(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 32(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 48(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 64(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 80(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 496(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 96(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 488(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 104(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 480(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 112(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 472(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 120(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 464(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 136(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 456(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 144(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 448(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 152(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 440(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 168(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 432(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 184(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 424(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 424(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 416(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 432(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 408(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 440(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 400(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 448(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 392(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 456(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 384(sp) # 
8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 464(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 472(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 368(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 480(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 360(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 488(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 352(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 496(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 344(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 504(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 336(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 512(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 328(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 520(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 320(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 528(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 312(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 536(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 304(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 544(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 296(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 552(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 288(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 560(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 280(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 568(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 192(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 576(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 176(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 584(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 160(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 592(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 128(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 600(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 608(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 616(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 624(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add ra, a1, s11 +; NOREMAT-NEXT: add s11, a1, s7 +; NOREMAT-NEXT: add s7, a1, s3 +; NOREMAT-NEXT: add s3, a1, s1 +; NOREMAT-NEXT: add s1, a1, t2 +; NOREMAT-NEXT: add t2, a1, a7 +; NOREMAT-NEXT: add a7, a1, a4 +; NOREMAT-NEXT: add a4, a1, a0 ; NOREMAT-NEXT: addi a0, a1, 1536 +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (a2) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (a3) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (a5) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; 
NOREMAT-NEXT: vse32.v v8, (a6) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 272(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (t0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (t1) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (t3) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (t4) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (t5) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (t6) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (s0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 264(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 256(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: vse32.v v8, (s2) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 248(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: vse32.v v8, (s4) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: lui a0, 1 -; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: vse32.v v8, (s5) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 240(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: vse32.v v8, (s6) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 232(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (s8) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (s9) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (s10) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: ld a0, 256(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 224(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 216(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 24(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 208(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 32(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 200(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 192(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 56(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: lui a0, 2 -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 64(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 184(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 80(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 
176(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 248(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 168(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 96(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 160(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 104(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 152(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 112(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 144(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 120(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 136(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: lui a0, 3 -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 144(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 128(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 152(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 120(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 240(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 112(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 168(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 104(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 184(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 96(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 424(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 88(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 432(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 80(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 440(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: lui a0, 4 -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 448(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 72(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 456(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 64(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 232(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 56(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add 
a0, a1, a0 +; NOREMAT-NEXT: ld a0, 464(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 472(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 480(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 488(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: ld a0, 496(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s1, a1, s1 -; NOREMAT-NEXT: vse32.v v8, (s1) -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: vse32.v v8, (ra) -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s11, a1, s11 -; NOREMAT-NEXT: vse32.v v8, (s11) -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s10, a1, s10 -; NOREMAT-NEXT: vse32.v v8, (s10) -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s9, a1, s9 -; NOREMAT-NEXT: vse32.v v8, (s9) -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s8, a1, s8 -; NOREMAT-NEXT: vse32.v v8, (s8) -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s7, a1, s7 -; NOREMAT-NEXT: vse32.v v8, (s7) -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s6, a1, s6 -; NOREMAT-NEXT: vse32.v v8, (s6) +; NOREMAT-NEXT: ld a0, 504(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add t0, a1, t0 -; NOREMAT-NEXT: vse32.v v8, (t0) +; NOREMAT-NEXT: ld a0, 512(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s5, a1, s5 -; NOREMAT-NEXT: vse32.v v8, (s5) +; NOREMAT-NEXT: ld a0, 224(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s4, a1, s4 -; NOREMAT-NEXT: vse32.v v8, (s4) +; NOREMAT-NEXT: ld a0, 520(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s3, a1, s3 -; NOREMAT-NEXT: vse32.v v8, (s3) +; NOREMAT-NEXT: ld a0, 528(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s2, a1, s2 -; NOREMAT-NEXT: vse32.v v8, (s2) +; NOREMAT-NEXT: ld a0, 536(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add s0, a1, s0 -; NOREMAT-NEXT: vse32.v v8, (s0) +; NOREMAT-NEXT: ld a0, 544(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add t6, a1, t6 -; NOREMAT-NEXT: vse32.v v8, (t6) +; NOREMAT-NEXT: ld a0, 552(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add t5, a1, t5 -; NOREMAT-NEXT: vse32.v v8, (t5) +; NOREMAT-NEXT: ld a0, 560(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add a3, a1, a3 -; NOREMAT-NEXT: vse32.v v8, (a3) +; NOREMAT-NEXT: ld a0, 
568(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add t4, a1, t4 -; NOREMAT-NEXT: vse32.v v8, (t4) +; NOREMAT-NEXT: ld a0, 216(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add t3, a1, t3 -; NOREMAT-NEXT: vse32.v v8, (t3) +; NOREMAT-NEXT: ld a0, 576(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add t2, a1, t2 -; NOREMAT-NEXT: vse32.v v8, (t2) +; NOREMAT-NEXT: ld a0, 584(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add t1, a1, t1 -; NOREMAT-NEXT: vse32.v v8, (t1) +; NOREMAT-NEXT: ld a0, 592(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add a7, a1, a7 -; NOREMAT-NEXT: vse32.v v8, (a7) +; NOREMAT-NEXT: ld a0, 600(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add a6, a1, a6 -; NOREMAT-NEXT: vse32.v v8, (a6) +; NOREMAT-NEXT: ld a0, 608(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: add a5, a1, a5 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; NOREMAT-NEXT: vse32.v v8, (a5) -; NOREMAT-NEXT: add a0, a1, a2 +; NOREMAT-NEXT: ld a0, 616(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: addiw a0, a2, 512 -; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 +; NOREMAT-NEXT: ld a0, 624(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: addiw a0, a2, 1024 -; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: addiw a0, a2, 1536 -; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 +; NOREMAT-NEXT: ld a0, 208(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: li a0, 17 -; NOREMAT-NEXT: slli a0, a0, 11 -; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: lui a0, 9 -; NOREMAT-NEXT: addiw a2, a0, -1536 -; NOREMAT-NEXT: add a2, a1, a2 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; NOREMAT-NEXT: vse32.v v8, (a2) -; NOREMAT-NEXT: addiw a2, a0, -1024 -; NOREMAT-NEXT: add a2, a1, a2 +; NOREMAT-NEXT: vse32.v v8, (ra) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a2) -; NOREMAT-NEXT: addiw a2, a0, -512 -; NOREMAT-NEXT: add a2, a1, a2 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; NOREMAT-NEXT: vse32.v v8, (a2) -; NOREMAT-NEXT: add a2, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (s11) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a2) -; NOREMAT-NEXT: addiw a2, a0, 512 -; NOREMAT-NEXT: add a2, a1, a2 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; NOREMAT-NEXT: vse32.v v8, (a2) -; NOREMAT-NEXT: addiw a2, a0, 1024 -; NOREMAT-NEXT: add a2, a1, a2 -; NOREMAT-NEXT: vse32.v v10, (a2) +; NOREMAT-NEXT: vse32.v v8, (s7) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: addiw a0, a0, 1536 -; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: li a0, 19 -; NOREMAT-NEXT: slli a0, a0, 11 -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (s3) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; 
NOREMAT-NEXT: lui a0, 10 -; NOREMAT-NEXT: addiw a2, a0, -1536 -; NOREMAT-NEXT: add a2, a1, a2 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; NOREMAT-NEXT: vse32.v v8, (a2) -; NOREMAT-NEXT: addiw a2, a0, -1024 -; NOREMAT-NEXT: add a2, a1, a2 +; NOREMAT-NEXT: vse32.v v8, (s1) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a2) -; NOREMAT-NEXT: addiw a2, a0, -512 -; NOREMAT-NEXT: add a2, a1, a2 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; NOREMAT-NEXT: vse32.v v8, (a2) -; NOREMAT-NEXT: add a2, a1, a0 -; NOREMAT-NEXT: vse32.v v10, (a2) -; NOREMAT-NEXT: addiw a0, a0, 512 -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (t2) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (a7) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: ld a0, 200(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v8, (a4) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: csrr a0, vlenb -; NOREMAT-NEXT: li a1, 6 -; NOREMAT-NEXT: mul a0, a0, a1 +; NOREMAT-NEXT: slli a0, a0, 1 ; NOREMAT-NEXT: add sp, sp, a0 -; NOREMAT-NEXT: .cfi_def_cfa sp, 400 -; NOREMAT-NEXT: ld ra, 392(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s0, 384(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s1, 376(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s2, 368(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s3, 360(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s4, 352(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s5, 344(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s6, 336(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s7, 328(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s8, 320(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s9, 312(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s10, 304(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s11, 296(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: .cfi_def_cfa sp, 752 +; NOREMAT-NEXT: ld ra, 744(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s0, 736(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s1, 728(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s2, 720(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s3, 712(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s4, 704(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s5, 696(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s6, 688(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s7, 680(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s8, 672(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s9, 664(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s10, 656(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s11, 648(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: .cfi_restore ra ; NOREMAT-NEXT: .cfi_restore s0 ; NOREMAT-NEXT: .cfi_restore s1 @@ -790,27 +888,27 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: .cfi_restore s9 ; NOREMAT-NEXT: .cfi_restore s10 ; NOREMAT-NEXT: .cfi_restore s11 -; NOREMAT-NEXT: addi sp, sp, 400 +; NOREMAT-NEXT: addi sp, sp, 752 ; NOREMAT-NEXT: .cfi_def_cfa_offset 0 ; NOREMAT-NEXT: ret ; ; REMAT-LABEL: test: ; REMAT: # %bb.0: -; REMAT-NEXT: addi sp, sp, -112 -; REMAT-NEXT: .cfi_def_cfa_offset 112 -; REMAT-NEXT: sd ra, 104(sp) # 8-byte Folded Spill -; REMAT-NEXT: sd s0, 96(sp) # 8-byte Folded Spill -; REMAT-NEXT: sd s1, 88(sp) # 8-byte Folded Spill -; REMAT-NEXT: sd s2, 80(sp) # 8-byte Folded Spill -; REMAT-NEXT: sd s3, 72(sp) # 8-byte Folded Spill -; REMAT-NEXT: sd s4, 64(sp) # 8-byte Folded Spill -; REMAT-NEXT: sd s5, 56(sp) # 8-byte Folded Spill -; REMAT-NEXT: sd s6, 48(sp) # 
8-byte Folded Spill -; REMAT-NEXT: sd s7, 40(sp) # 8-byte Folded Spill -; REMAT-NEXT: sd s8, 32(sp) # 8-byte Folded Spill -; REMAT-NEXT: sd s9, 24(sp) # 8-byte Folded Spill -; REMAT-NEXT: sd s10, 16(sp) # 8-byte Folded Spill -; REMAT-NEXT: sd s11, 8(sp) # 8-byte Folded Spill +; REMAT-NEXT: addi sp, sp, -544 +; REMAT-NEXT: .cfi_def_cfa_offset 544 +; REMAT-NEXT: sd ra, 536(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s0, 528(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s1, 520(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s2, 512(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s3, 504(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s4, 496(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s5, 488(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s6, 480(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s7, 472(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s8, 464(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s9, 456(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s10, 448(sp) # 8-byte Folded Spill +; REMAT-NEXT: sd s11, 440(sp) # 8-byte Folded Spill ; REMAT-NEXT: .cfi_offset ra, -8 ; REMAT-NEXT: .cfi_offset s0, -16 ; REMAT-NEXT: .cfi_offset s1, -24 @@ -824,730 +922,980 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: .cfi_offset s9, -88 ; REMAT-NEXT: .cfi_offset s10, -96 ; REMAT-NEXT: .cfi_offset s11, -104 -; REMAT-NEXT: li a2, 32 -; REMAT-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; REMAT-NEXT: vle32.v v8, (a0) -; REMAT-NEXT: addi a2, a0, 512 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: addi a2, a0, 1024 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: li a3, 18 +; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: sub sp, sp, a2 +; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 18 * vlenb +; REMAT-NEXT: li a4, 32 +; REMAT-NEXT: addi a5, a0, 512 +; REMAT-NEXT: addi a3, a0, 1024 ; REMAT-NEXT: addi a2, a0, 1536 +; REMAT-NEXT: li a6, 1 +; REMAT-NEXT: slli a6, a6, 11 +; REMAT-NEXT: li a7, 5 +; REMAT-NEXT: slli a7, a7, 9 +; REMAT-NEXT: li t0, 3 +; REMAT-NEXT: slli t0, t0, 10 +; REMAT-NEXT: li t1, 7 +; REMAT-NEXT: slli t1, t1, 9 +; REMAT-NEXT: lui t2, 1 +; REMAT-NEXT: li t3, 9 +; REMAT-NEXT: slli t3, t3, 9 +; REMAT-NEXT: li t4, 5 +; REMAT-NEXT: slli t4, t4, 10 +; REMAT-NEXT: li t5, 11 +; REMAT-NEXT: slli t5, t5, 9 +; REMAT-NEXT: li t6, 3 +; REMAT-NEXT: slli t6, t6, 11 +; REMAT-NEXT: li s0, 13 +; REMAT-NEXT: slli s0, s0, 9 +; REMAT-NEXT: li s1, 7 +; REMAT-NEXT: slli s1, s1, 10 +; REMAT-NEXT: li s2, 15 +; REMAT-NEXT: slli s2, s2, 9 +; REMAT-NEXT: lui s3, 2 +; REMAT-NEXT: li s4, 17 +; REMAT-NEXT: slli s4, s4, 9 +; REMAT-NEXT: li s5, 9 +; REMAT-NEXT: slli s5, s5, 10 +; REMAT-NEXT: li s6, 19 +; REMAT-NEXT: slli s6, s6, 9 +; REMAT-NEXT: li s7, 5 +; REMAT-NEXT: slli s7, s7, 11 +; REMAT-NEXT: li s8, 21 +; REMAT-NEXT: slli s8, s8, 9 +; REMAT-NEXT: li s9, 11 +; REMAT-NEXT: slli s9, s9, 10 +; REMAT-NEXT: li s10, 23 +; REMAT-NEXT: slli s10, s10, 9 +; REMAT-NEXT: lui s11, 3 +; REMAT-NEXT: li ra, 25 +; REMAT-NEXT: slli ra, ra, 9 +; REMAT-NEXT: vsetvli zero, a4, e32, m2, ta, ma +; REMAT-NEXT: vle32.v v8, (a5) +; REMAT-NEXT: li a4, 13 +; REMAT-NEXT: slli a4, a4, 10 +; REMAT-NEXT: vle32.v v10, (a3) +; REMAT-NEXT: vle32.v v12, (a3) +; REMAT-NEXT: li a3, 27 +; REMAT-NEXT: slli a3, a3, 9 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 +; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: add a2, a0, a6 +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: 
vle32.v v20, (a2) +; REMAT-NEXT: add a2, a0, a7 +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: add a2, a0, t0 +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: add a2, a0, t1 +; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: slli a2, a2, 4 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: add a2, a0, t2 +; REMAT-NEXT: vle32.v v4, (a0) +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: li a5, 14 +; REMAT-NEXT: mul a2, a2, a5 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: add a2, a0, t3 +; REMAT-NEXT: sf.vc.vv 3, 0, v4, v8 +; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10 +; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: add a2, a0, t4 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li a2, 1 -; REMAT-NEXT: slli a2, a2, 11 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 5 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v14 +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: add a2, a0, t5 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li a2, 3 -; REMAT-NEXT: slli a2, a2, 10 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 7 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: lui a2, 1 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: li a5, 12 +; REMAT-NEXT: mul a2, a2, a5 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: add a2, a0, t6 +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22 +; REMAT-NEXT: vle32.v v20, (a2) +; REMAT-NEXT: add a2, a0, s0 +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v24, v26 +; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: add a2, a0, s1 +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v28, v30 +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: add a2, a0, s2 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 9 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li a2, 5 -; REMAT-NEXT: slli a2, a2, 10 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: csrr a5, vlenb +; REMAT-NEXT: slli a5, a5, 4 +; REMAT-NEXT: add a5, sp, a5 +; REMAT-NEXT: addi a5, a5, 432 +; REMAT-NEXT: vl2r.v v12, (a5) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2 +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: add a2, a0, s3 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 11 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 
+; REMAT-NEXT: csrr a5, vlenb +; REMAT-NEXT: li a6, 14 +; REMAT-NEXT: mul a5, a5, a6 +; REMAT-NEXT: add a5, sp, a5 +; REMAT-NEXT: addi a5, a5, 432 +; REMAT-NEXT: vl2r.v v16, (a5) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 +; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: add a2, a0, s4 +; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10 +; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: add a2, a0, s5 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li a2, 3 -; REMAT-NEXT: slli a2, a2, 11 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 13 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v14 +; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: add a2, a0, s6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li a2, 7 -; REMAT-NEXT: slli a2, a2, 10 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 +; REMAT-NEXT: csrr a5, vlenb +; REMAT-NEXT: li a6, 12 +; REMAT-NEXT: mul a5, a5, a6 +; REMAT-NEXT: add a5, sp, a5 +; REMAT-NEXT: addi a5, a5, 432 +; REMAT-NEXT: vl2r.v v0, (a5) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18 +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: add a2, a0, s7 +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22 +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: add a2, a0, s8 +; REMAT-NEXT: vle32.v v20, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v24, v26 +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: add a2, a0, s9 +; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8 +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: add a2, a0, s10 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 15 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: lui a2, 2 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: sf.vc.vv 3, 0, v2, v12 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 17 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li a2, 9 -; REMAT-NEXT: slli a2, a2, 10 -; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: slli a2, a2, 3 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v12, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: add a2, a0, s11 ; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 19 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 +; REMAT-NEXT: sf.vc.vv 3, 0, v30, v16 +; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: add a2, a0, ra +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li a2, 5 -; REMAT-NEXT: slli a2, a2, 11 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: li a2, 21 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 +; REMAT-NEXT: csrr a2, 
vlenb +; REMAT-NEXT: slli a2, a2, 1 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: add a2, a0, a4 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: li a2, 11 -; REMAT-NEXT: slli a2, a2, 10 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v26, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: li a2, 23 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v4, v14 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: lui a2, 3 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v30, (a2) -; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: li a2, 25 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: li a2, 13 -; REMAT-NEXT: slli a2, a2, 10 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: li a2, 27 -; REMAT-NEXT: slli a2, a2, 9 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v2, (a2) -; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: li a2, 7 -; REMAT-NEXT: slli a2, a2, 11 -; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v26 +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: slli a2, a2, 2 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: add a2, a0, a3 +; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18 +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: slli a2, a2, 4 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: li a5, 7 +; REMAT-NEXT: slli a5, a5, 11 +; REMAT-NEXT: add a2, a0, a5 +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: addi a3, sp, 432 +; REMAT-NEXT: vs2r.v v18, (a3) # Unknown-size Folded Spill +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v20 +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: li a3, 14 +; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: li a2, 29 ; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v26, (a2) -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v12, v28 +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v26, v24 +; REMAT-NEXT: vle32.v v20, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: li a3, 12 +; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v20, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: li a2, 15 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v28, (a2) -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v14, v30 +; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8 +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: li a3, 10 +; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: li a2, 31 ; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v30, (a2) -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v16, v6 +; REMAT-NEXT: vle32.v v6, 
(a2) +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: slli a3, a3, 3 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v12 +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: slli a2, a2, 3 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: lui a2, 4 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v18, v4 +; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v2 +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: li a3, 6 +; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: lui a2, 4 ; REMAT-NEXT: addiw a2, a2, 512 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v20, v2 +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: slli a3, a3, 1 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10 +; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: li a2, 17 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v2, (a2) -; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: slli a3, a3, 2 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 +; REMAT-NEXT: vle32.v v22, (a2) ; REMAT-NEXT: lui a2, 4 ; REMAT-NEXT: addiw a2, a2, 1536 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: slli a3, a3, 4 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload +; REMAT-NEXT: addi a3, sp, 432 +; REMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10 +; REMAT-NEXT: vle32.v v8, (a2) ; REMAT-NEXT: li a2, 9 ; REMAT-NEXT: slli a2, a2, 11 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v26, (a2) -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: li a4, 14 +; REMAT-NEXT: mul a3, a3, a4 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v18 +; REMAT-NEXT: vle32.v v10, (a2) ; REMAT-NEXT: lui a2, 5 ; REMAT-NEXT: addiw a2, a2, -1536 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v28, (a2) -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: li a4, 12 +; REMAT-NEXT: mul a3, a3, a4 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v12, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 +; REMAT-NEXT: vle32.v v12, (a2) ; REMAT-NEXT: li a2, 19 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v30, (a2) -; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: li a4, 10 +; REMAT-NEXT: mul a3, a3, a4 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: 
addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v14, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 -; REMAT-NEXT: lui ra, 5 -; REMAT-NEXT: addiw ra, ra, -512 -; REMAT-NEXT: add a2, a0, ra -; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: slli a3, a3, 3 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v16, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 -; REMAT-NEXT: lui s11, 5 -; REMAT-NEXT: add a2, a0, s11 -; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: csrr a3, vlenb +; REMAT-NEXT: li a4, 6 +; REMAT-NEXT: mul a3, a3, a4 +; REMAT-NEXT: add a3, sp, a3 +; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: vl2r.v v18, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 -; REMAT-NEXT: lui s10, 5 -; REMAT-NEXT: addiw s10, s10, 512 -; REMAT-NEXT: add a2, a0, s10 -; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v20, v24 -; REMAT-NEXT: li s9, 21 -; REMAT-NEXT: slli s9, s9, 10 -; REMAT-NEXT: add a2, a0, s9 -; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, 512 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v22, v26 -; REMAT-NEXT: lui s8, 5 -; REMAT-NEXT: addiw s8, s8, 1536 -; REMAT-NEXT: add a2, a0, s8 -; REMAT-NEXT: vle32.v v26, (a2) -; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v28 -; REMAT-NEXT: li s7, 11 -; REMAT-NEXT: slli s7, s7, 11 +; REMAT-NEXT: li s7, 21 +; REMAT-NEXT: slli s7, s7, 10 ; REMAT-NEXT: add a2, a0, s7 -; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: lui s4, 5 +; REMAT-NEXT: addiw s4, s4, 1536 +; REMAT-NEXT: add a2, a0, s4 +; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v30 -; REMAT-NEXT: lui s6, 6 -; REMAT-NEXT: addiw s6, s6, -1536 -; REMAT-NEXT: add a2, a0, s6 -; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: li a2, 11 +; REMAT-NEXT: slli a2, a2, 11 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v12, v6 -; REMAT-NEXT: li s5, 23 -; REMAT-NEXT: slli s5, s5, 10 -; REMAT-NEXT: add a2, a0, s5 -; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v14, v4 -; REMAT-NEXT: lui s4, 6 -; REMAT-NEXT: addiw s4, s4, -512 -; REMAT-NEXT: add a2, a0, s4 -; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v16, v2 ; REMAT-NEXT: lui s3, 6 +; REMAT-NEXT: addiw s3, s3, -1536 ; REMAT-NEXT: add a2, a0, s3 -; REMAT-NEXT: vle32.v v2, (a2) -; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v18, v24 -; REMAT-NEXT: lui s2, 6 -; REMAT-NEXT: addiw s2, s2, 512 +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 +; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: li s2, 23 +; REMAT-NEXT: slli s2, s2, 10 ; REMAT-NEXT: add a2, a0, s2 -; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: vle32.v v30, (a2) +; 
REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 +; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: lui a2, 6 +; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 +; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: lui a2, 6 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui s1, 6 +; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v20, v26 -; REMAT-NEXT: li s1, 25 -; REMAT-NEXT: slli s1, s1, 10 -; REMAT-NEXT: add a2, a0, s1 -; REMAT-NEXT: vle32.v v26, (a2) -; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v22, v28 ; REMAT-NEXT: lui s0, 6 -; REMAT-NEXT: addiw s0, s0, 1536 +; REMAT-NEXT: addiw s0, s0, 512 ; REMAT-NEXT: add a2, a0, s0 -; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 +; REMAT-NEXT: vle32.v v20, (a2) +; REMAT-NEXT: li a2, 25 +; REMAT-NEXT: slli a2, a2, 10 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v30 -; REMAT-NEXT: li t6, 13 -; REMAT-NEXT: slli t6, t6, 11 +; REMAT-NEXT: lui t6, 6 +; REMAT-NEXT: addiw t6, t6, 1536 ; REMAT-NEXT: add a2, a0, t6 -; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v6 -; REMAT-NEXT: lui t5, 7 -; REMAT-NEXT: addiw t5, t5, -1536 +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: li t5, 13 +; REMAT-NEXT: slli t5, t5, 11 ; REMAT-NEXT: add a2, a0, t5 -; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v12, v4 +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: addiw a2, a2, -1536 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 +; REMAT-NEXT: vle32.v v12, (a2) ; REMAT-NEXT: li t4, 27 ; REMAT-NEXT: slli t4, t4, 10 ; REMAT-NEXT: add a2, a0, t4 -; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v14, v2 -; REMAT-NEXT: lui t3, 7 -; REMAT-NEXT: addiw t3, t3, -512 -; REMAT-NEXT: add a2, a0, t3 -; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v16, v0 -; REMAT-NEXT: lui t2, 7 -; REMAT-NEXT: add a2, a0, t2 -; REMAT-NEXT: vle32.v v0, (a2) -; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v18, v26 -; REMAT-NEXT: lui t1, 7 -; REMAT-NEXT: addiw t1, t1, 512 -; REMAT-NEXT: add a2, a0, t1 +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: addiw a2, a2, -512 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: lui a2, 7 +; REMAT-NEXT: add a2, a0, a2 +; REMAT-NEXT: lui t3, 7 +; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v20, v28 -; REMAT-NEXT: li t0, 29 -; REMAT-NEXT: slli t0, t0, 10 -; REMAT-NEXT: add a2, a0, t0 +; REMAT-NEXT: lui t2, 7 +; REMAT-NEXT: addiw t2, t2, 512 +; REMAT-NEXT: add a2, a0, t2 +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: vle32.v v26, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v22, v30 -; REMAT-NEXT: lui a7, 7 -; REMAT-NEXT: addiw a7, a7, 1536 -; REMAT-NEXT: add a2, a0, a7 +; REMAT-NEXT: li t1, 29 
+; REMAT-NEXT: slli t1, t1, 10 +; REMAT-NEXT: add a2, a0, t1 +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: vle32.v v28, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v24, v6 -; REMAT-NEXT: li a6, 15 -; REMAT-NEXT: slli a6, a6, 11 -; REMAT-NEXT: add a2, a0, a6 +; REMAT-NEXT: lui t0, 7 +; REMAT-NEXT: addiw t0, t0, 1536 +; REMAT-NEXT: add a2, a0, t0 ; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: vle32.v v30, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v4 -; REMAT-NEXT: lui a5, 8 -; REMAT-NEXT: addiw a5, a5, -1536 -; REMAT-NEXT: add a2, a0, a5 +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: li a7, 15 +; REMAT-NEXT: slli a7, a7, 11 +; REMAT-NEXT: add a2, a0, a7 +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2 +; REMAT-NEXT: lui a6, 8 +; REMAT-NEXT: addiw a6, a6, -1536 +; REMAT-NEXT: add a2, a0, a6 +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 +; REMAT-NEXT: vle32.v v12, (a2) ; REMAT-NEXT: li a4, 31 ; REMAT-NEXT: slli a4, a4, 10 ; REMAT-NEXT: add a2, a0, a4 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v14, v0 +; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 +; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: lui a3, 8 ; REMAT-NEXT: addiw a3, a3, -512 ; REMAT-NEXT: add a2, a0, a3 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 +; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: lui a2, 8 ; REMAT-NEXT: add a0, a0, a2 -; REMAT-NEXT: vle32.v v0, (a0) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v16 -; REMAT-NEXT: sf.vc.vv 3, 0, v18, v20 -; REMAT-NEXT: sf.vc.vv 3, 0, v26, v22 -; REMAT-NEXT: sf.vc.vv 3, 0, v28, v24 -; REMAT-NEXT: sf.vc.vv 3, 0, v30, v10 -; REMAT-NEXT: sf.vc.vv 3, 0, v6, v12 -; REMAT-NEXT: sf.vc.vv 3, 0, v4, v14 -; REMAT-NEXT: sf.vc.vv 3, 0, v2, v0 +; REMAT-NEXT: vle32.v v4, (a0) +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 +; REMAT-NEXT: sf.vc.vv 3, 0, v14, v6 +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: addi a0, a1, 1024 ; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: addi a0, a1, 1536 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 1 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 416(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 5 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 408(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 3 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 400(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 7 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 392(sp) # 8-byte Folded Spill ; REMAT-NEXT: lui a0, 1 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: 
sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 384(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 9 ; REMAT-NEXT: slli a0, a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 5 -; REMAT-NEXT: slli a0, a0, 10 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 11 -; REMAT-NEXT: slli a0, a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 3 -; REMAT-NEXT: slli a0, a0, 11 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 13 -; REMAT-NEXT: slli a0, a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 7 -; REMAT-NEXT: slli a0, a0, 10 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 15 -; REMAT-NEXT: slli a0, a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: lui a0, 2 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 17 -; REMAT-NEXT: slli a0, a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 9 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 376(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 5 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 19 +; REMAT-NEXT: sd a0, 368(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 11 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 5 +; REMAT-NEXT: sd a0, 360(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 3 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 21 +; REMAT-NEXT: sd a0, 352(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 13 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 11 +; REMAT-NEXT: sd a0, 344(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 7 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 23 +; REMAT-NEXT: sd a0, 336(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 15 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: lui a0, 3 +; REMAT-NEXT: sd a0, 328(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 2 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 25 +; REMAT-NEXT: sd a0, 320(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 17 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 312(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s5, a1, s5 +; REMAT-NEXT: sd s5, 304(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s6, a1, s6 +; REMAT-NEXT: sd s6, 296(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 5 +; REMAT-NEXT: slli a0, a0, 11 +; REMAT-NEXT: add a0, 
a1, a0 +; REMAT-NEXT: sd a0, 288(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s8, a1, s8 +; REMAT-NEXT: sd s8, 280(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s9, a1, s9 +; REMAT-NEXT: sd s9, 272(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s10, a1, s10 +; REMAT-NEXT: sd s10, 264(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s11, a1, s11 +; REMAT-NEXT: sd s11, 256(sp) # 8-byte Folded Spill +; REMAT-NEXT: add ra, a1, ra +; REMAT-NEXT: sd ra, 248(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 13 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 240(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 27 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: li a0, 7 -; REMAT-NEXT: slli a0, a0, 11 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 232(sp) # 8-byte Folded Spill +; REMAT-NEXT: add a5, a1, a5 +; REMAT-NEXT: sd a5, 224(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 29 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 216(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 15 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 208(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 31 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 200(sp) # 8-byte Folded Spill ; REMAT-NEXT: lui a0, 4 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 192(sp) # 8-byte Folded Spill ; REMAT-NEXT: lui a0, 4 ; REMAT-NEXT: addiw a0, a0, 512 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 184(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 17 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 176(sp) # 8-byte Folded Spill ; REMAT-NEXT: lui a0, 4 ; REMAT-NEXT: addiw a0, a0, 1536 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 168(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 9 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 160(sp) # 8-byte Folded Spill ; REMAT-NEXT: lui a0, 5 ; REMAT-NEXT: addiw a0, a0, -1536 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 152(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 19 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: sd a0, 144(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 5 +; REMAT-NEXT: addiw a0, a0, -512 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 136(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 5 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 128(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 5 +; REMAT-NEXT: addiw a0, a0, 512 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 120(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s7, a1, s7 +; 
REMAT-NEXT: sd s7, 112(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s4, a1, s4 +; REMAT-NEXT: sd s4, 104(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 11 +; REMAT-NEXT: slli a0, a0, 11 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 96(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s3, a1, s3 +; REMAT-NEXT: sd s3, 88(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s2, a1, s2 +; REMAT-NEXT: sd s2, 80(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 6 +; REMAT-NEXT: addiw a0, a0, -512 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 72(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s1, a1, s1 +; REMAT-NEXT: sd s1, 64(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s0, a1, s0 +; REMAT-NEXT: sd s0, 56(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 25 +; REMAT-NEXT: slli a0, a0, 10 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 48(sp) # 8-byte Folded Spill +; REMAT-NEXT: add t6, a1, t6 +; REMAT-NEXT: sd t6, 40(sp) # 8-byte Folded Spill +; REMAT-NEXT: add t5, a1, t5 +; REMAT-NEXT: sd t5, 32(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 7 +; REMAT-NEXT: addiw a0, a0, -1536 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 24(sp) # 8-byte Folded Spill +; REMAT-NEXT: add t4, a1, t4 +; REMAT-NEXT: sd t4, 16(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui ra, 7 +; REMAT-NEXT: addiw ra, ra, -512 ; REMAT-NEXT: add ra, a1, ra +; REMAT-NEXT: add s11, a1, t3 +; REMAT-NEXT: add s10, a1, t2 +; REMAT-NEXT: add s9, a1, t1 +; REMAT-NEXT: add s8, a1, t0 +; REMAT-NEXT: add s7, a1, a7 +; REMAT-NEXT: add s6, a1, a6 +; REMAT-NEXT: add s5, a1, a4 +; REMAT-NEXT: add s4, a1, a3 +; REMAT-NEXT: add s3, a1, a2 +; REMAT-NEXT: lui s2, 8 +; REMAT-NEXT: addiw s2, s2, 512 +; REMAT-NEXT: add s2, a1, s2 +; REMAT-NEXT: lui s1, 8 +; REMAT-NEXT: addiw s1, s1, 1024 +; REMAT-NEXT: add s1, a1, s1 +; REMAT-NEXT: lui s0, 8 +; REMAT-NEXT: addiw s0, s0, 1536 +; REMAT-NEXT: add s0, a1, s0 +; REMAT-NEXT: li t6, 17 +; REMAT-NEXT: slli t6, t6, 11 +; REMAT-NEXT: add t6, a1, t6 +; REMAT-NEXT: lui t5, 9 +; REMAT-NEXT: addiw t5, t5, -1536 +; REMAT-NEXT: add t5, a1, t5 +; REMAT-NEXT: lui t4, 9 +; REMAT-NEXT: addiw t4, t4, -1024 +; REMAT-NEXT: add t4, a1, t4 +; REMAT-NEXT: lui t3, 9 +; REMAT-NEXT: addiw t3, t3, -512 +; REMAT-NEXT: add t3, a1, t3 +; REMAT-NEXT: lui t2, 9 +; REMAT-NEXT: add t2, a1, t2 +; REMAT-NEXT: lui t1, 9 +; REMAT-NEXT: addiw t1, t1, 512 +; REMAT-NEXT: add t1, a1, t1 +; REMAT-NEXT: lui t0, 9 +; REMAT-NEXT: addiw t0, t0, 1024 +; REMAT-NEXT: add t0, a1, t0 +; REMAT-NEXT: lui a7, 9 +; REMAT-NEXT: addiw a7, a7, 1536 +; REMAT-NEXT: add a7, a1, a7 +; REMAT-NEXT: li a6, 19 +; REMAT-NEXT: slli a6, a6, 11 +; REMAT-NEXT: add a6, a1, a6 +; REMAT-NEXT: lui a5, 10 +; REMAT-NEXT: addiw a5, a5, -1536 +; REMAT-NEXT: add a5, a1, a5 +; REMAT-NEXT: lui a4, 10 +; REMAT-NEXT: addiw a4, a4, -1024 +; REMAT-NEXT: add a4, a1, a4 +; REMAT-NEXT: lui a3, 10 +; REMAT-NEXT: addiw a3, a3, -512 +; REMAT-NEXT: add a3, a1, a3 +; REMAT-NEXT: lui a2, 10 +; REMAT-NEXT: add a2, a1, a2 +; REMAT-NEXT: lui a0, 10 +; REMAT-NEXT: addiw a0, a0, 512 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: addi a1, a1, 1536 +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 416(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 408(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 400(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 
0, v8, 0 +; REMAT-NEXT: ld a1, 392(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 384(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 376(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 368(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 360(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 352(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 344(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 336(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 328(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 320(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 312(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 304(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 296(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 288(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 280(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 272(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 264(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 256(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 248(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 240(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 232(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 224(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 216(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 208(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 200(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 192(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 184(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 176(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 168(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; 
REMAT-NEXT: ld a1, 160(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 152(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 144(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 136(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 128(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 120(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 112(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 104(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 96(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 88(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 80(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 72(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 64(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 56(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 48(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 40(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 32(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 24(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; REMAT-NEXT: ld a1, 16(sp) # 8-byte Folded Reload +; REMAT-NEXT: vse32.v v8, (a1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: vse32.v v8, (ra) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s11, a1, s11 ; REMAT-NEXT: vse32.v v8, (s11) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s10, a1, s10 ; REMAT-NEXT: vse32.v v8, (s10) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s9, a1, s9 ; REMAT-NEXT: vse32.v v8, (s9) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s8, a1, s8 ; REMAT-NEXT: vse32.v v8, (s8) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s7, a1, s7 ; REMAT-NEXT: vse32.v v8, (s7) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s6, a1, s6 ; REMAT-NEXT: vse32.v v8, (s6) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s5, a1, s5 ; REMAT-NEXT: vse32.v v8, (s5) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s4, a1, s4 ; REMAT-NEXT: vse32.v v8, (s4) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s3, a1, s3 ; REMAT-NEXT: vse32.v v8, (s3) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s2, a1, s2 ; REMAT-NEXT: vse32.v v8, (s2) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s1, a1, s1 ; REMAT-NEXT: vse32.v v8, (s1) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add s0, a1, s0 ; REMAT-NEXT: vse32.v v8, (s0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; 
REMAT-NEXT: add t6, a1, t6 ; REMAT-NEXT: vse32.v v8, (t6) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add t5, a1, t5 ; REMAT-NEXT: vse32.v v8, (t5) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add t4, a1, t4 ; REMAT-NEXT: vse32.v v8, (t4) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add t3, a1, t3 ; REMAT-NEXT: vse32.v v8, (t3) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add t2, a1, t2 ; REMAT-NEXT: vse32.v v8, (t2) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add t1, a1, t1 ; REMAT-NEXT: vse32.v v8, (t1) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add t0, a1, t0 ; REMAT-NEXT: vse32.v v8, (t0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add a7, a1, a7 ; REMAT-NEXT: vse32.v v8, (a7) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add a6, a1, a6 ; REMAT-NEXT: vse32.v v8, (a6) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add a5, a1, a5 ; REMAT-NEXT: vse32.v v8, (a5) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add a4, a1, a4 ; REMAT-NEXT: vse32.v v8, (a4) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add a3, a1, a3 ; REMAT-NEXT: vse32.v v8, (a3) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: add a2, a1, a2 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a2) -; REMAT-NEXT: lui a0, 8 -; REMAT-NEXT: addiw a0, a0, 512 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) -; REMAT-NEXT: lui a0, 8 -; REMAT-NEXT: addiw a0, a0, 1024 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: lui a0, 8 -; REMAT-NEXT: addiw a0, a0, 1536 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) -; REMAT-NEXT: li a0, 17 -; REMAT-NEXT: slli a0, a0, 11 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: lui a0, 9 -; REMAT-NEXT: addiw a0, a0, -1536 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) -; REMAT-NEXT: lui a0, 9 -; REMAT-NEXT: addiw a0, a0, -1024 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: lui a0, 9 -; REMAT-NEXT: addiw a0, a0, -512 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) -; REMAT-NEXT: lui a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: lui a0, 9 -; REMAT-NEXT: addiw a0, a0, 512 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) -; REMAT-NEXT: lui a0, 9 -; REMAT-NEXT: addiw a0, a0, 1024 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: lui a0, 9 -; REMAT-NEXT: addiw a0, a0, 1536 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) -; REMAT-NEXT: li a0, 19 -; REMAT-NEXT: slli a0, a0, 11 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: lui a0, 10 -; REMAT-NEXT: addiw a0, a0, -1536 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) -; REMAT-NEXT: lui a0, 10 -; REMAT-NEXT: addiw a0, a0, -1024 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: lui a0, 10 -; REMAT-NEXT: addiw a0, a0, -512 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: 
sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) -; REMAT-NEXT: lui a0, 10 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: vse32.v v8, (a0) -; REMAT-NEXT: lui a0, 10 -; REMAT-NEXT: addiw a0, a0, 512 -; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: ld ra, 104(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s0, 96(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s1, 88(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s2, 80(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s3, 72(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s4, 64(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s5, 56(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s6, 48(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s7, 40(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s8, 32(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s9, 24(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s10, 16(sp) # 8-byte Folded Reload -; REMAT-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; REMAT-NEXT: csrr a0, vlenb +; REMAT-NEXT: li a1, 18 +; REMAT-NEXT: mul a0, a0, a1 +; REMAT-NEXT: add sp, sp, a0 +; REMAT-NEXT: .cfi_def_cfa sp, 544 +; REMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s0, 528(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s1, 520(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s2, 512(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s3, 504(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s4, 496(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s5, 488(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s6, 480(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s7, 472(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s8, 464(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s9, 456(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s10, 448(sp) # 8-byte Folded Reload +; REMAT-NEXT: ld s11, 440(sp) # 8-byte Folded Reload ; REMAT-NEXT: .cfi_restore ra ; REMAT-NEXT: .cfi_restore s0 ; REMAT-NEXT: .cfi_restore s1 @@ -1561,7 +1909,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: .cfi_restore s9 ; REMAT-NEXT: .cfi_restore s10 ; REMAT-NEXT: .cfi_restore s11 -; REMAT-NEXT: addi sp, sp, 112 +; REMAT-NEXT: addi sp, sp, 544 ; REMAT-NEXT: .cfi_def_cfa_offset 0 ; REMAT-NEXT: ret %4 = tail call i64 @llvm.riscv.vsetvli.i64(i64 32, i64 2, i64 1) diff --git a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll index f9b9c8a69d431..b1bba5fdc9211 100644 --- a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll +++ b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll @@ -8,13 +8,13 @@ define i1 @pr84653(i32 %x) { ; CHECK-NOZBB-LABEL: pr84653: ; CHECK-NOZBB: # %bb.0: ; CHECK-NOZBB-NEXT: sext.w a1, a0 -; CHECK-NOZBB-NEXT: sgtz a2, a1 -; CHECK-NOZBB-NEXT: lui a3, 524288 -; CHECK-NOZBB-NEXT: addi a3, a3, -1 -; CHECK-NOZBB-NEXT: xor a0, a0, a3 +; CHECK-NOZBB-NEXT: lui a2, 524288 +; CHECK-NOZBB-NEXT: sgtz a3, a1 +; CHECK-NOZBB-NEXT: addi a2, a2, -1 +; CHECK-NOZBB-NEXT: xor a0, a0, a2 ; CHECK-NOZBB-NEXT: sext.w a0, a0 ; CHECK-NOZBB-NEXT: slt a0, a0, a1 -; CHECK-NOZBB-NEXT: and a0, a2, a0 +; CHECK-NOZBB-NEXT: and a0, a3, a0 ; CHECK-NOZBB-NEXT: ret ; ; CHECK-ZBB-LABEL: pr84653: @@ -69,13 +69,13 @@ define i1 @select_to_or(i32 %x) { ; CHECK-NOZBB-LABEL: select_to_or: ; CHECK-NOZBB: # %bb.0: ; CHECK-NOZBB-NEXT: sext.w a1, a0 -; CHECK-NOZBB-NEXT: sgtz a2, a1 -; CHECK-NOZBB-NEXT: lui a3, 524288 -; CHECK-NOZBB-NEXT: addi a3, a3, -1 -; CHECK-NOZBB-NEXT: xor a0, a0, a3 +; CHECK-NOZBB-NEXT: lui a2, 524288 +; CHECK-NOZBB-NEXT: sgtz a3, a1 +; 
CHECK-NOZBB-NEXT: addi a2, a2, -1 +; CHECK-NOZBB-NEXT: xor a0, a0, a2 ; CHECK-NOZBB-NEXT: sext.w a0, a0 ; CHECK-NOZBB-NEXT: slt a0, a0, a1 -; CHECK-NOZBB-NEXT: or a0, a2, a0 +; CHECK-NOZBB-NEXT: or a0, a3, a0 ; CHECK-NOZBB-NEXT: ret ; ; CHECK-ZBB-LABEL: select_to_or: diff --git a/llvm/test/CodeGen/RISCV/pr95271.ll b/llvm/test/CodeGen/RISCV/pr95271.ll index 950e6fb5f37ec..aa941cb803627 100644 --- a/llvm/test/CodeGen/RISCV/pr95271.ll +++ b/llvm/test/CodeGen/RISCV/pr95271.ll @@ -6,22 +6,22 @@ define i32 @PR95271(ptr %p) { ; RV32I-LABEL: PR95271: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lui a1, 349525 +; RV32I-NEXT: addi a1, a1, 1365 ; RV32I-NEXT: addi a0, a0, 1 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: and a1, a0, a2 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: addi a1, a2, -241 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: slli a1, a0, 8 ; RV32I-NEXT: add a0, a0, a1 @@ -33,23 +33,23 @@ define i32 @PR95271(ptr %p) { ; RV64I-LABEL: PR95271: ; RV64I: # %bb.0: ; RV64I-NEXT: lw a0, 0(a0) -; RV64I-NEXT: addiw a1, a0, 1 -; RV64I-NEXT: addi a0, a0, 1 -; RV64I-NEXT: srli a0, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: addi a2, a0, 1 +; RV64I-NEXT: srli a2, a2, 1 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: addiw a0, a0, 1 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: and a1, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: sub a1, a1, a0 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: and a2, a1, a0 -; RV64I-NEXT: srli a1, a1, 2 -; RV64I-NEXT: and a0, a1, a0 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll index dd270fa12183e..fb0c11e930b3b 100644 --- a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll +++ b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll @@ -24,6 +24,8 @@ define void @last_chance_recoloring_failure() { ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 16 * vlenb ; CHECK-NEXT: li a0, 55 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vloxseg2ei32.v v16, (a1), v8 ; CHECK-NEXT: csrr a0, vlenb @@ -35,8 +37,6 @@ define void @last_chance_recoloring_failure() { ; CHECK-NEXT: vs4r.v v16, (a0) # Unknown-size 
Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs4r.v v20, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li s0, 36 ; CHECK-NEXT: vsetvli zero, s0, e16, m4, ta, ma ; CHECK-NEXT: vfwadd.vv v16, v8, v12, v0.t @@ -84,6 +84,8 @@ define void @last_chance_recoloring_failure() { ; SUBREGLIVENESS-NEXT: sub sp, sp, a0 ; SUBREGLIVENESS-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 16 * vlenb ; SUBREGLIVENESS-NEXT: li a0, 55 +; SUBREGLIVENESS-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; SUBREGLIVENESS-NEXT: vmclr.m v0 ; SUBREGLIVENESS-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; SUBREGLIVENESS-NEXT: vloxseg2ei32.v v16, (a1), v8 ; SUBREGLIVENESS-NEXT: csrr a0, vlenb @@ -95,8 +97,6 @@ define void @last_chance_recoloring_failure() { ; SUBREGLIVENESS-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill ; SUBREGLIVENESS-NEXT: add a0, a0, a1 ; SUBREGLIVENESS-NEXT: vs4r.v v20, (a0) # Unknown-size Folded Spill -; SUBREGLIVENESS-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; SUBREGLIVENESS-NEXT: vmclr.m v0 ; SUBREGLIVENESS-NEXT: li s0, 36 ; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e16, m4, ta, ma ; SUBREGLIVENESS-NEXT: vfwadd.vv v16, v8, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rem.ll b/llvm/test/CodeGen/RISCV/rem.ll index 5b27c4129df6a..2001262008237 100644 --- a/llvm/test/CodeGen/RISCV/rem.ll +++ b/llvm/test/CodeGen/RISCV/rem.ll @@ -23,8 +23,8 @@ define i32 @urem(i32 %a, i32 %b) nounwind { ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: srli a1, a1, 32 ; RV64I-NEXT: call __umoddi3 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -452,8 +452,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: call __modsi3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -463,8 +463,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind { ; RV32IM-LABEL: srem8: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a1, 24 -; RV32IM-NEXT: srai a1, a1, 24 ; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a1, a1, 24 ; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: rem a0, a0, a1 ; RV32IM-NEXT: ret @@ -474,8 +474,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: slli a0, a0, 56 -; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -485,8 +485,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind { ; RV64IM-LABEL: srem8: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a1, 56 -; RV64IM-NEXT: srai a1, a1, 56 ; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a1, a1, 56 ; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: remw a0, a0, a1 ; RV64IM-NEXT: ret @@ -637,8 +637,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: slli a0, a0, 16 -; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: call __modsi3 ; 
RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -648,8 +648,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind { ; RV32IM-LABEL: srem16: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a1, 16 -; RV32IM-NEXT: srai a1, a1, 16 ; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a1, a1, 16 ; RV32IM-NEXT: srai a0, a0, 16 ; RV32IM-NEXT: rem a0, a0, a1 ; RV32IM-NEXT: ret @@ -659,8 +659,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: slli a0, a0, 48 -; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -670,8 +670,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind { ; RV64IM-LABEL: srem16: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a1, 48 -; RV64IM-NEXT: srai a1, a1, 48 ; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a1, a1, 48 ; RV64IM-NEXT: srai a0, a0, 48 ; RV64IM-NEXT: remw a0, a0, a1 ; RV64IM-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll index 352184c2d85ad..32261ee47164e 100644 --- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll +++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll @@ -64,11 +64,11 @@ define void @test2(ptr nocapture noundef %a, i32 noundef signext %n) { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: lw a5, -4(a4) ; CHECK-NEXT: lw a6, 0(a4) +; CHECK-NEXT: addi a3, a3, 2 ; CHECK-NEXT: addi a5, a5, 4 ; CHECK-NEXT: addi a6, a6, 4 ; CHECK-NEXT: sw a5, -4(a4) ; CHECK-NEXT: sw a6, 0(a4) -; CHECK-NEXT: addi a3, a3, 2 ; CHECK-NEXT: addi a4, a4, 8 ; CHECK-NEXT: bne a1, a3, .LBB1_4 ; CHECK-NEXT: .LBB1_5: # %for.cond.cleanup.loopexit.unr-lcssa diff --git a/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll b/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll index 4901e268ec11a..c1e7b682200eb 100644 --- a/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll +++ b/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll @@ -6,13 +6,13 @@ define void @test(ptr nocapture noundef writeonly %array1, i32 noundef signext % ; RV64-LABEL: test: ; RV64: # %bb.0: # %entry ; RV64-NEXT: addiw a3, a1, 5 -; RV64-NEXT: slli a4, a3, 2 -; RV64-NEXT: add a4, a0, a4 ; RV64-NEXT: slli a1, a1, 2 -; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: sw a2, 0(a4) -; RV64-NEXT: sw a2, 24(a0) -; RV64-NEXT: sw a3, 140(a0) +; RV64-NEXT: slli a4, a3, 2 +; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: sw a2, 0(a0) +; RV64-NEXT: sw a2, 24(a1) +; RV64-NEXT: sw a3, 140(a1) ; RV64-NEXT: ret entry: %add = add nsw i32 %a, 5 @@ -70,13 +70,13 @@ define void @test2(ptr nocapture noundef writeonly %array1, i64 noundef %a, i64 ; RV64-LABEL: test2: ; RV64: # %bb.0: # %entry ; RV64-NEXT: addi a3, a1, 5 -; RV64-NEXT: slli a4, a3, 3 -; RV64-NEXT: add a4, a0, a4 ; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: sd a2, 0(a4) -; RV64-NEXT: sd a2, 48(a0) -; RV64-NEXT: sd a3, 280(a0) +; RV64-NEXT: slli a4, a3, 3 +; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: sd a2, 0(a0) +; RV64-NEXT: sd a2, 48(a1) +; RV64-NEXT: sd a3, 280(a1) ; RV64-NEXT: ret entry: %add = add nsw i64 %a, 5 @@ -101,8 +101,8 @@ define void @test3(ptr nocapture noundef %array1, i64 noundef %a, i64 noundef %b ; RV64-NEXT: mv a5, a2 ; RV64-NEXT: .LBB3_2: # %entry ; RV64-NEXT: slli a2, a4, 3 -; RV64-NEXT: add a2, a0, a2 ; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a2, a0, a2 ; 
RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: sd a5, 0(a2) ; RV64-NEXT: sd a5, 48(a0) diff --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll index d907a37c2b3d1..634cca5dcdb71 100644 --- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll +++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll @@ -133,10 +133,10 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; RV32I-NEXT: or a3, a3, a6 ; RV32I-NEXT: .LBB2_3: ; RV32I-NEXT: srai a5, a5, 31 -; RV32I-NEXT: and a4, a5, a4 ; RV32I-NEXT: neg a7, a2 -; RV32I-NEXT: li a5, 32 -; RV32I-NEXT: sub a5, a5, a2 +; RV32I-NEXT: li a6, 32 +; RV32I-NEXT: and a4, a5, a4 +; RV32I-NEXT: sub a5, a6, a2 ; RV32I-NEXT: srl a6, a1, a7 ; RV32I-NEXT: bltz a5, .LBB2_5 ; RV32I-NEXT: # %bb.4: @@ -181,10 +181,10 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; RV32ZBB-NEXT: or a3, a3, a6 ; RV32ZBB-NEXT: .LBB2_3: ; RV32ZBB-NEXT: srai a5, a5, 31 -; RV32ZBB-NEXT: and a4, a5, a4 ; RV32ZBB-NEXT: neg a7, a2 -; RV32ZBB-NEXT: li a5, 32 -; RV32ZBB-NEXT: sub a5, a5, a2 +; RV32ZBB-NEXT: li a6, 32 +; RV32ZBB-NEXT: and a4, a5, a4 +; RV32ZBB-NEXT: sub a5, a6, a2 ; RV32ZBB-NEXT: srl a6, a1, a7 ; RV32ZBB-NEXT: bltz a5, .LBB2_5 ; RV32ZBB-NEXT: # %bb.4: @@ -226,10 +226,10 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; RV32XTHEADBB-NEXT: or a3, a3, a6 ; RV32XTHEADBB-NEXT: .LBB2_3: ; RV32XTHEADBB-NEXT: srai a5, a5, 31 -; RV32XTHEADBB-NEXT: and a4, a5, a4 ; RV32XTHEADBB-NEXT: neg a7, a2 -; RV32XTHEADBB-NEXT: li a5, 32 -; RV32XTHEADBB-NEXT: sub a5, a5, a2 +; RV32XTHEADBB-NEXT: li a6, 32 +; RV32XTHEADBB-NEXT: and a4, a5, a4 +; RV32XTHEADBB-NEXT: sub a5, a6, a2 ; RV32XTHEADBB-NEXT: srl a6, a1, a7 ; RV32XTHEADBB-NEXT: bltz a5, .LBB2_5 ; RV32XTHEADBB-NEXT: # %bb.4: @@ -281,10 +281,10 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; RV32I-NEXT: or a3, a3, a6 ; RV32I-NEXT: .LBB3_3: ; RV32I-NEXT: srai a5, a5, 31 -; RV32I-NEXT: and a4, a5, a4 ; RV32I-NEXT: neg a7, a2 -; RV32I-NEXT: li a5, 32 -; RV32I-NEXT: sub a5, a5, a2 +; RV32I-NEXT: li a6, 32 +; RV32I-NEXT: and a4, a5, a4 +; RV32I-NEXT: sub a5, a6, a2 ; RV32I-NEXT: sll a6, a0, a7 ; RV32I-NEXT: bltz a5, .LBB3_5 ; RV32I-NEXT: # %bb.4: @@ -329,10 +329,10 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; RV32ZBB-NEXT: or a3, a3, a6 ; RV32ZBB-NEXT: .LBB3_3: ; RV32ZBB-NEXT: srai a5, a5, 31 -; RV32ZBB-NEXT: and a4, a5, a4 ; RV32ZBB-NEXT: neg a7, a2 -; RV32ZBB-NEXT: li a5, 32 -; RV32ZBB-NEXT: sub a5, a5, a2 +; RV32ZBB-NEXT: li a6, 32 +; RV32ZBB-NEXT: and a4, a5, a4 +; RV32ZBB-NEXT: sub a5, a6, a2 ; RV32ZBB-NEXT: sll a6, a0, a7 ; RV32ZBB-NEXT: bltz a5, .LBB3_5 ; RV32ZBB-NEXT: # %bb.4: @@ -374,10 +374,10 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; RV32XTHEADBB-NEXT: or a3, a3, a6 ; RV32XTHEADBB-NEXT: .LBB3_3: ; RV32XTHEADBB-NEXT: srai a5, a5, 31 -; RV32XTHEADBB-NEXT: and a4, a5, a4 ; RV32XTHEADBB-NEXT: neg a7, a2 -; RV32XTHEADBB-NEXT: li a5, 32 -; RV32XTHEADBB-NEXT: sub a5, a5, a2 +; RV32XTHEADBB-NEXT: li a6, 32 +; RV32XTHEADBB-NEXT: and a4, a5, a4 +; RV32XTHEADBB-NEXT: sub a5, a6, a2 ; RV32XTHEADBB-NEXT: sll a6, a0, a7 ; RV32XTHEADBB-NEXT: bltz a5, .LBB3_5 ; RV32XTHEADBB-NEXT: # %bb.4: @@ -1442,45 +1442,45 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV32I-LABEL: rotl_64_mask_shared: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a5, a4, 26 -; RV32I-NEXT: srli a5, a5, 31 -; RV32I-NEXT: mv a7, a0 -; RV32I-NEXT: bnez a5, .LBB17_2 +; RV32I-NEXT: srli t0, a5, 31 +; RV32I-NEXT: mv a6, a0 +; RV32I-NEXT: bnez t0, .LBB17_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a7, a1 +; RV32I-NEXT: mv a6, a1 ; RV32I-NEXT: 
.LBB17_2: -; RV32I-NEXT: andi a6, a4, 63 -; RV32I-NEXT: sll t0, a7, a4 -; RV32I-NEXT: bnez a5, .LBB17_4 +; RV32I-NEXT: andi a5, a4, 63 +; RV32I-NEXT: sll a7, a6, a4 +; RV32I-NEXT: bnez t0, .LBB17_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: .LBB17_4: ; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: not t1, a4 -; RV32I-NEXT: srl a0, a0, t1 -; RV32I-NEXT: or a5, t0, a0 -; RV32I-NEXT: sll a1, a1, a4 -; RV32I-NEXT: srli a0, a7, 1 -; RV32I-NEXT: srl a7, a0, t1 -; RV32I-NEXT: addi a0, a6, -32 -; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: not t0, a4 +; RV32I-NEXT: sll t1, a1, a4 +; RV32I-NEXT: srli a1, a6, 1 +; RV32I-NEXT: srl a6, a0, t0 +; RV32I-NEXT: srl t0, a1, t0 +; RV32I-NEXT: addi a0, a5, -32 +; RV32I-NEXT: or a1, a7, a6 +; RV32I-NEXT: or a6, t1, t0 ; RV32I-NEXT: bltz a0, .LBB17_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: sll a3, a2, a6 +; RV32I-NEXT: sll a3, a2, a5 ; RV32I-NEXT: j .LBB17_7 ; RV32I-NEXT: .LBB17_6: ; RV32I-NEXT: sll a3, a3, a4 ; RV32I-NEXT: srli a7, a2, 1 -; RV32I-NEXT: not a6, a6 -; RV32I-NEXT: srl a6, a7, a6 -; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: not a5, a5 +; RV32I-NEXT: srl a5, a7, a5 +; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: .LBB17_7: ; RV32I-NEXT: sll a2, a2, a4 ; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: add a3, a5, a3 -; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add a0, a6, a0 +; RV32I-NEXT: sltu a2, a0, a6 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: rotl_64_mask_shared: @@ -1496,45 +1496,45 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV32ZBB-LABEL: rotl_64_mask_shared: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: slli a5, a4, 26 -; RV32ZBB-NEXT: srli a5, a5, 31 -; RV32ZBB-NEXT: mv a7, a0 -; RV32ZBB-NEXT: bnez a5, .LBB17_2 +; RV32ZBB-NEXT: srli t0, a5, 31 +; RV32ZBB-NEXT: mv a6, a0 +; RV32ZBB-NEXT: bnez t0, .LBB17_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: mv a7, a1 +; RV32ZBB-NEXT: mv a6, a1 ; RV32ZBB-NEXT: .LBB17_2: -; RV32ZBB-NEXT: andi a6, a4, 63 -; RV32ZBB-NEXT: sll t0, a7, a4 -; RV32ZBB-NEXT: bnez a5, .LBB17_4 +; RV32ZBB-NEXT: andi a5, a4, 63 +; RV32ZBB-NEXT: sll a7, a6, a4 +; RV32ZBB-NEXT: bnez t0, .LBB17_4 ; RV32ZBB-NEXT: # %bb.3: ; RV32ZBB-NEXT: mv a1, a0 ; RV32ZBB-NEXT: .LBB17_4: ; RV32ZBB-NEXT: srli a0, a1, 1 -; RV32ZBB-NEXT: not t1, a4 -; RV32ZBB-NEXT: srl a0, a0, t1 -; RV32ZBB-NEXT: or a5, t0, a0 -; RV32ZBB-NEXT: sll a1, a1, a4 -; RV32ZBB-NEXT: srli a0, a7, 1 -; RV32ZBB-NEXT: srl a7, a0, t1 -; RV32ZBB-NEXT: addi a0, a6, -32 -; RV32ZBB-NEXT: or a1, a1, a7 +; RV32ZBB-NEXT: not t0, a4 +; RV32ZBB-NEXT: sll t1, a1, a4 +; RV32ZBB-NEXT: srli a1, a6, 1 +; RV32ZBB-NEXT: srl a6, a0, t0 +; RV32ZBB-NEXT: srl t0, a1, t0 +; RV32ZBB-NEXT: addi a0, a5, -32 +; RV32ZBB-NEXT: or a1, a7, a6 +; RV32ZBB-NEXT: or a6, t1, t0 ; RV32ZBB-NEXT: bltz a0, .LBB17_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: sll a3, a2, a6 +; RV32ZBB-NEXT: sll a3, a2, a5 ; RV32ZBB-NEXT: j .LBB17_7 ; RV32ZBB-NEXT: .LBB17_6: ; RV32ZBB-NEXT: sll a3, a3, a4 ; RV32ZBB-NEXT: srli a7, a2, 1 -; RV32ZBB-NEXT: not a6, a6 -; RV32ZBB-NEXT: srl a6, a7, a6 -; RV32ZBB-NEXT: or a3, a3, a6 +; RV32ZBB-NEXT: not a5, a5 +; RV32ZBB-NEXT: srl a5, a7, a5 +; RV32ZBB-NEXT: or a3, a3, a5 ; RV32ZBB-NEXT: .LBB17_7: ; RV32ZBB-NEXT: sll a2, a2, a4 ; RV32ZBB-NEXT: srai a0, a0, 31 ; RV32ZBB-NEXT: and a0, a0, a2 -; RV32ZBB-NEXT: add a0, a1, a0 -; RV32ZBB-NEXT: sltu a1, a0, a1 -; RV32ZBB-NEXT: add a3, a5, a3 -; RV32ZBB-NEXT: add a1, a3, a1 +; 
RV32ZBB-NEXT: add a0, a6, a0 +; RV32ZBB-NEXT: sltu a2, a0, a6 +; RV32ZBB-NEXT: add a1, a1, a3 +; RV32ZBB-NEXT: add a1, a1, a2 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: rotl_64_mask_shared: @@ -1546,45 +1546,45 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; ; RV32XTHEADBB-LABEL: rotl_64_mask_shared: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: th.extu a5, a4, 5, 5 -; RV32XTHEADBB-NEXT: mv a7, a0 -; RV32XTHEADBB-NEXT: bnez a5, .LBB17_2 +; RV32XTHEADBB-NEXT: th.extu t0, a4, 5, 5 +; RV32XTHEADBB-NEXT: mv a6, a0 +; RV32XTHEADBB-NEXT: bnez t0, .LBB17_2 ; RV32XTHEADBB-NEXT: # %bb.1: -; RV32XTHEADBB-NEXT: mv a7, a1 +; RV32XTHEADBB-NEXT: mv a6, a1 ; RV32XTHEADBB-NEXT: .LBB17_2: -; RV32XTHEADBB-NEXT: andi a6, a4, 63 -; RV32XTHEADBB-NEXT: sll t0, a7, a4 -; RV32XTHEADBB-NEXT: bnez a5, .LBB17_4 +; RV32XTHEADBB-NEXT: andi a5, a4, 63 +; RV32XTHEADBB-NEXT: sll a7, a6, a4 +; RV32XTHEADBB-NEXT: bnez t0, .LBB17_4 ; RV32XTHEADBB-NEXT: # %bb.3: ; RV32XTHEADBB-NEXT: mv a1, a0 ; RV32XTHEADBB-NEXT: .LBB17_4: ; RV32XTHEADBB-NEXT: srli a0, a1, 1 -; RV32XTHEADBB-NEXT: not t1, a4 -; RV32XTHEADBB-NEXT: srl a0, a0, t1 -; RV32XTHEADBB-NEXT: or a5, t0, a0 -; RV32XTHEADBB-NEXT: sll a1, a1, a4 -; RV32XTHEADBB-NEXT: srli a0, a7, 1 -; RV32XTHEADBB-NEXT: srl a7, a0, t1 -; RV32XTHEADBB-NEXT: addi a0, a6, -32 -; RV32XTHEADBB-NEXT: or a1, a1, a7 +; RV32XTHEADBB-NEXT: not t0, a4 +; RV32XTHEADBB-NEXT: sll t1, a1, a4 +; RV32XTHEADBB-NEXT: srli a1, a6, 1 +; RV32XTHEADBB-NEXT: srl a6, a0, t0 +; RV32XTHEADBB-NEXT: srl t0, a1, t0 +; RV32XTHEADBB-NEXT: addi a0, a5, -32 +; RV32XTHEADBB-NEXT: or a1, a7, a6 +; RV32XTHEADBB-NEXT: or a6, t1, t0 ; RV32XTHEADBB-NEXT: bltz a0, .LBB17_6 ; RV32XTHEADBB-NEXT: # %bb.5: -; RV32XTHEADBB-NEXT: sll a3, a2, a6 +; RV32XTHEADBB-NEXT: sll a3, a2, a5 ; RV32XTHEADBB-NEXT: j .LBB17_7 ; RV32XTHEADBB-NEXT: .LBB17_6: ; RV32XTHEADBB-NEXT: sll a3, a3, a4 ; RV32XTHEADBB-NEXT: srli a7, a2, 1 -; RV32XTHEADBB-NEXT: not a6, a6 -; RV32XTHEADBB-NEXT: srl a6, a7, a6 -; RV32XTHEADBB-NEXT: or a3, a3, a6 +; RV32XTHEADBB-NEXT: not a5, a5 +; RV32XTHEADBB-NEXT: srl a5, a7, a5 +; RV32XTHEADBB-NEXT: or a3, a3, a5 ; RV32XTHEADBB-NEXT: .LBB17_7: ; RV32XTHEADBB-NEXT: sll a2, a2, a4 ; RV32XTHEADBB-NEXT: srai a0, a0, 31 ; RV32XTHEADBB-NEXT: and a0, a0, a2 -; RV32XTHEADBB-NEXT: add a0, a1, a0 -; RV32XTHEADBB-NEXT: sltu a1, a0, a1 -; RV32XTHEADBB-NEXT: add a3, a5, a3 -; RV32XTHEADBB-NEXT: add a1, a3, a1 +; RV32XTHEADBB-NEXT: add a0, a6, a0 +; RV32XTHEADBB-NEXT: sltu a2, a0, a6 +; RV32XTHEADBB-NEXT: add a1, a1, a3 +; RV32XTHEADBB-NEXT: add a1, a1, a2 ; RV32XTHEADBB-NEXT: ret ; ; RV64XTHEADBB-LABEL: rotl_64_mask_shared: @@ -1669,27 +1669,27 @@ declare i32 @llvm.fshr.i32(i32, i32, i32) define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 signext %amt) nounwind { ; RV32I-LABEL: rotr_64_mask_shared: ; RV32I: # %bb.0: -; RV32I-NEXT: andi a7, a4, 32 +; RV32I-NEXT: andi t0, a4, 32 ; RV32I-NEXT: mv a6, a1 -; RV32I-NEXT: beqz a7, .LBB19_2 +; RV32I-NEXT: beqz t0, .LBB19_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a6, a0 ; RV32I-NEXT: .LBB19_2: ; RV32I-NEXT: andi a5, a4, 63 -; RV32I-NEXT: srl t0, a6, a4 -; RV32I-NEXT: beqz a7, .LBB19_4 +; RV32I-NEXT: srl a7, a6, a4 +; RV32I-NEXT: beqz t0, .LBB19_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: .LBB19_4: ; RV32I-NEXT: slli a1, a0, 1 -; RV32I-NEXT: not a7, a4 -; RV32I-NEXT: sll a1, a1, a7 -; RV32I-NEXT: or a1, a1, t0 -; RV32I-NEXT: srl t0, a0, a4 +; RV32I-NEXT: not t0, a4 +; RV32I-NEXT: srl t1, a0, a4 ; RV32I-NEXT: slli a6, a6, 
1 -; RV32I-NEXT: sll a6, a6, a7 +; RV32I-NEXT: sll a1, a1, t0 +; RV32I-NEXT: sll a6, a6, t0 ; RV32I-NEXT: addi a0, a5, -32 -; RV32I-NEXT: or a6, a6, t0 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a6, a6, t1 ; RV32I-NEXT: bltz a0, .LBB19_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: sll a3, a2, a5 @@ -1722,27 +1722,27 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; ; RV32ZBB-LABEL: rotr_64_mask_shared: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: andi a7, a4, 32 +; RV32ZBB-NEXT: andi t0, a4, 32 ; RV32ZBB-NEXT: mv a6, a1 -; RV32ZBB-NEXT: beqz a7, .LBB19_2 +; RV32ZBB-NEXT: beqz t0, .LBB19_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: mv a6, a0 ; RV32ZBB-NEXT: .LBB19_2: ; RV32ZBB-NEXT: andi a5, a4, 63 -; RV32ZBB-NEXT: srl t0, a6, a4 -; RV32ZBB-NEXT: beqz a7, .LBB19_4 +; RV32ZBB-NEXT: srl a7, a6, a4 +; RV32ZBB-NEXT: beqz t0, .LBB19_4 ; RV32ZBB-NEXT: # %bb.3: ; RV32ZBB-NEXT: mv a0, a1 ; RV32ZBB-NEXT: .LBB19_4: ; RV32ZBB-NEXT: slli a1, a0, 1 -; RV32ZBB-NEXT: not a7, a4 -; RV32ZBB-NEXT: sll a1, a1, a7 -; RV32ZBB-NEXT: or a1, a1, t0 -; RV32ZBB-NEXT: srl t0, a0, a4 +; RV32ZBB-NEXT: not t0, a4 +; RV32ZBB-NEXT: srl t1, a0, a4 ; RV32ZBB-NEXT: slli a6, a6, 1 -; RV32ZBB-NEXT: sll a6, a6, a7 +; RV32ZBB-NEXT: sll a1, a1, t0 +; RV32ZBB-NEXT: sll a6, a6, t0 ; RV32ZBB-NEXT: addi a0, a5, -32 -; RV32ZBB-NEXT: or a6, a6, t0 +; RV32ZBB-NEXT: or a1, a1, a7 +; RV32ZBB-NEXT: or a6, a6, t1 ; RV32ZBB-NEXT: bltz a0, .LBB19_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: sll a3, a2, a5 @@ -1772,27 +1772,27 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; ; RV32XTHEADBB-LABEL: rotr_64_mask_shared: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: andi a7, a4, 32 +; RV32XTHEADBB-NEXT: andi t0, a4, 32 ; RV32XTHEADBB-NEXT: mv a6, a1 -; RV32XTHEADBB-NEXT: beqz a7, .LBB19_2 +; RV32XTHEADBB-NEXT: beqz t0, .LBB19_2 ; RV32XTHEADBB-NEXT: # %bb.1: ; RV32XTHEADBB-NEXT: mv a6, a0 ; RV32XTHEADBB-NEXT: .LBB19_2: ; RV32XTHEADBB-NEXT: andi a5, a4, 63 -; RV32XTHEADBB-NEXT: srl t0, a6, a4 -; RV32XTHEADBB-NEXT: beqz a7, .LBB19_4 +; RV32XTHEADBB-NEXT: srl a7, a6, a4 +; RV32XTHEADBB-NEXT: beqz t0, .LBB19_4 ; RV32XTHEADBB-NEXT: # %bb.3: ; RV32XTHEADBB-NEXT: mv a0, a1 ; RV32XTHEADBB-NEXT: .LBB19_4: ; RV32XTHEADBB-NEXT: slli a1, a0, 1 -; RV32XTHEADBB-NEXT: not a7, a4 -; RV32XTHEADBB-NEXT: sll a1, a1, a7 -; RV32XTHEADBB-NEXT: or a1, a1, t0 -; RV32XTHEADBB-NEXT: srl t0, a0, a4 +; RV32XTHEADBB-NEXT: not t0, a4 +; RV32XTHEADBB-NEXT: srl t1, a0, a4 ; RV32XTHEADBB-NEXT: slli a6, a6, 1 -; RV32XTHEADBB-NEXT: sll a6, a6, a7 +; RV32XTHEADBB-NEXT: sll a1, a1, t0 +; RV32XTHEADBB-NEXT: sll a6, a6, t0 ; RV32XTHEADBB-NEXT: addi a0, a5, -32 -; RV32XTHEADBB-NEXT: or a6, a6, t0 +; RV32XTHEADBB-NEXT: or a1, a1, a7 +; RV32XTHEADBB-NEXT: or a6, a6, t1 ; RV32XTHEADBB-NEXT: bltz a0, .LBB19_6 ; RV32XTHEADBB-NEXT: # %bb.5: ; RV32XTHEADBB-NEXT: sll a3, a2, a5 @@ -1835,10 +1835,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV32I: # %bb.0: ; RV32I-NEXT: sll a3, a0, a2 ; RV32I-NEXT: neg a4, a2 -; RV32I-NEXT: srl a0, a0, a4 -; RV32I-NEXT: or a0, a3, a0 ; RV32I-NEXT: sll a2, a1, a2 +; RV32I-NEXT: srl a0, a0, a4 ; RV32I-NEXT: srl a1, a1, a4 +; RV32I-NEXT: or a0, a3, a0 ; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret @@ -1847,10 +1847,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64I: # %bb.0: ; RV64I-NEXT: sllw a3, a0, a2 ; RV64I-NEXT: negw a4, a2 -; RV64I-NEXT: srlw a0, a0, a4 -; RV64I-NEXT: or a0, a3, a0 ; 
RV64I-NEXT: sllw a2, a1, a2 +; RV64I-NEXT: srlw a0, a0, a4 ; RV64I-NEXT: srlw a1, a1, a4 +; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: addw a0, a0, a1 ; RV64I-NEXT: ret @@ -1873,10 +1873,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: sll a3, a0, a2 ; RV32XTHEADBB-NEXT: neg a4, a2 -; RV32XTHEADBB-NEXT: srl a0, a0, a4 -; RV32XTHEADBB-NEXT: or a0, a3, a0 ; RV32XTHEADBB-NEXT: sll a2, a1, a2 +; RV32XTHEADBB-NEXT: srl a0, a0, a4 ; RV32XTHEADBB-NEXT: srl a1, a1, a4 +; RV32XTHEADBB-NEXT: or a0, a3, a0 ; RV32XTHEADBB-NEXT: or a1, a2, a1 ; RV32XTHEADBB-NEXT: add a0, a0, a1 ; RV32XTHEADBB-NEXT: ret @@ -1885,10 +1885,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sllw a3, a0, a2 ; RV64XTHEADBB-NEXT: negw a4, a2 -; RV64XTHEADBB-NEXT: srlw a0, a0, a4 -; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: sllw a2, a1, a2 +; RV64XTHEADBB-NEXT: srlw a0, a0, a4 ; RV64XTHEADBB-NEXT: srlw a1, a1, a4 +; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: or a1, a2, a1 ; RV64XTHEADBB-NEXT: addw a0, a0, a1 ; RV64XTHEADBB-NEXT: ret @@ -1914,45 +1914,45 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: .LBB21_4: ; RV32I-NEXT: sll a7, a6, a4 -; RV32I-NEXT: srli t0, a0, 1 +; RV32I-NEXT: srli t1, a0, 1 ; RV32I-NEXT: not a1, a4 -; RV32I-NEXT: srl t0, t0, a1 -; RV32I-NEXT: sll t1, a0, a4 +; RV32I-NEXT: sll t0, a0, a4 ; RV32I-NEXT: srli a0, a6, 1 -; RV32I-NEXT: srl t2, a0, a1 +; RV32I-NEXT: srl a6, t1, a1 +; RV32I-NEXT: srl t1, a0, a1 ; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: bnez a5, .LBB21_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: .LBB21_6: -; RV32I-NEXT: or a6, a7, t0 -; RV32I-NEXT: or a7, t1, t2 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a7, t0, t1 ; RV32I-NEXT: sll t0, a0, a4 ; RV32I-NEXT: bnez a5, .LBB21_8 ; RV32I-NEXT: # %bb.7: ; RV32I-NEXT: mv a2, a3 ; RV32I-NEXT: .LBB21_8: ; RV32I-NEXT: srli a3, a2, 1 -; RV32I-NEXT: srl a3, a3, a1 -; RV32I-NEXT: or a3, t0, a3 ; RV32I-NEXT: sll a2, a2, a4 ; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: srl a3, a3, a1 ; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: or a1, t0, a3 ; RV32I-NEXT: or a0, a2, a0 -; RV32I-NEXT: add a1, a7, a0 -; RV32I-NEXT: add a0, a6, a3 -; RV32I-NEXT: sltu a2, a0, a6 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: add a7, a7, a0 +; RV32I-NEXT: add a0, a6, a1 +; RV32I-NEXT: sltu a1, a0, a6 +; RV32I-NEXT: add a1, a7, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: rotl_64_mask_multiple: ; RV64I: # %bb.0: ; RV64I-NEXT: sll a3, a0, a2 ; RV64I-NEXT: negw a4, a2 -; RV64I-NEXT: srl a0, a0, a4 -; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: sll a2, a1, a2 +; RV64I-NEXT: srl a0, a0, a4 ; RV64I-NEXT: srl a1, a1, a4 +; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret @@ -1971,35 +1971,35 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV32ZBB-NEXT: mv a0, a1 ; RV32ZBB-NEXT: .LBB21_4: ; RV32ZBB-NEXT: sll a7, a6, a4 -; RV32ZBB-NEXT: srli t0, a0, 1 +; RV32ZBB-NEXT: srli t1, a0, 1 ; RV32ZBB-NEXT: not a1, a4 -; RV32ZBB-NEXT: srl t0, t0, a1 -; RV32ZBB-NEXT: sll t1, a0, a4 +; RV32ZBB-NEXT: sll t0, a0, a4 ; RV32ZBB-NEXT: srli a0, a6, 1 -; RV32ZBB-NEXT: srl t2, a0, a1 +; RV32ZBB-NEXT: srl a6, t1, a1 +; RV32ZBB-NEXT: srl t1, a0, a1 ; RV32ZBB-NEXT: mv a0, a3 ; RV32ZBB-NEXT: bnez a5, .LBB21_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: mv a0, a2 
; RV32ZBB-NEXT: .LBB21_6: -; RV32ZBB-NEXT: or a6, a7, t0 -; RV32ZBB-NEXT: or a7, t1, t2 +; RV32ZBB-NEXT: or a6, a7, a6 +; RV32ZBB-NEXT: or a7, t0, t1 ; RV32ZBB-NEXT: sll t0, a0, a4 ; RV32ZBB-NEXT: bnez a5, .LBB21_8 ; RV32ZBB-NEXT: # %bb.7: ; RV32ZBB-NEXT: mv a2, a3 ; RV32ZBB-NEXT: .LBB21_8: ; RV32ZBB-NEXT: srli a3, a2, 1 -; RV32ZBB-NEXT: srl a3, a3, a1 -; RV32ZBB-NEXT: or a3, t0, a3 ; RV32ZBB-NEXT: sll a2, a2, a4 ; RV32ZBB-NEXT: srli a0, a0, 1 +; RV32ZBB-NEXT: srl a3, a3, a1 ; RV32ZBB-NEXT: srl a0, a0, a1 +; RV32ZBB-NEXT: or a1, t0, a3 ; RV32ZBB-NEXT: or a0, a2, a0 -; RV32ZBB-NEXT: add a1, a7, a0 -; RV32ZBB-NEXT: add a0, a6, a3 -; RV32ZBB-NEXT: sltu a2, a0, a6 -; RV32ZBB-NEXT: add a1, a1, a2 +; RV32ZBB-NEXT: add a7, a7, a0 +; RV32ZBB-NEXT: add a0, a6, a1 +; RV32ZBB-NEXT: sltu a1, a0, a6 +; RV32ZBB-NEXT: add a1, a7, a1 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: rotl_64_mask_multiple: @@ -2022,45 +2022,45 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV32XTHEADBB-NEXT: mv a0, a1 ; RV32XTHEADBB-NEXT: .LBB21_4: ; RV32XTHEADBB-NEXT: sll a7, a6, a4 -; RV32XTHEADBB-NEXT: srli t0, a0, 1 +; RV32XTHEADBB-NEXT: srli t1, a0, 1 ; RV32XTHEADBB-NEXT: not a1, a4 -; RV32XTHEADBB-NEXT: srl t0, t0, a1 -; RV32XTHEADBB-NEXT: sll t1, a0, a4 +; RV32XTHEADBB-NEXT: sll t0, a0, a4 ; RV32XTHEADBB-NEXT: srli a0, a6, 1 -; RV32XTHEADBB-NEXT: srl t2, a0, a1 +; RV32XTHEADBB-NEXT: srl a6, t1, a1 +; RV32XTHEADBB-NEXT: srl t1, a0, a1 ; RV32XTHEADBB-NEXT: mv a0, a3 ; RV32XTHEADBB-NEXT: bnez a5, .LBB21_6 ; RV32XTHEADBB-NEXT: # %bb.5: ; RV32XTHEADBB-NEXT: mv a0, a2 ; RV32XTHEADBB-NEXT: .LBB21_6: -; RV32XTHEADBB-NEXT: or a6, a7, t0 -; RV32XTHEADBB-NEXT: or a7, t1, t2 +; RV32XTHEADBB-NEXT: or a6, a7, a6 +; RV32XTHEADBB-NEXT: or a7, t0, t1 ; RV32XTHEADBB-NEXT: sll t0, a0, a4 ; RV32XTHEADBB-NEXT: bnez a5, .LBB21_8 ; RV32XTHEADBB-NEXT: # %bb.7: ; RV32XTHEADBB-NEXT: mv a2, a3 ; RV32XTHEADBB-NEXT: .LBB21_8: ; RV32XTHEADBB-NEXT: srli a3, a2, 1 -; RV32XTHEADBB-NEXT: srl a3, a3, a1 -; RV32XTHEADBB-NEXT: or a3, t0, a3 ; RV32XTHEADBB-NEXT: sll a2, a2, a4 ; RV32XTHEADBB-NEXT: srli a0, a0, 1 +; RV32XTHEADBB-NEXT: srl a3, a3, a1 ; RV32XTHEADBB-NEXT: srl a0, a0, a1 +; RV32XTHEADBB-NEXT: or a1, t0, a3 ; RV32XTHEADBB-NEXT: or a0, a2, a0 -; RV32XTHEADBB-NEXT: add a1, a7, a0 -; RV32XTHEADBB-NEXT: add a0, a6, a3 -; RV32XTHEADBB-NEXT: sltu a2, a0, a6 -; RV32XTHEADBB-NEXT: add a1, a1, a2 +; RV32XTHEADBB-NEXT: add a7, a7, a0 +; RV32XTHEADBB-NEXT: add a0, a6, a1 +; RV32XTHEADBB-NEXT: sltu a1, a0, a6 +; RV32XTHEADBB-NEXT: add a1, a7, a1 ; RV32XTHEADBB-NEXT: ret ; ; RV64XTHEADBB-LABEL: rotl_64_mask_multiple: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sll a3, a0, a2 ; RV64XTHEADBB-NEXT: negw a4, a2 -; RV64XTHEADBB-NEXT: srl a0, a0, a4 -; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: sll a2, a1, a2 +; RV64XTHEADBB-NEXT: srl a0, a0, a4 ; RV64XTHEADBB-NEXT: srl a1, a1, a4 +; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: or a1, a2, a1 ; RV64XTHEADBB-NEXT: add a0, a0, a1 ; RV64XTHEADBB-NEXT: ret @@ -2076,10 +2076,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV32I: # %bb.0: ; RV32I-NEXT: srl a3, a0, a2 ; RV32I-NEXT: neg a4, a2 -; RV32I-NEXT: sll a0, a0, a4 -; RV32I-NEXT: or a0, a3, a0 ; RV32I-NEXT: srl a2, a1, a2 +; RV32I-NEXT: sll a0, a0, a4 ; RV32I-NEXT: sll a1, a1, a4 +; RV32I-NEXT: or a0, a3, a0 ; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret @@ -2088,10 +2088,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 
si ; RV64I: # %bb.0: ; RV64I-NEXT: srlw a3, a0, a2 ; RV64I-NEXT: negw a4, a2 -; RV64I-NEXT: sllw a0, a0, a4 -; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: srlw a2, a1, a2 +; RV64I-NEXT: sllw a0, a0, a4 ; RV64I-NEXT: sllw a1, a1, a4 +; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: addw a0, a0, a1 ; RV64I-NEXT: ret @@ -2114,10 +2114,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: srl a3, a0, a2 ; RV32XTHEADBB-NEXT: neg a4, a2 -; RV32XTHEADBB-NEXT: sll a0, a0, a4 -; RV32XTHEADBB-NEXT: or a0, a3, a0 ; RV32XTHEADBB-NEXT: srl a2, a1, a2 +; RV32XTHEADBB-NEXT: sll a0, a0, a4 ; RV32XTHEADBB-NEXT: sll a1, a1, a4 +; RV32XTHEADBB-NEXT: or a0, a3, a0 ; RV32XTHEADBB-NEXT: or a1, a2, a1 ; RV32XTHEADBB-NEXT: add a0, a0, a1 ; RV32XTHEADBB-NEXT: ret @@ -2126,10 +2126,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srlw a3, a0, a2 ; RV64XTHEADBB-NEXT: negw a4, a2 -; RV64XTHEADBB-NEXT: sllw a0, a0, a4 -; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: srlw a2, a1, a2 +; RV64XTHEADBB-NEXT: sllw a0, a0, a4 ; RV64XTHEADBB-NEXT: sllw a1, a1, a4 +; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: or a1, a2, a1 ; RV64XTHEADBB-NEXT: addw a0, a0, a1 ; RV64XTHEADBB-NEXT: ret @@ -2154,30 +2154,30 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: .LBB23_4: ; RV32I-NEXT: srl a7, a6, a4 -; RV32I-NEXT: slli t0, a1, 1 +; RV32I-NEXT: slli t1, a1, 1 ; RV32I-NEXT: not a0, a4 -; RV32I-NEXT: sll t0, t0, a0 -; RV32I-NEXT: srl t1, a1, a4 +; RV32I-NEXT: srl t0, a1, a4 ; RV32I-NEXT: slli a6, a6, 1 -; RV32I-NEXT: sll t2, a6, a0 +; RV32I-NEXT: sll a1, t1, a0 +; RV32I-NEXT: sll t1, a6, a0 ; RV32I-NEXT: mv a6, a2 ; RV32I-NEXT: beqz a5, .LBB23_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: mv a6, a3 ; RV32I-NEXT: .LBB23_6: -; RV32I-NEXT: or a1, t0, a7 -; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a7, t1, t0 ; RV32I-NEXT: srl t0, a6, a4 ; RV32I-NEXT: beqz a5, .LBB23_8 ; RV32I-NEXT: # %bb.7: ; RV32I-NEXT: mv a3, a2 ; RV32I-NEXT: .LBB23_8: ; RV32I-NEXT: slli a2, a3, 1 -; RV32I-NEXT: sll a2, a2, a0 -; RV32I-NEXT: or a2, a2, t0 ; RV32I-NEXT: srl a3, a3, a4 ; RV32I-NEXT: slli a6, a6, 1 +; RV32I-NEXT: sll a2, a2, a0 ; RV32I-NEXT: sll a0, a6, a0 +; RV32I-NEXT: or a2, a2, t0 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: add a7, a7, a0 ; RV32I-NEXT: add a0, a1, a2 @@ -2189,10 +2189,10 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: srl a3, a0, a2 ; RV64I-NEXT: negw a4, a2 -; RV64I-NEXT: sll a0, a0, a4 -; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: srl a2, a1, a2 +; RV64I-NEXT: sll a0, a0, a4 ; RV64I-NEXT: sll a1, a1, a4 +; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret @@ -2210,30 +2210,30 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV32ZBB-NEXT: mv a1, a0 ; RV32ZBB-NEXT: .LBB23_4: ; RV32ZBB-NEXT: srl a7, a6, a4 -; RV32ZBB-NEXT: slli t0, a1, 1 +; RV32ZBB-NEXT: slli t1, a1, 1 ; RV32ZBB-NEXT: not a0, a4 -; RV32ZBB-NEXT: sll t0, t0, a0 -; RV32ZBB-NEXT: srl t1, a1, a4 +; RV32ZBB-NEXT: srl t0, a1, a4 ; RV32ZBB-NEXT: slli a6, a6, 1 -; RV32ZBB-NEXT: sll t2, a6, a0 +; RV32ZBB-NEXT: sll a1, t1, a0 +; RV32ZBB-NEXT: sll t1, a6, a0 ; RV32ZBB-NEXT: mv a6, a2 ; RV32ZBB-NEXT: beqz a5, .LBB23_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: mv a6, a3 ; 
RV32ZBB-NEXT: .LBB23_6: -; RV32ZBB-NEXT: or a1, t0, a7 -; RV32ZBB-NEXT: or a7, t2, t1 +; RV32ZBB-NEXT: or a1, a1, a7 +; RV32ZBB-NEXT: or a7, t1, t0 ; RV32ZBB-NEXT: srl t0, a6, a4 ; RV32ZBB-NEXT: beqz a5, .LBB23_8 ; RV32ZBB-NEXT: # %bb.7: ; RV32ZBB-NEXT: mv a3, a2 ; RV32ZBB-NEXT: .LBB23_8: ; RV32ZBB-NEXT: slli a2, a3, 1 -; RV32ZBB-NEXT: sll a2, a2, a0 -; RV32ZBB-NEXT: or a2, a2, t0 ; RV32ZBB-NEXT: srl a3, a3, a4 ; RV32ZBB-NEXT: slli a6, a6, 1 +; RV32ZBB-NEXT: sll a2, a2, a0 ; RV32ZBB-NEXT: sll a0, a6, a0 +; RV32ZBB-NEXT: or a2, a2, t0 ; RV32ZBB-NEXT: or a0, a0, a3 ; RV32ZBB-NEXT: add a7, a7, a0 ; RV32ZBB-NEXT: add a0, a1, a2 @@ -2261,30 +2261,30 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV32XTHEADBB-NEXT: mv a1, a0 ; RV32XTHEADBB-NEXT: .LBB23_4: ; RV32XTHEADBB-NEXT: srl a7, a6, a4 -; RV32XTHEADBB-NEXT: slli t0, a1, 1 +; RV32XTHEADBB-NEXT: slli t1, a1, 1 ; RV32XTHEADBB-NEXT: not a0, a4 -; RV32XTHEADBB-NEXT: sll t0, t0, a0 -; RV32XTHEADBB-NEXT: srl t1, a1, a4 +; RV32XTHEADBB-NEXT: srl t0, a1, a4 ; RV32XTHEADBB-NEXT: slli a6, a6, 1 -; RV32XTHEADBB-NEXT: sll t2, a6, a0 +; RV32XTHEADBB-NEXT: sll a1, t1, a0 +; RV32XTHEADBB-NEXT: sll t1, a6, a0 ; RV32XTHEADBB-NEXT: mv a6, a2 ; RV32XTHEADBB-NEXT: beqz a5, .LBB23_6 ; RV32XTHEADBB-NEXT: # %bb.5: ; RV32XTHEADBB-NEXT: mv a6, a3 ; RV32XTHEADBB-NEXT: .LBB23_6: -; RV32XTHEADBB-NEXT: or a1, t0, a7 -; RV32XTHEADBB-NEXT: or a7, t2, t1 +; RV32XTHEADBB-NEXT: or a1, a1, a7 +; RV32XTHEADBB-NEXT: or a7, t1, t0 ; RV32XTHEADBB-NEXT: srl t0, a6, a4 ; RV32XTHEADBB-NEXT: beqz a5, .LBB23_8 ; RV32XTHEADBB-NEXT: # %bb.7: ; RV32XTHEADBB-NEXT: mv a3, a2 ; RV32XTHEADBB-NEXT: .LBB23_8: ; RV32XTHEADBB-NEXT: slli a2, a3, 1 -; RV32XTHEADBB-NEXT: sll a2, a2, a0 -; RV32XTHEADBB-NEXT: or a2, a2, t0 ; RV32XTHEADBB-NEXT: srl a3, a3, a4 ; RV32XTHEADBB-NEXT: slli a6, a6, 1 +; RV32XTHEADBB-NEXT: sll a2, a2, a0 ; RV32XTHEADBB-NEXT: sll a0, a6, a0 +; RV32XTHEADBB-NEXT: or a2, a2, t0 ; RV32XTHEADBB-NEXT: or a0, a0, a3 ; RV32XTHEADBB-NEXT: add a7, a7, a0 ; RV32XTHEADBB-NEXT: add a0, a1, a2 @@ -2296,10 +2296,10 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srl a3, a0, a2 ; RV64XTHEADBB-NEXT: negw a4, a2 -; RV64XTHEADBB-NEXT: sll a0, a0, a4 -; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: srl a2, a1, a2 +; RV64XTHEADBB-NEXT: sll a0, a0, a4 ; RV64XTHEADBB-NEXT: sll a1, a1, a4 +; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: or a1, a2, a1 ; RV64XTHEADBB-NEXT: add a0, a0, a1 ; RV64XTHEADBB-NEXT: ret @@ -2328,9 +2328,9 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV32I-NEXT: or a3, a3, a7 ; RV32I-NEXT: .LBB24_3: ; RV32I-NEXT: srai a6, a6, 31 +; RV32I-NEXT: li a7, 32 ; RV32I-NEXT: and a5, a6, a5 -; RV32I-NEXT: li a6, 32 -; RV32I-NEXT: sub a6, a6, a2 +; RV32I-NEXT: sub a6, a7, a2 ; RV32I-NEXT: srl a7, a1, a4 ; RV32I-NEXT: bltz a6, .LBB24_5 ; RV32I-NEXT: # %bb.4: @@ -2338,8 +2338,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV32I-NEXT: j .LBB24_6 ; RV32I-NEXT: .LBB24_5: ; RV32I-NEXT: li t0, 64 -; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: srl a0, a0, a4 +; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: not a2, a2 ; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: sll a1, a1, a2 @@ -2376,9 +2376,9 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV32ZBB-NEXT: or a3, a3, a7 ; RV32ZBB-NEXT: .LBB24_3: ; RV32ZBB-NEXT: srai a6, a6, 31 +; RV32ZBB-NEXT: li a7, 32 ; RV32ZBB-NEXT: and a5, a6, a5 -; RV32ZBB-NEXT: li a6, 32 -; RV32ZBB-NEXT: sub a6, a6, a2 +; 
RV32ZBB-NEXT: sub a6, a7, a2 ; RV32ZBB-NEXT: srl a7, a1, a4 ; RV32ZBB-NEXT: bltz a6, .LBB24_5 ; RV32ZBB-NEXT: # %bb.4: @@ -2386,8 +2386,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV32ZBB-NEXT: j .LBB24_6 ; RV32ZBB-NEXT: .LBB24_5: ; RV32ZBB-NEXT: li t0, 64 -; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: srl a0, a0, a4 +; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: not a2, a2 ; RV32ZBB-NEXT: slli a1, a1, 1 ; RV32ZBB-NEXT: sll a1, a1, a2 @@ -2421,9 +2421,9 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV32XTHEADBB-NEXT: or a3, a3, a7 ; RV32XTHEADBB-NEXT: .LBB24_3: ; RV32XTHEADBB-NEXT: srai a6, a6, 31 +; RV32XTHEADBB-NEXT: li a7, 32 ; RV32XTHEADBB-NEXT: and a5, a6, a5 -; RV32XTHEADBB-NEXT: li a6, 32 -; RV32XTHEADBB-NEXT: sub a6, a6, a2 +; RV32XTHEADBB-NEXT: sub a6, a7, a2 ; RV32XTHEADBB-NEXT: srl a7, a1, a4 ; RV32XTHEADBB-NEXT: bltz a6, .LBB24_5 ; RV32XTHEADBB-NEXT: # %bb.4: @@ -2431,8 +2431,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV32XTHEADBB-NEXT: j .LBB24_6 ; RV32XTHEADBB-NEXT: .LBB24_5: ; RV32XTHEADBB-NEXT: li t0, 64 -; RV32XTHEADBB-NEXT: sub a2, t0, a2 ; RV32XTHEADBB-NEXT: srl a0, a0, a4 +; RV32XTHEADBB-NEXT: sub a2, t0, a2 ; RV32XTHEADBB-NEXT: not a2, a2 ; RV32XTHEADBB-NEXT: slli a1, a1, 1 ; RV32XTHEADBB-NEXT: sll a1, a1, a2 @@ -2478,9 +2478,9 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV32I-NEXT: or a3, a3, a7 ; RV32I-NEXT: .LBB25_3: ; RV32I-NEXT: srai a6, a6, 31 +; RV32I-NEXT: li a7, 32 ; RV32I-NEXT: and a5, a6, a5 -; RV32I-NEXT: li a6, 32 -; RV32I-NEXT: sub a6, a6, a2 +; RV32I-NEXT: sub a6, a7, a2 ; RV32I-NEXT: sll a7, a0, a4 ; RV32I-NEXT: bltz a6, .LBB25_5 ; RV32I-NEXT: # %bb.4: @@ -2488,8 +2488,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV32I-NEXT: j .LBB25_6 ; RV32I-NEXT: .LBB25_5: ; RV32I-NEXT: li t0, 64 -; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: sll a1, a1, a4 +; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: not a2, a2 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: srl a0, a0, a2 @@ -2526,9 +2526,9 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV32ZBB-NEXT: or a3, a3, a7 ; RV32ZBB-NEXT: .LBB25_3: ; RV32ZBB-NEXT: srai a6, a6, 31 +; RV32ZBB-NEXT: li a7, 32 ; RV32ZBB-NEXT: and a5, a6, a5 -; RV32ZBB-NEXT: li a6, 32 -; RV32ZBB-NEXT: sub a6, a6, a2 +; RV32ZBB-NEXT: sub a6, a7, a2 ; RV32ZBB-NEXT: sll a7, a0, a4 ; RV32ZBB-NEXT: bltz a6, .LBB25_5 ; RV32ZBB-NEXT: # %bb.4: @@ -2536,8 +2536,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV32ZBB-NEXT: j .LBB25_6 ; RV32ZBB-NEXT: .LBB25_5: ; RV32ZBB-NEXT: li t0, 64 -; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: sll a1, a1, a4 +; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: not a2, a2 ; RV32ZBB-NEXT: srli a0, a0, 1 ; RV32ZBB-NEXT: srl a0, a0, a2 @@ -2571,9 +2571,9 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV32XTHEADBB-NEXT: or a3, a3, a7 ; RV32XTHEADBB-NEXT: .LBB25_3: ; RV32XTHEADBB-NEXT: srai a6, a6, 31 +; RV32XTHEADBB-NEXT: li a7, 32 ; RV32XTHEADBB-NEXT: and a5, a6, a5 -; RV32XTHEADBB-NEXT: li a6, 32 -; RV32XTHEADBB-NEXT: sub a6, a6, a2 +; RV32XTHEADBB-NEXT: sub a6, a7, a2 ; RV32XTHEADBB-NEXT: sll a7, a0, a4 ; RV32XTHEADBB-NEXT: bltz a6, .LBB25_5 ; RV32XTHEADBB-NEXT: # %bb.4: @@ -2581,8 +2581,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV32XTHEADBB-NEXT: j .LBB25_6 ; RV32XTHEADBB-NEXT: .LBB25_5: ; RV32XTHEADBB-NEXT: li t0, 64 -; RV32XTHEADBB-NEXT: sub a2, t0, a2 ; RV32XTHEADBB-NEXT: sll a1, a1, a4 +; RV32XTHEADBB-NEXT: sub a2, t0, a2 ; RV32XTHEADBB-NEXT: not a2, a2 ; RV32XTHEADBB-NEXT: srli a0, a0, 1 ; RV32XTHEADBB-NEXT: 
srl a0, a0, a2 diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll index 248d6209d5823..4bb8d6c248caa 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll @@ -12,31 +12,31 @@ define i32 @ctlz_i32(i32 %a) nounwind { ; RV32I-NEXT: beqz a0, .LBB0_2 ; RV32I-NEXT: # %bb.1: # %cond.false ; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: addi a1, a2, 1365 +; RV32I-NEXT: srli a2, a0, 2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 16 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: and a1, a0, a2 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: addi a1, a2, -241 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: slli a1, a0, 8 ; RV32I-NEXT: add a0, a0, a1 @@ -62,11 +62,11 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-LABEL: ctlz_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: lui a5, 61681 ; RV32I-NEXT: addi a4, a2, 1365 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a3, a2, 819 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: addi a3, a3, 819 +; RV32I-NEXT: addi a2, a5, -241 ; RV32I-NEXT: bnez a1, .LBB1_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: srli a1, a0, 1 @@ -440,11 +440,11 @@ define i32 @bswap_i32(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: addi a2, a2, -256 ; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: slli a2, a2, 8 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a2 @@ -466,25 +466,24 @@ define i64 @bswap_i64(i64 %a) { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a1, 8 ; RV32I-NEXT: lui a3, 16 +; RV32I-NEXT: srli a4, a1, 24 +; RV32I-NEXT: srli a5, a0, 8 ; RV32I-NEXT: addi a3, a3, -256 ; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: srli a4, a1, 24 ; RV32I-NEXT: or a2, a2, a4 -; RV32I-NEXT: and a4, a1, a3 -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: or a2, a1, a2 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: and a1, a1, a3 ; RV32I-NEXT: srli a4, a0, 24 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a5, a1, 24 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a5, a1 ; RV32I-NEXT: and a3, a0, a3 -; RV32I-NEXT: slli a3, a3, 8 ; 
RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a0, a3 +; RV32I-NEXT: or a0, a1, a2 +; RV32I-NEXT: or a1, a3, a4 ; RV32I-NEXT: ret ; ; RV32XTHEADBB-LABEL: bswap_i64: diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll index 4e958f5699adb..b6344f88cddaa 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll @@ -138,27 +138,26 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: rol_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: slli a3, a2, 26 -; CHECK-NEXT: srli a3, a3, 31 -; CHECK-NEXT: mv a4, a1 -; CHECK-NEXT: bnez a3, .LBB7_2 +; CHECK-NEXT: slli a5, a2, 26 +; CHECK-NEXT: srli a5, a5, 31 +; CHECK-NEXT: mv a3, a1 +; CHECK-NEXT: bnez a5, .LBB7_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a4, a0 +; CHECK-NEXT: mv a3, a0 ; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: sll a5, a4, a2 -; CHECK-NEXT: bnez a3, .LBB7_4 +; CHECK-NEXT: sll a4, a3, a2 +; CHECK-NEXT: bnez a5, .LBB7_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: srli a1, a0, 1 -; CHECK-NEXT: not a6, a2 -; CHECK-NEXT: srl a3, a1, a6 -; CHECK-NEXT: or a3, a5, a3 -; CHECK-NEXT: sll a0, a0, a2 -; CHECK-NEXT: srli a4, a4, 1 -; CHECK-NEXT: srl a1, a4, a6 -; CHECK-NEXT: or a1, a0, a1 -; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: not a5, a2 +; CHECK-NEXT: sll a2, a0, a2 +; CHECK-NEXT: srli a3, a3, 1 +; CHECK-NEXT: srl a0, a1, a5 +; CHECK-NEXT: srl a1, a3, a5 +; CHECK-NEXT: or a0, a4, a0 +; CHECK-NEXT: or a1, a2, a1 ; CHECK-NEXT: ret %or = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b) ret i64 %or @@ -191,24 +190,24 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: ror_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: andi a4, a2, 32 +; CHECK-NEXT: andi a5, a2, 32 ; CHECK-NEXT: mv a3, a0 -; CHECK-NEXT: beqz a4, .LBB9_2 +; CHECK-NEXT: beqz a5, .LBB9_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a3, a1 ; CHECK-NEXT: .LBB9_2: -; CHECK-NEXT: srl a5, a3, a2 -; CHECK-NEXT: beqz a4, .LBB9_4 +; CHECK-NEXT: srl a4, a3, a2 +; CHECK-NEXT: beqz a5, .LBB9_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: .LBB9_4: ; CHECK-NEXT: slli a0, a1, 1 -; CHECK-NEXT: not a4, a2 -; CHECK-NEXT: sll a0, a0, a4 -; CHECK-NEXT: or a0, a0, a5 +; CHECK-NEXT: not a5, a2 ; CHECK-NEXT: srl a1, a1, a2 ; CHECK-NEXT: slli a3, a3, 1 -; CHECK-NEXT: sll a2, a3, a4 +; CHECK-NEXT: sll a0, a0, a5 +; CHECK-NEXT: sll a2, a3, a5 +; CHECK-NEXT: or a0, a0, a4 ; CHECK-NEXT: or a1, a2, a1 ; CHECK-NEXT: ret %or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b) @@ -252,11 +251,10 @@ define i64 @rori_i64(i64 %a) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: srli a2, a0, 1 ; CHECK-NEXT: slli a3, a1, 31 -; CHECK-NEXT: or a2, a3, a2 ; CHECK-NEXT: srli a1, a1, 1 -; CHECK-NEXT: slli a0, a0, 31 -; CHECK-NEXT: or a1, a0, a1 -; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: slli a4, a0, 31 +; CHECK-NEXT: or a0, a3, a2 +; CHECK-NEXT: or a1, a4, a1 ; CHECK-NEXT: ret %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63) ret i64 %1 @@ -267,11 +265,10 @@ define i64 @rori_i64_fshr(i64 %a) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: srli a2, a1, 31 ; CHECK-NEXT: slli a3, a0, 1 -; CHECK-NEXT: or a2, a3, a2 -; CHECK-NEXT: srli a0, a0, 31 +; CHECK-NEXT: srli a4, a0, 31 ; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: or a1, a1, a0 -; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: or a0, a3, a2 +; CHECK-NEXT: or 
a1, a1, a4 ; CHECK-NEXT: ret %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 63) ret i64 %1 @@ -299,12 +296,12 @@ define i64 @not_shl_one_i64(i64 %x) { ; CHECK-LABEL: not_shl_one_i64: ; CHECK: # %bb.0: ; CHECK-NEXT: addi a1, a0, -32 +; CHECK-NEXT: li a2, 1 ; CHECK-NEXT: slti a1, a1, 0 +; CHECK-NEXT: sll a0, a2, a0 ; CHECK-NEXT: neg a2, a1 -; CHECK-NEXT: li a3, 1 -; CHECK-NEXT: sll a0, a3, a0 -; CHECK-NEXT: and a2, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a2, a2, a0 ; CHECK-NEXT: and a1, a1, a0 ; CHECK-NEXT: not a0, a2 ; CHECK-NEXT: not a1, a1 diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index af2ea35cf26c1..90a8eadb3f974 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -12,31 +12,31 @@ define i32 @ctlz_i32(i32 %a) nounwind { ; RV32I-NEXT: beqz a0, .LBB0_2 ; RV32I-NEXT: # %bb.1: # %cond.false ; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: addi a1, a2, 1365 +; RV32I-NEXT: srli a2, a0, 2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 16 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: and a1, a0, a2 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: addi a1, a2, -241 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: slli a1, a0, 8 ; RV32I-NEXT: add a0, a0, a1 @@ -62,11 +62,11 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-LABEL: ctlz_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: lui a5, 61681 ; RV32I-NEXT: addi a4, a2, 1365 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a3, a2, 819 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: addi a3, a3, 819 +; RV32I-NEXT: addi a2, a5, -241 ; RV32I-NEXT: bnez a1, .LBB1_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: srli a1, a0, 1 @@ -257,17 +257,17 @@ define i32 @ctpop_i32(i32 %a) nounwind { ; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: addi a2, a2, 1365 ; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: and a1, a0, a2 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: addi a1, a2, -241 ; RV32I-NEXT: and a0, 
a0, a1 ; RV32I-NEXT: slli a1, a0, 8 ; RV32I-NEXT: add a0, a0, a1 @@ -367,39 +367,39 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 1 ; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: srli a5, a1, 1 ; RV32I-NEXT: addi a3, a3, 1365 ; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: sub a0, a0, a2 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a4, a0, a2 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: add a0, a4, a0 -; RV32I-NEXT: srli a4, a0, 4 -; RV32I-NEXT: add a0, a0, a4 -; RV32I-NEXT: lui a4, 61681 -; RV32I-NEXT: addi a4, a4, -241 -; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: slli a5, a0, 8 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: slli a5, a0, 16 -; RV32I-NEXT: add a0, a0, a5 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a5, a1, 1 ; RV32I-NEXT: and a3, a5, a3 +; RV32I-NEXT: lui a5, 61681 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: addi a5, a5, -241 +; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: and a3, a1, a2 +; RV32I-NEXT: and a2, a0, a4 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a3, a1, a4 ; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: srli a2, a1, 4 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: and a0, a0, a4 ; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: slli a2, a1, 8 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: slli a2, a1, 16 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: srli a3, a1, 4 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a1, 8 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: slli a3, a1, 16 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: ret ; @@ -417,9 +417,9 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: addi a2, a0, -1 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: seqz a0, a0 ; RV32I-NEXT: addi a2, a1, -1 ; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: seqz a0, a0 ; RV32I-NEXT: seqz a1, a1 ; RV32I-NEXT: ret ; @@ -440,9 +440,9 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: addi a2, a0, -1 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: snez a0, a0 ; RV32I-NEXT: addi a2, a1, -1 ; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: snez a0, a0 ; RV32I-NEXT: snez a1, a1 ; RV32I-NEXT: ret ; @@ -451,8 +451,8 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind { ; RV32ZBB-NEXT: cpop a1, a1 ; RV32ZBB-NEXT: cpop a0, a0 ; RV32ZBB-NEXT: sltiu a0, a0, 2 -; RV32ZBB-NEXT: xori a0, a0, 1 ; RV32ZBB-NEXT: sltiu a1, a1, 2 +; RV32ZBB-NEXT: xori a0, a0, 1 ; RV32ZBB-NEXT: xori a1, a1, 1 ; RV32ZBB-NEXT: ret %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) @@ -476,8 +476,8 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind { ; RV32ZBB-NEXT: cpop a1, a1 ; RV32ZBB-NEXT: cpop a0, a0 ; RV32ZBB-NEXT: addi a0, a0, -1 -; RV32ZBB-NEXT: seqz a0, a0 ; RV32ZBB-NEXT: addi a1, a1, -1 +; RV32ZBB-NEXT: seqz a0, a0 ; RV32ZBB-NEXT: seqz a1, a1 ; RV32ZBB-NEXT: ret %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) @@ -491,10 +491,10 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind { ; RV32I-NEXT: addi a2, a0, 
-1 ; RV32I-NEXT: xor a0, a0, a2 ; RV32I-NEXT: sltu a0, a2, a0 -; RV32I-NEXT: xori a0, a0, 1 ; RV32I-NEXT: addi a2, a1, -1 ; RV32I-NEXT: xor a1, a1, a2 ; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: xori a0, a0, 1 ; RV32I-NEXT: xori a1, a1, 1 ; RV32I-NEXT: ret ; @@ -503,8 +503,8 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind { ; RV32ZBB-NEXT: cpop a1, a1 ; RV32ZBB-NEXT: cpop a0, a0 ; RV32ZBB-NEXT: addi a0, a0, -1 -; RV32ZBB-NEXT: snez a0, a0 ; RV32ZBB-NEXT: addi a1, a1, -1 +; RV32ZBB-NEXT: snez a0, a0 ; RV32ZBB-NEXT: snez a1, a1 ; RV32ZBB-NEXT: ret %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) @@ -519,39 +519,39 @@ define i64 @ctpop_i64(i64 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a1, 1 ; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: srli a5, a0, 1 ; RV32I-NEXT: addi a3, a3, 1365 ; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a4, a1, a2 -; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: add a1, a4, a1 -; RV32I-NEXT: srli a4, a1, 4 -; RV32I-NEXT: add a1, a1, a4 -; RV32I-NEXT: lui a4, 61681 -; RV32I-NEXT: addi a4, a4, -241 -; RV32I-NEXT: and a1, a1, a4 -; RV32I-NEXT: slli a5, a1, 8 -; RV32I-NEXT: add a1, a1, a5 -; RV32I-NEXT: slli a5, a1, 16 -; RV32I-NEXT: add a1, a1, a5 -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: srli a5, a0, 1 ; RV32I-NEXT: and a3, a5, a3 +; RV32I-NEXT: lui a5, 61681 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: addi a5, a5, -241 +; RV32I-NEXT: sub a1, a1, a2 ; RV32I-NEXT: sub a0, a0, a3 -; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: and a2, a1, a4 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: and a3, a0, a4 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: add a0, a3, a0 -; RV32I-NEXT: srli a2, a0, 4 -; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: slli a2, a0, 8 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: slli a2, a0, 16 -; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: srli a2, a1, 4 +; RV32I-NEXT: srli a3, a0, 4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: slli a2, a1, 8 +; RV32I-NEXT: slli a3, a0, 8 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: slli a2, a1, 16 +; RV32I-NEXT: slli a3, a0, 16 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: li a1, 0 @@ -682,77 +682,77 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV32I-LABEL: ctpop_v2i64: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a2, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: srli a5, a3, 1 -; RV32I-NEXT: lui a6, 349525 -; RV32I-NEXT: addi a6, a6, 1365 -; RV32I-NEXT: and a5, a5, a6 -; RV32I-NEXT: sub a3, a3, a5 -; RV32I-NEXT: lui a5, 209715 -; RV32I-NEXT: addi a5, a5, 819 -; RV32I-NEXT: and a7, a3, a5 +; RV32I-NEXT: lui a5, 349525 +; RV32I-NEXT: addi a5, a5, 1365 +; RV32I-NEXT: srli a6, a4, 1 +; RV32I-NEXT: srli a7, a3, 1 +; RV32I-NEXT: srli t0, a1, 1 +; RV32I-NEXT: srli t1, a2, 1 +; RV32I-NEXT: and a6, a6, a5 +; RV32I-NEXT: and a7, a7, a5 +; RV32I-NEXT: and t0, t0, a5 +; RV32I-NEXT: and a5, t1, a5 +; RV32I-NEXT: lui 
t1, 209715 +; RV32I-NEXT: addi t1, t1, 819 +; RV32I-NEXT: sub a4, a4, a6 +; RV32I-NEXT: sub a3, a3, a7 +; RV32I-NEXT: sub a1, a1, t0 +; RV32I-NEXT: sub a2, a2, a5 +; RV32I-NEXT: and a5, a4, t1 +; RV32I-NEXT: srli a4, a4, 2 +; RV32I-NEXT: and a6, a3, t1 ; RV32I-NEXT: srli a3, a3, 2 -; RV32I-NEXT: and a3, a3, a5 -; RV32I-NEXT: add a3, a7, a3 +; RV32I-NEXT: and a7, a1, t1 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: and t0, a2, t1 +; RV32I-NEXT: srli a2, a2, 2 +; RV32I-NEXT: and a4, a4, t1 +; RV32I-NEXT: and a3, a3, t1 +; RV32I-NEXT: and a1, a1, t1 +; RV32I-NEXT: and a2, a2, t1 +; RV32I-NEXT: add a4, a5, a4 +; RV32I-NEXT: lui a5, 61681 +; RV32I-NEXT: addi a5, a5, -241 +; RV32I-NEXT: add a3, a6, a3 +; RV32I-NEXT: add a1, a7, a1 +; RV32I-NEXT: add a2, t0, a2 +; RV32I-NEXT: srli a6, a4, 4 ; RV32I-NEXT: srli a7, a3, 4 +; RV32I-NEXT: srli t0, a1, 4 +; RV32I-NEXT: add a4, a4, a6 +; RV32I-NEXT: srli a6, a2, 4 ; RV32I-NEXT: add a3, a3, a7 -; RV32I-NEXT: lui a7, 61681 -; RV32I-NEXT: addi a7, a7, -241 -; RV32I-NEXT: and a3, a3, a7 -; RV32I-NEXT: slli t0, a3, 8 -; RV32I-NEXT: add a3, a3, t0 -; RV32I-NEXT: slli t0, a3, 16 -; RV32I-NEXT: add a3, a3, t0 -; RV32I-NEXT: srli a3, a3, 24 -; RV32I-NEXT: srli t0, a4, 1 -; RV32I-NEXT: and t0, t0, a6 -; RV32I-NEXT: sub a4, a4, t0 -; RV32I-NEXT: and t0, a4, a5 -; RV32I-NEXT: srli a4, a4, 2 +; RV32I-NEXT: add a1, a1, t0 +; RV32I-NEXT: add a2, a2, a6 ; RV32I-NEXT: and a4, a4, a5 -; RV32I-NEXT: add a4, t0, a4 -; RV32I-NEXT: srli t0, a4, 4 -; RV32I-NEXT: add a4, a4, t0 -; RV32I-NEXT: and a4, a4, a7 -; RV32I-NEXT: slli t0, a4, 8 -; RV32I-NEXT: add a4, a4, t0 -; RV32I-NEXT: slli t0, a4, 16 -; RV32I-NEXT: add a4, a4, t0 -; RV32I-NEXT: srli a4, a4, 24 -; RV32I-NEXT: add a3, a4, a3 -; RV32I-NEXT: srli a4, a1, 1 -; RV32I-NEXT: and a4, a4, a6 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: and a4, a1, a5 -; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: and a3, a3, a5 ; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: add a1, a4, a1 -; RV32I-NEXT: srli a4, a1, 4 -; RV32I-NEXT: add a1, a1, a4 -; RV32I-NEXT: and a1, a1, a7 -; RV32I-NEXT: slli a4, a1, 8 -; RV32I-NEXT: add a1, a1, a4 -; RV32I-NEXT: slli a4, a1, 16 -; RV32I-NEXT: add a1, a1, a4 -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: srli a4, a2, 1 -; RV32I-NEXT: and a4, a4, a6 -; RV32I-NEXT: sub a2, a2, a4 -; RV32I-NEXT: and a4, a2, a5 -; RV32I-NEXT: srli a2, a2, 2 ; RV32I-NEXT: and a2, a2, a5 -; RV32I-NEXT: add a2, a4, a2 -; RV32I-NEXT: srli a4, a2, 4 -; RV32I-NEXT: add a2, a2, a4 -; RV32I-NEXT: and a2, a2, a7 -; RV32I-NEXT: slli a4, a2, 8 -; RV32I-NEXT: add a2, a2, a4 -; RV32I-NEXT: slli a4, a2, 16 -; RV32I-NEXT: add a2, a2, a4 +; RV32I-NEXT: slli a5, a4, 8 +; RV32I-NEXT: slli a6, a3, 8 +; RV32I-NEXT: slli a7, a1, 8 +; RV32I-NEXT: slli t0, a2, 8 +; RV32I-NEXT: add a4, a4, a5 +; RV32I-NEXT: add a3, a3, a6 +; RV32I-NEXT: add a1, a1, a7 +; RV32I-NEXT: add a2, a2, t0 +; RV32I-NEXT: slli a5, a4, 16 +; RV32I-NEXT: slli a6, a3, 16 +; RV32I-NEXT: slli a7, a1, 16 +; RV32I-NEXT: slli t0, a2, 16 +; RV32I-NEXT: add a4, a4, a5 +; RV32I-NEXT: add a3, a3, a6 +; RV32I-NEXT: add a1, a1, a7 +; RV32I-NEXT: add a2, a2, t0 +; RV32I-NEXT: srli a4, a4, 24 +; RV32I-NEXT: srli a3, a3, 24 +; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: srli a2, a2, 24 +; RV32I-NEXT: add a3, a3, a4 ; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw zero, 4(a0) @@ -764,14 +764,14 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a2, 4(a1) ; RV32ZBB-NEXT: lw a3, 0(a1) -; RV32ZBB-NEXT: lw a4, 8(a1) -; RV32ZBB-NEXT: lw 
a1, 12(a1) +; RV32ZBB-NEXT: lw a4, 12(a1) +; RV32ZBB-NEXT: lw a1, 8(a1) ; RV32ZBB-NEXT: cpop a2, a2 ; RV32ZBB-NEXT: cpop a3, a3 -; RV32ZBB-NEXT: add a2, a3, a2 +; RV32ZBB-NEXT: cpop a4, a4 ; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a3, a4 -; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: add a2, a3, a2 +; RV32ZBB-NEXT: add a1, a1, a4 ; RV32ZBB-NEXT: sw a2, 0(a0) ; RV32ZBB-NEXT: sw zero, 4(a0) ; RV32ZBB-NEXT: sw a1, 8(a0) @@ -787,35 +787,35 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind { ; RV32I-NEXT: lw a1, 0(a0) ; RV32I-NEXT: lw a2, 4(a0) ; RV32I-NEXT: lw a3, 8(a0) -; RV32I-NEXT: lw a4, 12(a0) -; RV32I-NEXT: addi a0, a1, -1 -; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: addi a4, a1, -1 +; RV32I-NEXT: and a4, a1, a4 ; RV32I-NEXT: seqz a1, a1 ; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: and a1, a2, a1 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: seqz a0, a0 -; RV32I-NEXT: addi a1, a3, -1 -; RV32I-NEXT: and a1, a3, a1 -; RV32I-NEXT: seqz a2, a3 -; RV32I-NEXT: sub a2, a4, a2 -; RV32I-NEXT: and a2, a4, a2 -; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: seqz a1, a1 +; RV32I-NEXT: addi a2, a3, -1 +; RV32I-NEXT: and a2, a3, a2 +; RV32I-NEXT: seqz a3, a3 +; RV32I-NEXT: sub a3, a0, a3 +; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: or a2, a2, a0 +; RV32I-NEXT: seqz a0, a1 +; RV32I-NEXT: seqz a1, a2 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_v2i64_ult_two: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a1, 12(a0) ; RV32ZBB-NEXT: lw a2, 8(a0) -; RV32ZBB-NEXT: lw a3, 0(a0) -; RV32ZBB-NEXT: lw a0, 4(a0) +; RV32ZBB-NEXT: lw a3, 4(a0) +; RV32ZBB-NEXT: lw a0, 0(a0) ; RV32ZBB-NEXT: cpop a1, a1 ; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: cpop a3, a3 ; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: cpop a2, a3 -; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: add a0, a0, a3 ; RV32ZBB-NEXT: sltiu a0, a0, 2 ; RV32ZBB-NEXT: sltiu a1, a1, 2 ; RV32ZBB-NEXT: ret @@ -830,38 +830,38 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind { ; RV32I-NEXT: lw a1, 0(a0) ; RV32I-NEXT: lw a2, 4(a0) ; RV32I-NEXT: lw a3, 8(a0) -; RV32I-NEXT: lw a4, 12(a0) -; RV32I-NEXT: addi a0, a1, -1 -; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: addi a4, a1, -1 +; RV32I-NEXT: and a4, a1, a4 ; RV32I-NEXT: seqz a1, a1 ; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: and a1, a2, a1 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: snez a0, a0 -; RV32I-NEXT: addi a1, a3, -1 -; RV32I-NEXT: and a1, a3, a1 -; RV32I-NEXT: seqz a2, a3 -; RV32I-NEXT: sub a2, a4, a2 -; RV32I-NEXT: and a2, a4, a2 -; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: addi a2, a3, -1 +; RV32I-NEXT: and a2, a3, a2 +; RV32I-NEXT: seqz a3, a3 +; RV32I-NEXT: sub a3, a0, a3 +; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: or a2, a2, a0 +; RV32I-NEXT: snez a0, a1 +; RV32I-NEXT: snez a1, a2 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_v2i64_ugt_one: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a1, 12(a0) ; RV32ZBB-NEXT: lw a2, 8(a0) -; RV32ZBB-NEXT: lw a3, 0(a0) -; RV32ZBB-NEXT: lw a0, 4(a0) +; RV32ZBB-NEXT: lw a3, 4(a0) +; RV32ZBB-NEXT: lw a0, 0(a0) ; RV32ZBB-NEXT: cpop a1, a1 ; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: cpop a3, a3 ; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: cpop a2, a3 -; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: add a0, a0, a3 ; RV32ZBB-NEXT: sltiu a0, a0, 2 -; RV32ZBB-NEXT: xori a0, a0, 1 ; RV32ZBB-NEXT: sltiu 
a1, a1, 2 +; RV32ZBB-NEXT: xori a0, a0, 1 ; RV32ZBB-NEXT: xori a1, a1, 1 ; RV32ZBB-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) @@ -906,17 +906,17 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a1, 12(a0) ; RV32ZBB-NEXT: lw a2, 8(a0) -; RV32ZBB-NEXT: lw a3, 0(a0) -; RV32ZBB-NEXT: lw a0, 4(a0) +; RV32ZBB-NEXT: lw a3, 4(a0) +; RV32ZBB-NEXT: lw a0, 0(a0) ; RV32ZBB-NEXT: cpop a1, a1 ; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: cpop a3, a3 ; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: cpop a2, a3 -; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: add a0, a0, a3 ; RV32ZBB-NEXT: addi a0, a0, -1 -; RV32ZBB-NEXT: seqz a0, a0 ; RV32ZBB-NEXT: addi a1, a1, -1 +; RV32ZBB-NEXT: seqz a0, a0 ; RV32ZBB-NEXT: seqz a1, a1 ; RV32ZBB-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) @@ -963,17 +963,17 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a1, 12(a0) ; RV32ZBB-NEXT: lw a2, 8(a0) -; RV32ZBB-NEXT: lw a3, 0(a0) -; RV32ZBB-NEXT: lw a0, 4(a0) +; RV32ZBB-NEXT: lw a3, 4(a0) +; RV32ZBB-NEXT: lw a0, 0(a0) ; RV32ZBB-NEXT: cpop a1, a1 ; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: cpop a3, a3 ; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: cpop a2, a3 -; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: add a0, a0, a3 ; RV32ZBB-NEXT: addi a0, a0, -1 -; RV32ZBB-NEXT: snez a0, a0 ; RV32ZBB-NEXT: addi a1, a1, -1 +; RV32ZBB-NEXT: snez a0, a0 ; RV32ZBB-NEXT: snez a1, a1 ; RV32ZBB-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) @@ -1300,11 +1300,11 @@ define i32 @bswap_i32(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: addi a2, a2, -256 ; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: slli a2, a2, 8 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a2 @@ -1326,25 +1326,24 @@ define i64 @bswap_i64(i64 %a) { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a1, 8 ; RV32I-NEXT: lui a3, 16 +; RV32I-NEXT: srli a4, a1, 24 +; RV32I-NEXT: srli a5, a0, 8 ; RV32I-NEXT: addi a3, a3, -256 ; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: srli a4, a1, 24 ; RV32I-NEXT: or a2, a2, a4 -; RV32I-NEXT: and a4, a1, a3 -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: or a2, a1, a2 -; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: and a1, a1, a3 ; RV32I-NEXT: srli a4, a0, 24 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a5, a1, 24 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a5, a1 ; RV32I-NEXT: and a3, a0, a3 -; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a0, a3 +; RV32I-NEXT: or a0, a1, a2 +; RV32I-NEXT: or a1, a3, a4 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: bswap_i64: @@ -1405,12 +1404,12 @@ define i64 @orc_b_i64(i64 %a) { ; CHECK-NEXT: and a1, a1, a2 ; CHECK-NEXT: and a0, a0, a2 ; CHECK-NEXT: slli a2, a0, 8 -; CHECK-NEXT: sltu a3, a2, a0 -; CHECK-NEXT: srli a4, a0, 24 -; CHECK-NEXT: slli a5, a1, 8 -; CHECK-NEXT: or a4, a5, a4 -; CHECK-NEXT: sub a1, a4, a1 -; CHECK-NEXT: sub a1, a1, a3 +; 
CHECK-NEXT: srli a3, a0, 24 +; CHECK-NEXT: slli a4, a1, 8 +; CHECK-NEXT: sltu a5, a2, a0 +; CHECK-NEXT: or a3, a4, a3 +; CHECK-NEXT: sub a1, a3, a1 +; CHECK-NEXT: sub a1, a1, a5 ; CHECK-NEXT: sub a0, a2, a0 ; CHECK-NEXT: ret %1 = and i64 %a, 72340172838076673 diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll index c0b9e0b3c7748..1a3beeb79b85b 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbs.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll @@ -49,14 +49,14 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: bclr_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: andi a3, a2, 63 -; RV32I-NEXT: addi a4, a3, -32 -; RV32I-NEXT: slti a4, a4, 0 +; RV32I-NEXT: li a4, 1 +; RV32I-NEXT: addi a5, a3, -32 +; RV32I-NEXT: sll a2, a4, a2 +; RV32I-NEXT: sll a3, a4, a3 +; RV32I-NEXT: slti a4, a5, 0 ; RV32I-NEXT: neg a5, a4 -; RV32I-NEXT: li a6, 1 -; RV32I-NEXT: sll a2, a6, a2 -; RV32I-NEXT: and a2, a5, a2 -; RV32I-NEXT: sll a3, a6, a3 ; RV32I-NEXT: addi a4, a4, -1 +; RV32I-NEXT: and a2, a5, a2 ; RV32I-NEXT: and a3, a4, a3 ; RV32I-NEXT: not a2, a2 ; RV32I-NEXT: not a3, a3 @@ -67,13 +67,13 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind { ; RV32ZBSNOZBB-LABEL: bclr_i64: ; RV32ZBSNOZBB: # %bb.0: ; RV32ZBSNOZBB-NEXT: andi a3, a2, 63 +; RV32ZBSNOZBB-NEXT: bset a2, zero, a2 ; RV32ZBSNOZBB-NEXT: addi a4, a3, -32 +; RV32ZBSNOZBB-NEXT: bset a3, zero, a3 ; RV32ZBSNOZBB-NEXT: slti a4, a4, 0 ; RV32ZBSNOZBB-NEXT: neg a5, a4 -; RV32ZBSNOZBB-NEXT: bset a2, zero, a2 -; RV32ZBSNOZBB-NEXT: and a2, a5, a2 -; RV32ZBSNOZBB-NEXT: bset a3, zero, a3 ; RV32ZBSNOZBB-NEXT: addi a4, a4, -1 +; RV32ZBSNOZBB-NEXT: and a2, a5, a2 ; RV32ZBSNOZBB-NEXT: and a3, a4, a3 ; RV32ZBSNOZBB-NEXT: not a3, a3 ; RV32ZBSNOZBB-NEXT: not a2, a2 @@ -84,13 +84,13 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind { ; RV32ZBSZBB-LABEL: bclr_i64: ; RV32ZBSZBB: # %bb.0: ; RV32ZBSZBB-NEXT: andi a3, a2, 63 +; RV32ZBSZBB-NEXT: bset a2, zero, a2 ; RV32ZBSZBB-NEXT: bset a4, zero, a3 ; RV32ZBSZBB-NEXT: addi a3, a3, -32 ; RV32ZBSZBB-NEXT: slti a3, a3, 0 ; RV32ZBSZBB-NEXT: addi a5, a3, -1 -; RV32ZBSZBB-NEXT: and a4, a5, a4 ; RV32ZBSZBB-NEXT: neg a3, a3 -; RV32ZBSZBB-NEXT: bset a2, zero, a2 +; RV32ZBSZBB-NEXT: and a4, a5, a4 ; RV32ZBSZBB-NEXT: and a2, a3, a2 ; RV32ZBSZBB-NEXT: andn a0, a0, a2 ; RV32ZBSZBB-NEXT: andn a1, a1, a4 @@ -187,24 +187,24 @@ define signext i64 @bset_i64_zero(i64 signext %a) nounwind { ; RV32I-LABEL: bset_i64_zero: ; RV32I: # %bb.0: ; RV32I-NEXT: addi a1, a0, -32 +; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: slti a1, a1, 0 -; RV32I-NEXT: neg a2, a1 -; RV32I-NEXT: li a3, 1 -; RV32I-NEXT: sll a3, a3, a0 -; RV32I-NEXT: and a0, a2, a3 +; RV32I-NEXT: sll a2, a2, a0 +; RV32I-NEXT: neg a0, a1 ; RV32I-NEXT: addi a1, a1, -1 -; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: ret ; ; RV32ZBS-LABEL: bset_i64_zero: ; RV32ZBS: # %bb.0: ; RV32ZBS-NEXT: addi a1, a0, -32 -; RV32ZBS-NEXT: slti a1, a1, 0 -; RV32ZBS-NEXT: neg a2, a1 -; RV32ZBS-NEXT: bset a3, zero, a0 -; RV32ZBS-NEXT: and a0, a2, a3 -; RV32ZBS-NEXT: addi a1, a1, -1 -; RV32ZBS-NEXT: and a1, a1, a3 +; RV32ZBS-NEXT: bset a2, zero, a0 +; RV32ZBS-NEXT: slti a0, a1, 0 +; RV32ZBS-NEXT: neg a1, a0 +; RV32ZBS-NEXT: addi a3, a0, -1 +; RV32ZBS-NEXT: and a0, a1, a2 +; RV32ZBS-NEXT: and a1, a3, a2 ; RV32ZBS-NEXT: ret %shl = shl i64 1, %a ret i64 %shl diff --git a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll index 315bf86046dff..dd49d9e3e2dce 100644 --- 
a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll +++ b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll @@ -73,13 +73,13 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind { ; RV64I-NEXT: li a1, -449 ; RV64I-NEXT: slli a1, a1, 53 ; RV64I-NEXT: call __gedf2 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __fixdfti -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: mv s3, a1 ; RV64I-NEXT: li s5, -1 -; RV64I-NEXT: bgez s1, .LBB4_2 +; RV64I-NEXT: bgez s2, .LBB4_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: slli s3, s5, 63 ; RV64I-NEXT: .LBB4_2: @@ -97,14 +97,14 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind { ; RV64I-NEXT: mv a1, s0 ; RV64I-NEXT: call __unorddf2 ; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: slti a1, s2, 0 +; RV64I-NEXT: sgtz a2, s4 ; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: addi a3, a1, -1 ; RV64I-NEXT: and a1, a0, s3 -; RV64I-NEXT: slti a2, s1, 0 -; RV64I-NEXT: addi a2, a2, -1 -; RV64I-NEXT: and a2, a2, s2 -; RV64I-NEXT: sgtz a3, s4 -; RV64I-NEXT: neg a3, a3 -; RV64I-NEXT: or a2, a3, a2 +; RV64I-NEXT: and a3, a3, s1 +; RV64I-NEXT: neg a2, a2 +; RV64I-NEXT: or a2, a2, a3 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -140,11 +140,11 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind { ; RV64ID-NEXT: srli a1, a2, 1 ; RV64ID-NEXT: .LBB4_4: ; RV64ID-NEXT: feq.d a2, fs0, fs0 -; RV64ID-NEXT: neg a2, a2 -; RV64ID-NEXT: and a1, a2, a1 ; RV64ID-NEXT: neg a3, a3 ; RV64ID-NEXT: neg a4, s0 +; RV64ID-NEXT: neg a2, a2 ; RV64ID-NEXT: and a0, a4, a0 +; RV64ID-NEXT: and a1, a2, a1 ; RV64ID-NEXT: or a0, a3, a0 ; RV64ID-NEXT: and a0, a2, a0 ; RV64ID-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -178,11 +178,11 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind { ; RV64IDINX-NEXT: srli a1, a2, 1 ; RV64IDINX-NEXT: .LBB4_4: ; RV64IDINX-NEXT: feq.d a2, s0, s0 -; RV64IDINX-NEXT: neg a2, a2 -; RV64IDINX-NEXT: and a1, a2, a1 ; RV64IDINX-NEXT: neg a3, a3 ; RV64IDINX-NEXT: neg a4, s1 +; RV64IDINX-NEXT: neg a2, a2 ; RV64IDINX-NEXT: and a0, a4, a0 +; RV64IDINX-NEXT: and a1, a2, a1 ; RV64IDINX-NEXT: or a0, a3, a0 ; RV64IDINX-NEXT: and a0, a2, a0 ; RV64IDINX-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -219,10 +219,10 @@ define i128 @fptoui_sat_f64_to_i128(double %a) nounwind { ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __gtdf2 ; RV64I-NEXT: sgtz a0, a0 -; RV64I-NEXT: neg a1, a0 -; RV64I-NEXT: or a0, a1, s3 -; RV64I-NEXT: and a2, s2, s1 -; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: and a1, s2, s1 +; RV64I-NEXT: neg a2, a0 +; RV64I-NEXT: or a0, a2, s3 +; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -245,10 +245,10 @@ define i128 @fptoui_sat_f64_to_i128(double %a) nounwind { ; RV64ID-NEXT: lui a2, %hi(.LCPI5_0) ; RV64ID-NEXT: fld fa5, %lo(.LCPI5_0)(a2) ; RV64ID-NEXT: and a0, s0, a0 +; RV64ID-NEXT: and a1, s0, a1 ; RV64ID-NEXT: flt.d a2, fa5, fs0 ; RV64ID-NEXT: neg a2, a2 ; RV64ID-NEXT: or a0, a2, a0 -; RV64ID-NEXT: and a1, s0, a1 ; RV64ID-NEXT: or a1, a2, a1 ; RV64ID-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64ID-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -270,10 +270,10 @@ define i128 @fptoui_sat_f64_to_i128(double %a) nounwind { ; RV64IDINX-NEXT: lui a2, %hi(.LCPI5_0) ; RV64IDINX-NEXT: ld a2, %lo(.LCPI5_0)(a2) ; RV64IDINX-NEXT: and a0, s1, a0 +; RV64IDINX-NEXT: and a1, s1, a1 ; RV64IDINX-NEXT: flt.d 
a2, a2, s0 ; RV64IDINX-NEXT: neg a2, a2 ; RV64IDINX-NEXT: or a0, a2, a0 -; RV64IDINX-NEXT: and a1, s1, a1 ; RV64IDINX-NEXT: or a1, a2, a1 ; RV64IDINX-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64IDINX-NEXT: ld s0, 16(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rv64-float-convert.ll b/llvm/test/CodeGen/RISCV/rv64-float-convert.ll index 8ebb9433bad79..0cdd92fbaf916 100644 --- a/llvm/test/CodeGen/RISCV/rv64-float-convert.ll +++ b/llvm/test/CodeGen/RISCV/rv64-float-convert.ll @@ -133,14 +133,14 @@ define i128 @fptosi_sat_f32_to_i128(float %a) nounwind { ; RV64I-NEXT: mv a1, s0 ; RV64I-NEXT: call __unordsf2 ; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: slti a1, s1, 0 +; RV64I-NEXT: sgtz a2, s4 ; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: addi a3, a1, -1 ; RV64I-NEXT: and a1, a0, s3 -; RV64I-NEXT: slti a2, s1, 0 -; RV64I-NEXT: addi a2, a2, -1 -; RV64I-NEXT: and a2, a2, s2 -; RV64I-NEXT: sgtz a3, s4 -; RV64I-NEXT: neg a3, a3 -; RV64I-NEXT: or a2, a3, a2 +; RV64I-NEXT: and a3, a3, s2 +; RV64I-NEXT: neg a2, a2 +; RV64I-NEXT: or a2, a2, a3 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -176,11 +176,11 @@ define i128 @fptosi_sat_f32_to_i128(float %a) nounwind { ; RV64IF-NEXT: srli a1, a3, 1 ; RV64IF-NEXT: .LBB4_4: ; RV64IF-NEXT: feq.s a3, fs0, fs0 -; RV64IF-NEXT: neg a3, a3 -; RV64IF-NEXT: and a1, a3, a1 ; RV64IF-NEXT: neg a4, s0 -; RV64IF-NEXT: and a0, a4, a0 ; RV64IF-NEXT: neg a2, a2 +; RV64IF-NEXT: neg a3, a3 +; RV64IF-NEXT: and a0, a4, a0 +; RV64IF-NEXT: and a1, a3, a1 ; RV64IF-NEXT: or a0, a2, a0 ; RV64IF-NEXT: and a0, a3, a0 ; RV64IF-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -213,11 +213,11 @@ define i128 @fptosi_sat_f32_to_i128(float %a) nounwind { ; RV64IZFINX-NEXT: srli a1, a2, 1 ; RV64IZFINX-NEXT: .LBB4_4: ; RV64IZFINX-NEXT: feq.s a2, s0, s0 -; RV64IZFINX-NEXT: neg a2, a2 -; RV64IZFINX-NEXT: and a1, a2, a1 ; RV64IZFINX-NEXT: neg a4, s1 -; RV64IZFINX-NEXT: and a0, a4, a0 ; RV64IZFINX-NEXT: neg a3, a3 +; RV64IZFINX-NEXT: neg a2, a2 +; RV64IZFINX-NEXT: and a0, a4, a0 +; RV64IZFINX-NEXT: and a1, a2, a1 ; RV64IZFINX-NEXT: or a0, a3, a0 ; RV64IZFINX-NEXT: and a0, a2, a0 ; RV64IZFINX-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -252,8 +252,8 @@ define i128 @fptoui_sat_f32_to_i128(float %a) nounwind { ; RV64I-NEXT: sext.w a0, s0 ; RV64I-NEXT: call __fixunssfti ; RV64I-NEXT: and a0, s2, a0 -; RV64I-NEXT: or a0, s1, a0 ; RV64I-NEXT: and a1, s2, a1 +; RV64I-NEXT: or a0, s1, a0 ; RV64I-NEXT: or a1, s1, a1 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -276,10 +276,10 @@ define i128 @fptoui_sat_f32_to_i128(float %a) nounwind { ; RV64IF-NEXT: lui a2, %hi(.LCPI5_0) ; RV64IF-NEXT: flw fa5, %lo(.LCPI5_0)(a2) ; RV64IF-NEXT: and a0, s0, a0 +; RV64IF-NEXT: and a1, s0, a1 ; RV64IF-NEXT: flt.s a2, fa5, fs0 ; RV64IF-NEXT: neg a2, a2 ; RV64IF-NEXT: or a0, a2, a0 -; RV64IF-NEXT: and a1, s0, a1 ; RV64IF-NEXT: or a1, a2, a1 ; RV64IF-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64IF-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -300,11 +300,11 @@ define i128 @fptoui_sat_f32_to_i128(float %a) nounwind { ; RV64IZFINX-NEXT: call __fixunssfti ; RV64IZFINX-NEXT: and a0, s1, a0 ; RV64IZFINX-NEXT: lui a2, 522240 +; RV64IZFINX-NEXT: and a1, s1, a1 ; RV64IZFINX-NEXT: addiw a2, a2, -1 ; RV64IZFINX-NEXT: flt.s a2, a2, s0 ; RV64IZFINX-NEXT: neg a2, a2 ; RV64IZFINX-NEXT: or a0, a2, a0 -; RV64IZFINX-NEXT: and a1, s1, a1 ; RV64IZFINX-NEXT: or a1, a2, a1 ; 
RV64IZFINX-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64IZFINX-NEXT: ld s0, 16(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll index 2cb2ecbd57f65..a717c6c71f2ec 100644 --- a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll +++ b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll @@ -174,14 +174,14 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind { ; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: call __unordsf2 ; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: sgtz a1, s4 +; RV64I-NEXT: slti a2, s0, 0 ; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: neg a3, a1 +; RV64I-NEXT: addi a2, a2, -1 ; RV64I-NEXT: and a1, a0, s3 -; RV64I-NEXT: sgtz a2, s4 -; RV64I-NEXT: neg a2, a2 -; RV64I-NEXT: slti a3, s0, 0 -; RV64I-NEXT: addi a3, a3, -1 -; RV64I-NEXT: and a3, a3, s2 -; RV64I-NEXT: or a2, a2, a3 +; RV64I-NEXT: and a2, a2, s2 +; RV64I-NEXT: or a2, a3, a2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -218,11 +218,11 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind { ; RV64IZFH-NEXT: srli a1, a2, 1 ; RV64IZFH-NEXT: .LBB4_4: ; RV64IZFH-NEXT: feq.s a2, fs0, fs0 -; RV64IZFH-NEXT: neg a2, a2 -; RV64IZFH-NEXT: and a1, a2, a1 ; RV64IZFH-NEXT: neg a3, a3 ; RV64IZFH-NEXT: neg a4, s0 +; RV64IZFH-NEXT: neg a2, a2 ; RV64IZFH-NEXT: and a0, a4, a0 +; RV64IZFH-NEXT: and a1, a2, a1 ; RV64IZFH-NEXT: or a0, a3, a0 ; RV64IZFH-NEXT: and a0, a2, a0 ; RV64IZFH-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -255,11 +255,11 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind { ; RV64IZHINX-NEXT: srli a1, a2, 1 ; RV64IZHINX-NEXT: .LBB4_4: ; RV64IZHINX-NEXT: feq.s a2, s0, s0 -; RV64IZHINX-NEXT: neg a2, a2 -; RV64IZHINX-NEXT: and a1, a2, a1 ; RV64IZHINX-NEXT: neg a3, a3 ; RV64IZHINX-NEXT: neg a4, s1 +; RV64IZHINX-NEXT: neg a2, a2 ; RV64IZHINX-NEXT: and a0, a4, a0 +; RV64IZHINX-NEXT: and a1, a2, a1 ; RV64IZHINX-NEXT: or a0, a3, a0 ; RV64IZHINX-NEXT: and a0, a2, a0 ; RV64IZHINX-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -295,8 +295,8 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind { ; RV64I-NEXT: sext.w a0, s0 ; RV64I-NEXT: call __fixunssfti ; RV64I-NEXT: and a0, s2, a0 -; RV64I-NEXT: or a0, s1, a0 ; RV64I-NEXT: and a1, s2, a1 +; RV64I-NEXT: or a0, s1, a0 ; RV64I-NEXT: or a1, s1, a1 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -314,15 +314,15 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind { ; RV64IZFH-NEXT: lui a0, %hi(.LCPI5_0) ; RV64IZFH-NEXT: flw fa5, %lo(.LCPI5_0)(a0) ; RV64IZFH-NEXT: fcvt.s.h fa0, fa0 -; RV64IZFH-NEXT: flt.s a0, fa5, fa0 -; RV64IZFH-NEXT: neg s0, a0 -; RV64IZFH-NEXT: fmv.w.x fa5, zero -; RV64IZFH-NEXT: fle.s a0, fa5, fa0 +; RV64IZFH-NEXT: fmv.w.x fa4, zero +; RV64IZFH-NEXT: fle.s a0, fa4, fa0 +; RV64IZFH-NEXT: flt.s a1, fa5, fa0 +; RV64IZFH-NEXT: neg s0, a1 ; RV64IZFH-NEXT: neg s1, a0 ; RV64IZFH-NEXT: call __fixunssfti ; RV64IZFH-NEXT: and a0, s1, a0 -; RV64IZFH-NEXT: or a0, s0, a0 ; RV64IZFH-NEXT: and a1, s1, a1 +; RV64IZFH-NEXT: or a0, s0, a0 ; RV64IZFH-NEXT: or a1, s0, a1 ; RV64IZFH-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64IZFH-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -339,14 +339,14 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind { ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZHINX-NEXT: lui a1, 522240 ; RV64IZHINX-NEXT: addiw a1, a1, -1 +; RV64IZHINX-NEXT: fle.s a2, zero, a0 ; RV64IZHINX-NEXT: flt.s a1, a1, a0 ; RV64IZHINX-NEXT: neg s0, a1 
-; RV64IZHINX-NEXT: fle.s a1, zero, a0 -; RV64IZHINX-NEXT: neg s1, a1 +; RV64IZHINX-NEXT: neg s1, a2 ; RV64IZHINX-NEXT: call __fixunssfti ; RV64IZHINX-NEXT: and a0, s1, a0 -; RV64IZHINX-NEXT: or a0, s0, a0 ; RV64IZHINX-NEXT: and a1, s1, a1 +; RV64IZHINX-NEXT: or a0, s0, a0 ; RV64IZHINX-NEXT: or a1, s0, a1 ; RV64IZHINX-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64IZHINX-NEXT: ld s0, 16(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll index ba18406326509..1ec4d8ddd1d84 100644 --- a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll +++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll @@ -18,15 +18,15 @@ define i64 @test0(i64 %n, ptr %p) nounwind { ; RV64-NEXT: mv s0, a0 ; RV64-NEXT: lui a0, %hi(f) ; RV64-NEXT: addi a0, a0, %lo(f) +; RV64-NEXT: li a2, 919 +; RV64-NEXT: lui a3, %hi(.LCPI0_0) ; RV64-NEXT: sd a0, 32(sp) -; RV64-NEXT: li a0, 919 -; RV64-NEXT: lui a2, %hi(.LCPI0_0) -; RV64-NEXT: ld a2, %lo(.LCPI0_0)(a2) -; RV64-NEXT: lui a3, 6203 -; RV64-NEXT: addi a3, a3, 643 -; RV64-NEXT: sw a0, 8(sp) -; RV64-NEXT: sw a3, 12(sp) -; RV64-NEXT: sd a2, 16(sp) +; RV64-NEXT: lui a0, 6203 +; RV64-NEXT: ld a3, %lo(.LCPI0_0)(a3) +; RV64-NEXT: addi a0, a0, 643 +; RV64-NEXT: sw a2, 8(sp) +; RV64-NEXT: sw a0, 12(sp) +; RV64-NEXT: sd a3, 16(sp) ; RV64-NEXT: sd a1, 24(sp) ; RV64-NEXT: addi a1, sp, 24 ; RV64-NEXT: addi a0, sp, 8 @@ -49,15 +49,15 @@ define i64 @test0(i64 %n, ptr %p) nounwind { ; RV64-LINUX-NEXT: mv s0, a0 ; RV64-LINUX-NEXT: lui a0, %hi(f) ; RV64-LINUX-NEXT: addi a0, a0, %lo(f) +; RV64-LINUX-NEXT: li a2, 919 +; RV64-LINUX-NEXT: lui a3, %hi(.LCPI0_0) ; RV64-LINUX-NEXT: sd a0, 32(sp) -; RV64-LINUX-NEXT: li a0, 919 -; RV64-LINUX-NEXT: lui a2, %hi(.LCPI0_0) -; RV64-LINUX-NEXT: ld a2, %lo(.LCPI0_0)(a2) -; RV64-LINUX-NEXT: lui a3, 6203 -; RV64-LINUX-NEXT: addi a3, a3, 643 -; RV64-LINUX-NEXT: sw a0, 8(sp) -; RV64-LINUX-NEXT: sw a3, 12(sp) -; RV64-LINUX-NEXT: sd a2, 16(sp) +; RV64-LINUX-NEXT: lui a0, 6203 +; RV64-LINUX-NEXT: ld a3, %lo(.LCPI0_0)(a3) +; RV64-LINUX-NEXT: addi a0, a0, 643 +; RV64-LINUX-NEXT: sw a2, 8(sp) +; RV64-LINUX-NEXT: sw a0, 12(sp) +; RV64-LINUX-NEXT: sd a3, 16(sp) ; RV64-LINUX-NEXT: sd a1, 24(sp) ; RV64-LINUX-NEXT: addi a1, sp, 24 ; RV64-LINUX-NEXT: addi a0, sp, 8 diff --git a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll index 6c4466796aeed..b3c22a5322cb4 100644 --- a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll +++ b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll @@ -175,10 +175,10 @@ define i8 @test13(ptr %0, i64 %1) { ; RV64I-LABEL: test13: ; RV64I: # %bb.0: ; RV64I-NEXT: li a2, 1 -; RV64I-NEXT: subw a2, a2, a1 -; RV64I-NEXT: add a2, a0, a2 ; RV64I-NEXT: li a3, 2 +; RV64I-NEXT: subw a2, a2, a1 ; RV64I-NEXT: subw a3, a3, a1 +; RV64I-NEXT: add a2, a0, a2 ; RV64I-NEXT: add a0, a0, a3 ; RV64I-NEXT: lbu a1, 0(a2) ; RV64I-NEXT: lbu a0, 0(a0) @@ -203,8 +203,8 @@ define signext i32 @test14(ptr %0, ptr %1, i64 %2) { ; RV64I-NEXT: li a3, 1 ; RV64I-NEXT: subw a3, a3, a2 ; RV64I-NEXT: add a0, a0, a3 -; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: slli a3, a3, 2 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: lw a1, 0(a1) ; RV64I-NEXT: addw a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll index ab1691543c78a..0782018833de3 100644 --- a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll +++ b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll @@ -7,11 +7,11 @@ define signext 
i32 @addw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin ; CHECK-NEXT: bge a0, a1, .LBB0_2 ; CHECK-NEXT: # %bb.1: # %for.body.preheader ; CHECK-NEXT: not a2, a0 -; CHECK-NEXT: add a2, a2, a1 ; CHECK-NEXT: addi a3, a0, 1 -; CHECK-NEXT: mul a3, a2, a3 +; CHECK-NEXT: add a2, a2, a1 ; CHECK-NEXT: subw a1, a1, a0 ; CHECK-NEXT: addi a1, a1, -2 +; CHECK-NEXT: mul a3, a2, a3 ; CHECK-NEXT: slli a1, a1, 32 ; CHECK-NEXT: slli a2, a2, 32 ; CHECK-NEXT: mulhu a1, a2, a1 @@ -53,13 +53,13 @@ define signext i32 @subw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin ; CHECK-NEXT: bge a0, a1, .LBB1_2 ; CHECK-NEXT: # %bb.1: # %for.body.preheader ; CHECK-NEXT: not a2, a0 -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: mul a2, a3, a2 -; CHECK-NEXT: subw a1, a1, a0 -; CHECK-NEXT: addi a1, a1, -2 -; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: subw a3, a1, a0 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: addi a3, a3, -2 +; CHECK-NEXT: mul a2, a1, a2 ; CHECK-NEXT: slli a3, a3, 32 -; CHECK-NEXT: mulhu a1, a3, a1 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: mulhu a1, a1, a3 ; CHECK-NEXT: srli a1, a1, 1 ; CHECK-NEXT: subw a0, a2, a0 ; CHECK-NEXT: subw a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll index 47c4e8beecced..d9f7d36127293 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll @@ -12,31 +12,31 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: beqz a0, .LBB0_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -64,31 +64,31 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: beqz a0, .LBB1_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; 
RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -125,31 +125,31 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-NEXT: beqz a1, .LBB2_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a2, a1, 1 +; RV64I-NEXT: lui a3, 349525 ; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 2 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 4 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 8 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 16 -; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: addiw a2, a3, 1365 +; RV64I-NEXT: srliw a3, a1, 2 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 4 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 8 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 16 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: not a1, a1 -; RV64I-NEXT: srli a2, a1, 1 -; RV64I-NEXT: lui a3, 349525 -; RV64I-NEXT: addiw a3, a3, 1365 -; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: srli a3, a1, 1 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: addiw a3, a3, 819 ; RV64I-NEXT: sub a1, a1, a2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: and a2, a1, a3 ; RV64I-NEXT: srli a1, a1, 2 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: add a1, a2, a1 ; RV64I-NEXT: srli a2, a1, 4 ; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: addi a2, a3, -241 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a2, a1, 8 ; RV64I-NEXT: add a1, a1, a2 @@ -179,39 +179,39 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; RV64I-LABEL: findLastSet_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a1, a0, a1 -; RV64I-NEXT: srliw a2, a1, 2 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 4 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 8 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 16 -; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: srliw a3, a1, 2 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 4 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 8 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 16 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: not a1, a1 -; RV64I-NEXT: srli a2, a1, 1 -; RV64I-NEXT: lui a3, 349525 -; RV64I-NEXT: addiw a3, a3, 1365 -; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: srli a3, a1, 1 +; RV64I-NEXT: and 
a2, a3, a2 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: addiw a3, a3, 819 ; RV64I-NEXT: sub a1, a1, a2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: and a2, a1, a3 ; RV64I-NEXT: srli a1, a1, 2 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: addi a3, a3, -241 +; RV64I-NEXT: add a1, a2, a1 ; RV64I-NEXT: srli a2, a1, 4 ; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addi a2, a2, -241 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: slli a2, a1, 8 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: slli a2, a1, 16 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: srliw a1, a1, 24 ; RV64I-NEXT: xori a1, a1, 31 -; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: addi a0, a0, -1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -219,10 +219,10 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; RV64XTHEADBB-LABEL: findLastSet_i32: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: not a1, a0 +; RV64XTHEADBB-NEXT: snez a0, a0 ; RV64XTHEADBB-NEXT: slli a1, a1, 32 ; RV64XTHEADBB-NEXT: th.ff0 a1, a1 ; RV64XTHEADBB-NEXT: xori a1, a1, 31 -; RV64XTHEADBB-NEXT: snez a0, a0 ; RV64XTHEADBB-NEXT: addi a0, a0, -1 ; RV64XTHEADBB-NEXT: or a0, a0, a1 ; RV64XTHEADBB-NEXT: ret @@ -240,31 +240,31 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: beqz a0, .LBB4_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -296,40 +296,40 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: beqz a0, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, 
a2, 1365 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: addiw a2, a3, 819 +; RV64I-NEXT: srli a3, a0, 2 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: slli a3, a1, 32 +; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: slli a3, a2, 32 ; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a3, a0, 4 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 8 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 16 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 32 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: addiw a3, a3, -241 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: slli a2, a3, 32 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a2, a3, a2 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 16 @@ -456,10 +456,10 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { ; RV64XTHEADBB-NEXT: addi a1, a0, -1 ; RV64XTHEADBB-NEXT: not a2, a0 ; RV64XTHEADBB-NEXT: and a1, a2, a1 -; RV64XTHEADBB-NEXT: th.ff1 a1, a1 ; RV64XTHEADBB-NEXT: li a2, 64 -; RV64XTHEADBB-NEXT: sub a2, a2, a1 ; RV64XTHEADBB-NEXT: snez a0, a0 +; RV64XTHEADBB-NEXT: th.ff1 a1, a1 +; RV64XTHEADBB-NEXT: sub a2, a2, a1 ; RV64XTHEADBB-NEXT: addi a0, a0, -1 ; RV64XTHEADBB-NEXT: or a0, a0, a2 ; RV64XTHEADBB-NEXT: ret @@ -486,8 +486,8 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64I-NEXT: addi a1, a1, %lo(.LCPI9_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: addi a0, a0, 1 ; RV64I-NEXT: seqz a1, s0 +; RV64I-NEXT: addi a0, a0, 1 ; RV64I-NEXT: addi a1, a1, -1 ; RV64I-NEXT: and a0, a1, a0 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -500,10 +500,10 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64XTHEADBB-NEXT: addi a1, a0, -1 ; RV64XTHEADBB-NEXT: not a2, a0 ; RV64XTHEADBB-NEXT: and a1, a2, a1 -; RV64XTHEADBB-NEXT: th.ff1 a1, a1 ; RV64XTHEADBB-NEXT: li a2, 65 -; RV64XTHEADBB-NEXT: sub a2, a2, a1 ; RV64XTHEADBB-NEXT: seqz a0, a0 +; RV64XTHEADBB-NEXT: th.ff1 a1, a1 +; RV64XTHEADBB-NEXT: sub a2, a2, a1 ; RV64XTHEADBB-NEXT: addi a0, a0, -1 ; RV64XTHEADBB-NEXT: and a0, a0, a2 ; RV64XTHEADBB-NEXT: ret @@ -802,11 +802,11 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: srliw a3, a0, 24 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: srliw a3, a0, 24 -; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: and a2, a0, a2 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: slli a2, a2, 8 ; RV64I-NEXT: slliw a0, a0, 24 ; RV64I-NEXT: or a0, a0, a2 @@ -827,11 +827,11 @@ define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: srli a2, a0, 8 ; RV64I-NEXT: lui a3, 16 +; RV64I-NEXT: srliw a4, a0, 24 ; RV64I-NEXT: addi a3, a3, -256 ; RV64I-NEXT: and a2, a2, a3 -; 
RV64I-NEXT: srliw a4, a0, 24 -; RV64I-NEXT: or a2, a2, a4 ; RV64I-NEXT: and a3, a0, a3 +; RV64I-NEXT: or a2, a2, a4 ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: or a0, a0, a3 @@ -856,28 +856,28 @@ define i64 @bswap_i64(i64 %a) { ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a0, 40 ; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: srli a3, a0, 56 +; RV64I-NEXT: srli a4, a0, 24 +; RV64I-NEXT: lui a5, 4080 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: srli a3, a0, 24 -; RV64I-NEXT: lui a4, 4080 -; RV64I-NEXT: and a3, a3, a4 -; RV64I-NEXT: srli a5, a0, 8 -; RV64I-NEXT: srliw a5, a5, 24 -; RV64I-NEXT: slli a5, a5, 24 -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: and a4, a0, a4 -; RV64I-NEXT: slli a4, a4, 24 -; RV64I-NEXT: srliw a3, a0, 24 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: srli a3, a0, 8 +; RV64I-NEXT: and a4, a4, a5 +; RV64I-NEXT: srliw a3, a3, 24 +; RV64I-NEXT: slli a3, a3, 24 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: srliw a4, a0, 24 +; RV64I-NEXT: and a5, a0, a5 ; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: slli a2, a2, 40 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a5, a5, 24 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a2, a2, 40 +; RV64I-NEXT: or a1, a3, a1 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 07726b643b51a..9760821832b37 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -2656,8 +2656,8 @@ define i64 @array_index_lshr_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) { ; RV64I-LABEL: array_index_lshr_sh3_sh3: ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a1, 58 -; RV64I-NEXT: slli a1, a1, 6 ; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: slli a1, a1, 6 ; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ld a0, 0(a0) @@ -2759,8 +2759,8 @@ define ptr @test_gep_gep_dont_crash(ptr %p, i64 %a1, i64 %a2) { ; RV64I-LABEL: test_gep_gep_dont_crash: ; RV64I: # %bb.0: ; RV64I-NEXT: srliw a2, a2, 6 -; RV64I-NEXT: slli a2, a2, 3 ; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: slli a2, a2, 3 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll index a7af8ab348e99..3f984deccfb2c 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll @@ -59,12 +59,12 @@ define i64 @orcb64_knownbits(i64 %a) nounwind { ; RV64ZBB-LABEL: orcb64_knownbits: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: lui a1, 65535 +; RV64ZBB-NEXT: lui a2, 256 ; RV64ZBB-NEXT: slli a1, a1, 12 +; RV64ZBB-NEXT: addiw a2, a2, 8 ; RV64ZBB-NEXT: and a0, a0, a1 -; RV64ZBB-NEXT: lui a1, 256 -; RV64ZBB-NEXT: addiw a1, a1, 8 -; RV64ZBB-NEXT: slli a2, a1, 42 -; RV64ZBB-NEXT: add a1, a1, a2 +; RV64ZBB-NEXT: slli a1, a2, 42 +; RV64ZBB-NEXT: add a1, a2, a1 ; RV64ZBB-NEXT: or a0, a0, a1 ; RV64ZBB-NEXT: orc.b a0, a0 ; RV64ZBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll index d9afb7c00ce58..bf077364c9c7a 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll @@ -146,10 +146,10 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind { ; 
RV64I-LABEL: rol_i32_neg_constant_rhs: ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, -2 -; RV64I-NEXT: sllw a2, a1, a0 -; RV64I-NEXT: negw a0, a0 -; RV64I-NEXT: srlw a0, a1, a0 -; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: negw a2, a0 +; RV64I-NEXT: sllw a0, a1, a0 +; RV64I-NEXT: srlw a1, a1, a2 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBB-ZBKB-LABEL: rol_i32_neg_constant_rhs: @@ -224,10 +224,10 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind { ; RV64I-LABEL: ror_i32_neg_constant_rhs: ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, -2 -; RV64I-NEXT: srlw a2, a1, a0 -; RV64I-NEXT: negw a0, a0 -; RV64I-NEXT: sllw a0, a1, a0 -; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: negw a2, a0 +; RV64I-NEXT: srlw a0, a1, a0 +; RV64I-NEXT: sllw a1, a1, a2 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBB-ZBKB-LABEL: ror_i32_neg_constant_rhs: diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index 1e7814d588e4c..d67db77c04a8e 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -12,31 +12,31 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: beqz a0, .LBB0_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -62,31 +62,31 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: beqz a0, .LBB1_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, 
a1, a2 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -121,31 +121,31 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-NEXT: beqz a1, .LBB2_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a2, a1, 1 +; RV64I-NEXT: lui a3, 349525 ; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 2 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 4 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 8 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 16 -; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: addiw a2, a3, 1365 +; RV64I-NEXT: srliw a3, a1, 2 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 4 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 8 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 16 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: not a1, a1 -; RV64I-NEXT: srli a2, a1, 1 -; RV64I-NEXT: lui a3, 349525 -; RV64I-NEXT: addiw a3, a3, 1365 -; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: srli a3, a1, 1 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: addiw a3, a3, 819 ; RV64I-NEXT: sub a1, a1, a2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: and a2, a1, a3 ; RV64I-NEXT: srli a1, a1, 2 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: add a1, a2, a1 ; RV64I-NEXT: srli a2, a1, 4 ; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: addi a2, a3, -241 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: slli a2, a1, 8 ; RV64I-NEXT: add a1, a1, a2 @@ -173,39 +173,39 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; RV64I-LABEL: findLastSet_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a1, a0, a1 -; RV64I-NEXT: srliw a2, a1, 2 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 4 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 8 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: srliw a2, a1, 16 -; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: srliw a3, a1, 2 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 4 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 8 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: srliw a3, a1, 16 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: not a1, a1 -; RV64I-NEXT: srli a2, a1, 1 -; RV64I-NEXT: lui a3, 349525 -; RV64I-NEXT: addiw a3, a3, 1365 -; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: srli a3, a1, 1 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: addiw a3, a3, 819 ; RV64I-NEXT: sub a1, a1, a2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: and a2, a1, a3 ; RV64I-NEXT: srli a1, a1, 2 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: add a1, a3, a1 +; 
RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: addi a3, a3, -241 +; RV64I-NEXT: add a1, a2, a1 ; RV64I-NEXT: srli a2, a1, 4 ; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addi a2, a2, -241 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: slli a2, a1, 8 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: slli a2, a1, 16 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: srliw a1, a1, 24 ; RV64I-NEXT: xori a1, a1, 31 -; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: addi a0, a0, -1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -213,8 +213,8 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; RV64ZBB-LABEL: findLastSet_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: clzw a1, a0 -; RV64ZBB-NEXT: xori a1, a1, 31 ; RV64ZBB-NEXT: snez a0, a0 +; RV64ZBB-NEXT: xori a1, a1, 31 ; RV64ZBB-NEXT: addi a0, a0, -1 ; RV64ZBB-NEXT: or a0, a0, a1 ; RV64ZBB-NEXT: ret @@ -232,31 +232,31 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: beqz a0, .LBB4_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srliw a2, a0, 2 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 4 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -286,40 +286,40 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: beqz a0, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: addiw a2, a3, 819 +; RV64I-NEXT: srli a3, a0, 2 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: slli a3, a1, 32 +; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: slli a3, a2, 32 ; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a3, a0, 4 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 
8 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 16 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 32 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: addiw a3, a3, -241 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: slli a2, a3, 32 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a2, a3, a2 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 16 @@ -456,8 +456,8 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64I-NEXT: addi a1, a1, %lo(.LCPI9_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: addi a0, a0, 1 ; RV64I-NEXT: seqz a1, s0 +; RV64I-NEXT: addi a0, a0, 1 ; RV64I-NEXT: addi a1, a1, -1 ; RV64I-NEXT: and a0, a1, a0 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -468,8 +468,8 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64ZBB-LABEL: ffs_i32: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: ctzw a1, a0 -; RV64ZBB-NEXT: addi a1, a1, 1 ; RV64ZBB-NEXT: seqz a0, a0 +; RV64ZBB-NEXT: addi a1, a1, 1 ; RV64ZBB-NEXT: addi a0, a0, -1 ; RV64ZBB-NEXT: and a0, a0, a1 ; RV64ZBB-NEXT: ret @@ -523,17 +523,17 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 @@ -630,21 +630,21 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind { ; RV64I-LABEL: ctpop_i32_load: ; RV64I: # %bb.0: ; RV64I-NEXT: lw a0, 0(a0) -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: addi a1, a2, -241 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 8 ; 
RV64I-NEXT: add a0, a0, a1 @@ -670,39 +670,39 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: srli a2, a0, 1 ; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: lui a4, 209715 +; RV64I-NEXT: srli a5, a1, 1 ; RV64I-NEXT: addiw a3, a3, 1365 ; RV64I-NEXT: and a2, a2, a3 -; RV64I-NEXT: sub a0, a0, a2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a4, a0, a2 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: add a0, a4, a0 -; RV64I-NEXT: srli a4, a0, 4 -; RV64I-NEXT: add a0, a0, a4 -; RV64I-NEXT: lui a4, 61681 -; RV64I-NEXT: addi a4, a4, -241 -; RV64I-NEXT: and a0, a0, a4 -; RV64I-NEXT: slli a5, a0, 8 -; RV64I-NEXT: add a0, a0, a5 -; RV64I-NEXT: slli a5, a0, 16 -; RV64I-NEXT: add a0, a0, a5 -; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: srli a5, a1, 1 ; RV64I-NEXT: and a3, a5, a3 +; RV64I-NEXT: lui a5, 61681 +; RV64I-NEXT: addiw a4, a4, 819 +; RV64I-NEXT: addi a5, a5, -241 +; RV64I-NEXT: sub a0, a0, a2 ; RV64I-NEXT: sub a1, a1, a3 -; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: and a2, a0, a4 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a3, a1, a4 ; RV64I-NEXT: srli a1, a1, 2 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: add a1, a3, a1 -; RV64I-NEXT: srli a2, a1, 4 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: and a0, a0, a4 ; RV64I-NEXT: and a1, a1, a4 -; RV64I-NEXT: slli a2, a1, 8 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: slli a2, a1, 16 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: srli a3, a1, 4 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: and a0, a0, a5 +; RV64I-NEXT: and a1, a1, a5 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: slli a3, a1, 16 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: srliw a0, a0, 24 ; RV64I-NEXT: srliw a1, a1, 24 ; RV64I-NEXT: ret ; @@ -720,11 +720,11 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi a2, a0, -1 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: addi a2, a1, -1 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: sext.w a1, a1 +; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: seqz a1, a1 ; RV64I-NEXT: ret ; @@ -745,11 +745,11 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi a2, a0, -1 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: addi a2, a1, -1 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: sext.w a1, a1 +; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: snez a1, a1 ; RV64I-NEXT: ret ; @@ -758,8 +758,8 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind { ; RV64ZBB-NEXT: cpopw a1, a1 ; RV64ZBB-NEXT: cpopw a0, a0 ; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: xori a0, a0, 1 ; RV64ZBB-NEXT: sltiu a1, a1, 2 +; RV64ZBB-NEXT: xori a0, a0, 1 ; RV64ZBB-NEXT: xori a1, a1, 1 ; RV64ZBB-NEXT: ret %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) @@ -785,8 +785,8 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind { ; RV64ZBB-NEXT: cpopw a1, a1 ; RV64ZBB-NEXT: cpopw a0, a0 ; RV64ZBB-NEXT: addi a0, a0, -1 -; RV64ZBB-NEXT: seqz a0, a0 ; RV64ZBB-NEXT: addi a1, a1, -1 +; RV64ZBB-NEXT: seqz a0, a0 ; RV64ZBB-NEXT: seqz a1, a1 ; RV64ZBB-NEXT: ret %1 = call <2 x 
i32> @llvm.ctpop.v2i32(<2 x i32> %a) @@ -801,11 +801,11 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind { ; RV64I-NEXT: xor a0, a0, a2 ; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: sltu a0, a2, a0 -; RV64I-NEXT: xori a0, a0, 1 ; RV64I-NEXT: addiw a2, a1, -1 ; RV64I-NEXT: xor a1, a1, a2 ; RV64I-NEXT: sext.w a1, a1 ; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: xori a0, a0, 1 ; RV64I-NEXT: xori a1, a1, 1 ; RV64I-NEXT: ret ; @@ -814,8 +814,8 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind { ; RV64ZBB-NEXT: cpopw a1, a1 ; RV64ZBB-NEXT: cpopw a0, a0 ; RV64ZBB-NEXT: addi a0, a0, -1 -; RV64ZBB-NEXT: snez a0, a0 ; RV64ZBB-NEXT: addi a1, a1, -1 +; RV64ZBB-NEXT: snez a0, a0 ; RV64ZBB-NEXT: snez a1, a1 ; RV64ZBB-NEXT: ret %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) @@ -828,28 +828,28 @@ declare i64 @llvm.ctpop.i64(i64) define i64 @ctpop_i64(i64 %a) nounwind { ; RV64I-LABEL: ctpop_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: slli a3, a1, 32 +; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: slli a3, a2, 32 ; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: addiw a3, a3, -241 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: slli a2, a3, 32 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a2, a3, a2 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: slli a1, a0, 8 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 16 @@ -950,49 +950,49 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: srli a2, a0, 1 ; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: lui a4, 209715 +; RV64I-NEXT: lui a5, 61681 ; RV64I-NEXT: addiw a3, a3, 1365 -; RV64I-NEXT: slli a4, a3, 32 -; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: addiw a4, a4, 819 +; RV64I-NEXT: addiw a5, a5, -241 +; RV64I-NEXT: slli a6, a3, 32 +; RV64I-NEXT: add a3, a3, a6 +; RV64I-NEXT: slli a6, a4, 32 +; RV64I-NEXT: add a4, a4, a6 +; RV64I-NEXT: slli a6, a5, 32 +; RV64I-NEXT: add a5, a5, a6 +; RV64I-NEXT: srli a6, a1, 1 ; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: and a3, a6, a3 ; RV64I-NEXT: sub a0, a0, a2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: slli a4, a2, 32 -; RV64I-NEXT: add a2, a2, a4 -; RV64I-NEXT: and a4, a0, a2 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: add a0, a4, a0 -; RV64I-NEXT: srli a4, a0, 4 -; RV64I-NEXT: add a0, a0, a4 -; RV64I-NEXT: lui a4, 61681 -; RV64I-NEXT: addiw a4, a4, -241 -; RV64I-NEXT: slli a5, a4, 32 -; RV64I-NEXT: add a4, a4, a5 -; RV64I-NEXT: and a0, a0, a4 -; RV64I-NEXT: slli a5, a0, 8 -; RV64I-NEXT: add a0, a0, a5 -; RV64I-NEXT: slli a5, a0, 16 -; RV64I-NEXT: add a0, a0, a5 -; RV64I-NEXT: slli a5, a0, 32 -; RV64I-NEXT: add a0, a0, a5 -; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: srli a5, a1, 
1 -; RV64I-NEXT: and a3, a5, a3 ; RV64I-NEXT: sub a1, a1, a3 -; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: and a2, a0, a4 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a3, a1, a4 ; RV64I-NEXT: srli a1, a1, 2 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: add a1, a3, a1 -; RV64I-NEXT: srli a2, a1, 4 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: and a0, a0, a4 ; RV64I-NEXT: and a1, a1, a4 -; RV64I-NEXT: slli a2, a1, 8 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: slli a2, a1, 16 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: srli a3, a1, 4 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: and a0, a0, a5 +; RV64I-NEXT: and a1, a1, a5 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: slli a3, a1, 16 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: slli a2, a0, 32 +; RV64I-NEXT: slli a3, a1, 32 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: srli a1, a1, 56 ; RV64I-NEXT: ret ; @@ -1010,9 +1010,9 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi a2, a0, -1 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: addi a2, a1, -1 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: seqz a1, a1 ; RV64I-NEXT: ret ; @@ -1033,9 +1033,9 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi a2, a0, -1 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: addi a2, a1, -1 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: snez a1, a1 ; RV64I-NEXT: ret ; @@ -1044,8 +1044,8 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind { ; RV64ZBB-NEXT: cpop a1, a1 ; RV64ZBB-NEXT: cpop a0, a0 ; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: xori a0, a0, 1 ; RV64ZBB-NEXT: sltiu a1, a1, 2 +; RV64ZBB-NEXT: xori a0, a0, 1 ; RV64ZBB-NEXT: xori a1, a1, 1 ; RV64ZBB-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) @@ -1069,8 +1069,8 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind { ; RV64ZBB-NEXT: cpop a1, a1 ; RV64ZBB-NEXT: cpop a0, a0 ; RV64ZBB-NEXT: addi a0, a0, -1 -; RV64ZBB-NEXT: seqz a0, a0 ; RV64ZBB-NEXT: addi a1, a1, -1 +; RV64ZBB-NEXT: seqz a0, a0 ; RV64ZBB-NEXT: seqz a1, a1 ; RV64ZBB-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) @@ -1084,10 +1084,10 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind { ; RV64I-NEXT: addi a2, a0, -1 ; RV64I-NEXT: xor a0, a0, a2 ; RV64I-NEXT: sltu a0, a2, a0 -; RV64I-NEXT: xori a0, a0, 1 ; RV64I-NEXT: addi a2, a1, -1 ; RV64I-NEXT: xor a1, a1, a2 ; RV64I-NEXT: sltu a1, a2, a1 +; RV64I-NEXT: xori a0, a0, 1 ; RV64I-NEXT: xori a1, a1, 1 ; RV64I-NEXT: ret ; @@ -1096,8 +1096,8 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind { ; RV64ZBB-NEXT: cpop a1, a1 ; RV64ZBB-NEXT: cpop a0, a0 ; RV64ZBB-NEXT: addi a0, a0, -1 -; RV64ZBB-NEXT: snez a0, a0 ; RV64ZBB-NEXT: addi a1, a1, -1 +; RV64ZBB-NEXT: snez a0, a0 ; RV64ZBB-NEXT: snez a1, a1 ; RV64ZBB-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) @@ -1406,11 +1406,11 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: srliw a3, a0, 24 ; 
RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: srliw a3, a0, 24 -; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: and a2, a0, a2 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: slli a2, a2, 8 ; RV64I-NEXT: slliw a0, a0, 24 ; RV64I-NEXT: or a0, a0, a2 @@ -1432,11 +1432,11 @@ define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: srli a2, a0, 8 ; RV64I-NEXT: lui a3, 16 +; RV64I-NEXT: srliw a4, a0, 24 ; RV64I-NEXT: addi a3, a3, -256 ; RV64I-NEXT: and a2, a2, a3 -; RV64I-NEXT: srliw a4, a0, 24 -; RV64I-NEXT: or a2, a2, a4 ; RV64I-NEXT: and a3, a0, a3 +; RV64I-NEXT: or a2, a2, a4 ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: or a0, a0, a3 @@ -1462,28 +1462,28 @@ define i64 @bswap_i64(i64 %a) { ; RV64I: # %bb.0: ; RV64I-NEXT: srli a1, a0, 40 ; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: srli a3, a0, 56 +; RV64I-NEXT: srli a4, a0, 24 +; RV64I-NEXT: lui a5, 4080 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: srli a3, a0, 24 -; RV64I-NEXT: lui a4, 4080 -; RV64I-NEXT: and a3, a3, a4 -; RV64I-NEXT: srli a5, a0, 8 -; RV64I-NEXT: srliw a5, a5, 24 -; RV64I-NEXT: slli a5, a5, 24 -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: and a4, a0, a4 -; RV64I-NEXT: slli a4, a4, 24 -; RV64I-NEXT: srliw a3, a0, 24 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: srli a3, a0, 8 +; RV64I-NEXT: and a4, a4, a5 +; RV64I-NEXT: srliw a3, a3, 24 +; RV64I-NEXT: slli a3, a3, 24 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: srliw a4, a0, 24 +; RV64I-NEXT: and a5, a0, a5 ; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: slli a2, a2, 40 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a5, a5, 24 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a2, a2, 40 +; RV64I-NEXT: or a1, a3, a1 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll index 4aa6cd42ab099..985837d05caa2 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll @@ -275,8 +275,8 @@ define i64 @pack_i64_allWUsers(i32 signext %0, i32 signext %1, i32 signext %2) { ; RV64I-LABEL: pack_i64_allWUsers: ; RV64I: # %bb.0: ; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a2, a2, 32 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll index da477aa2043cf..a6ef184abe5e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll +++ b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll @@ -25,11 +25,9 @@ define void @foo( %0) { ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 0, e8, m1, tu, ma ; CHECK-NEXT: vslideup.vi v9, v10, 0 +; CHECK-NEXT: vslideup.vi v8, v10, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vmv.x.s s1, v9 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vmv.x.s s2, v8 ; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mv a0, s1 diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll index 
74693e655bf03..163d9145bc362 100644 --- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll @@ -575,12 +575,12 @@ define @vp_abs_nxv16i64( %va, @fv64(ptr %p, i64 %index, i64 %tc) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: lui a0, %hi(.LCPI9_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: lui a0, %hi(.LCPI9_1) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1) -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsext.vf8 v24, v16 -; CHECK-NEXT: vsaddu.vx v16, v24, a1 -; CHECK-NEXT: vmsltu.vx v9, v16, a2 -; CHECK-NEXT: vsext.vf8 v16, v8 -; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vle8.v v17, (a0) ; CHECK-NEXT: lui a0, %hi(.LCPI9_2) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2) -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmsltu.vx v10, v16, a2 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vle8.v v18, (a0) +; CHECK-NEXT: vmsltu.vx v0, v8, a2 +; CHECK-NEXT: vsext.vf8 v8, v16 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vsext.vf8 v8, v17 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vmsltu.vx v17, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v9, 2 +; CHECK-NEXT: vslideup.vi v0, v16, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v10, 4 +; CHECK-NEXT: vslideup.vi v0, v17, 4 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vsext.vf8 v16, v8 -; CHECK-NEXT: vsaddu.vx v8, v16, a1 +; CHECK-NEXT: vsext.vf8 v8, v18 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vi v0, v16, 6 @@ -163,65 +163,60 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: lui a0, %hi(.LCPI10_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: lui a0, %hi(.LCPI10_1) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1) -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vsext.vf8 v16, v8 -; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v8, v16, a2 -; CHECK-NEXT: vsext.vf8 v16, v9 -; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vle8.v v17, (a0) ; CHECK-NEXT: lui a0, %hi(.LCPI10_2) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2) -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmsltu.vx v10, v16, a2 +; CHECK-NEXT: vle8.v v18, (a0) ; CHECK-NEXT: lui a0, %hi(.LCPI10_3) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3) -; CHECK-NEXT: vle8.v v11, (a0) -; CHECK-NEXT: vsext.vf8 v16, v9 -; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v9, v16, a2 -; CHECK-NEXT: vsext.vf8 v16, v11 -; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v11, v16, a2 -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vle8.v v19, (a0) ; CHECK-NEXT: lui a0, %hi(.LCPI10_4) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4) -; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vmsltu.vx v0, v16, a2 +; CHECK-NEXT: vle8.v v20, (a0) ; CHECK-NEXT: lui a0, %hi(.LCPI10_5) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5) -; CHECK-NEXT: vle8.v v13, (a0) -; CHECK-NEXT: vsext.vf8 v16, v12 -; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v12, v16, a2 -; CHECK-NEXT: vsext.vf8 v16, v13 -; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v13, v16, a2 -; 
CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v10, v8, 2 -; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v10, v9, 4 +; CHECK-NEXT: vle8.v v21, (a0) ; CHECK-NEXT: lui a0, %hi(.LCPI10_6) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v11, 6 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vle8.v v22, (a0) +; CHECK-NEXT: vmsltu.vx v0, v8, a2 +; CHECK-NEXT: vsext.vf8 v8, v16 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vsext.vf8 v8, v17 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vmsltu.vx v17, v8, a2 +; CHECK-NEXT: vsext.vf8 v8, v18 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vmsltu.vx v18, v8, a2 +; CHECK-NEXT: vsext.vf8 v8, v19 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vmsltu.vx v19, v8, a2 +; CHECK-NEXT: vsext.vf8 v8, v20 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vmsltu.vx v20, v8, a2 +; CHECK-NEXT: vsext.vf8 v8, v21 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vmsltu.vx v21, v8, a2 +; CHECK-NEXT: vsext.vf8 v8, v22 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vmsltu.vx v22, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v12, 2 +; CHECK-NEXT: vslideup.vi v17, v16, 2 +; CHECK-NEXT: vslideup.vi v0, v20, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v0, v13, 4 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vsext.vf8 v16, v8 -; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v8, v16, a2 +; CHECK-NEXT: vslideup.vi v17, v18, 4 +; CHECK-NEXT: vslideup.vi v0, v21, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v0, v8, 6 +; CHECK-NEXT: vslideup.vi v17, v19, 6 +; CHECK-NEXT: vslideup.vi v0, v22, 6 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vi v0, v10, 8 +; CHECK-NEXT: vslideup.vi v0, v17, 8 ; CHECK-NEXT: ret %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc) ret <128 x i1> %mask diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll index abe1920e43784..9ac2775d30668 100644 --- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll +++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll @@ -16,18 +16,18 @@ define void @test(ptr %addr) { ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb ; CHECK-NEXT: csrrs a1, vlenb, zero -; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: vl1re64.v v8, (a2) +; CHECK-NEXT: vl1re64.v v8, (a0) ; CHECK-NEXT: slli a2, a1, 1 -; CHECK-NEXT: vl1re64.v v9, (a0) -; CHECK-NEXT: add a0, a0, a2 -; CHECK-NEXT: vl1re64.v v10, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs1r.v v9, (a0) -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vs1r.v v10, (a2) +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vl1re64.v v9, (a3) +; CHECK-NEXT: addi a3, sp, 16 ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vs1r.v v8, (a0) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1re64.v v10, (a0) +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: vs1r.v v8, (a3) +; CHECK-NEXT: vs1r.v v9, (a2) +; CHECK-NEXT: vs1r.v v10, (a1) ; CHECK-NEXT: csrrs a0, vlenb, zero ; CHECK-NEXT: slli a1, a0, 1 ; CHECK-NEXT: add a0, a1, a0 
diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll index 9790339667915..fb25d4e15e40e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll +++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll @@ -20,8 +20,8 @@ define @test(ptr %addr, i64 %vl) { ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: vl1re64.v v9, (a0) ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs1r.v v8, (a0) ; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vs1r.v v8, (a0) ; CHECK-NEXT: vs1r.v v9, (a2) ; CHECK-NEXT: vl1re64.v v8, (a2) ; CHECK-NEXT: vl1re64.v v9, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll index 4cd1b045529e3..853f937bbd230 100644 --- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll +++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll @@ -14,8 +14,8 @@ define target("riscv.vector.tuple", , 5) @load_store_m1x5(targe ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs1r.v v8, (a0) ; CHECK-NEXT: csrrs a1, vlenb, zero +; CHECK-NEXT: vs1r.v v8, (a0) ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: vs1r.v v9, (a2) ; CHECK-NEXT: add a3, a2, a1 @@ -57,8 +57,8 @@ define target("riscv.vector.tuple", , 2) @load_store_m2x2(targ ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: csrrs a1, vlenb, zero +; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: vs2r.v v10, (a1) @@ -92,8 +92,8 @@ define target("riscv.vector.tuple", , 2) @load_store_m4x2(targ ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs4r.v v8, (a0) ; CHECK-NEXT: csrrs a1, vlenb, zero +; CHECK-NEXT: vs4r.v v8, (a0) ; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: vs4r.v v12, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll index 43be8feece23c..7fe6bd24a2552 100644 --- a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll +++ b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll @@ -774,8 +774,8 @@ define void @lmul_16_align() nounwind { ; NOZBA-NEXT: csrr a0, vlenb ; NOZBA-NEXT: add a0, sp, a0 ; NOZBA-NEXT: addi a0, a0, 128 -; NOZBA-NEXT: vs8r.v v8, (a0) ; NOZBA-NEXT: csrr a1, vlenb +; NOZBA-NEXT: vs8r.v v8, (a0) ; NOZBA-NEXT: slli a1, a1, 3 ; NOZBA-NEXT: add a0, a0, a1 ; NOZBA-NEXT: vs8r.v v8, (a0) @@ -805,8 +805,8 @@ define void @lmul_16_align() nounwind { ; ZBA-NEXT: csrr a0, vlenb ; ZBA-NEXT: add a0, sp, a0 ; ZBA-NEXT: addi a0, a0, 128 -; ZBA-NEXT: vs8r.v v8, (a0) ; ZBA-NEXT: csrr a1, vlenb +; ZBA-NEXT: vs8r.v v8, (a0) ; ZBA-NEXT: sh3add a0, a1, a0 ; ZBA-NEXT: vs8r.v v8, (a0) ; ZBA-NEXT: vsetvli a0, zero, e64, m1, ta, ma @@ -837,8 +837,8 @@ define void @lmul_16_align() nounwind { ; NOMUL-NEXT: csrr a0, vlenb ; NOMUL-NEXT: add a0, sp, a0 ; NOMUL-NEXT: addi a0, a0, 128 -; NOMUL-NEXT: vs8r.v v8, (a0) ; NOMUL-NEXT: csrr a1, vlenb +; NOMUL-NEXT: vs8r.v v8, (a0) ; 
NOMUL-NEXT: slli a1, a1, 3 ; NOMUL-NEXT: add a0, a0, a1 ; NOMUL-NEXT: vs8r.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll index 29d19ed38bbed..1ed84316d4484 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll @@ -10,17 +10,17 @@ define @bitreverse_nxv1i8( %va) { ; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-NEXT: vsll.vi v9, v8, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -41,17 +41,17 @@ define @bitreverse_nxv2i8( %va) { ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-NEXT: vsll.vi v9, v8, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -72,17 +72,17 @@ define @bitreverse_nxv4i8( %va) { ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-NEXT: vsll.vi v9, v8, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -103,17 +103,17 @@ define @bitreverse_nxv8i8( %va) { ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-NEXT: vsll.vi v9, v8, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -134,17 +134,17 @@ define @bitreverse_nxv16i8( %va) { ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-NEXT: vsll.vi v10, v8, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; 
CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret @@ -165,17 +165,17 @@ define @bitreverse_nxv32i8( %va) { ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vsll.vi v12, v8, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vsrl.vi v12, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: ret @@ -196,17 +196,17 @@ define @bitreverse_nxv64i8( %va) { ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vsll.vi v16, v8, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vsrl.vi v16, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: ret @@ -227,26 +227,26 @@ define @bitreverse_nxv1i16( %va) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -267,26 +267,26 @@ define @bitreverse_nxv2i16( %va) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: 
vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -307,26 +307,26 @@ define @bitreverse_nxv4i16( %va) { ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -347,26 +347,26 @@ define @bitreverse_nxv8i16( %va) { ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret @@ -387,26 +387,26 @@ define @bitreverse_nxv16i16( %va) { ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v12, v8 ; 
CHECK-NEXT: vsrl.vi v12, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: ret @@ -427,26 +427,26 @@ define @bitreverse_nxv32i16( %va) { ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: ret @@ -467,34 +467,34 @@ define @bitreverse_nxv1i32( %va) { ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 -; CHECK-NEXT: vand.vx v10, v8, a0 -; CHECK-NEXT: vsll.vi v10, v10, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vsll.vi v10, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -515,34 +515,34 @@ define @bitreverse_nxv2i32( %va) { ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; 
CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 -; CHECK-NEXT: vand.vx v10, v8, a0 -; CHECK-NEXT: vsll.vi v10, v10, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vsll.vi v10, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -563,34 +563,34 @@ define @bitreverse_nxv4i32( %va) { ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: vor.vv v10, v10, v12 -; CHECK-NEXT: vand.vx v12, v8, a0 -; CHECK-NEXT: vsll.vi v12, v12, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 +; CHECK-NEXT: vsll.vi v12, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret @@ -611,34 +611,34 @@ define @bitreverse_nxv8i32( %va) { ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: vor.vv v12, v12, v16 -; CHECK-NEXT: vand.vx v16, v8, a0 -; CHECK-NEXT: vsll.vi v16, v16, 8 -; CHECK-NEXT: 
vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 4 +; CHECK-NEXT: vsll.vi v16, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v16, v8 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: ret @@ -659,34 +659,34 @@ define @bitreverse_nxv16i32( %va) { ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v24, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsrl.vi v24, v8, 24 ; CHECK-NEXT: vor.vv v16, v16, v24 -; CHECK-NEXT: vand.vx v24, v8, a0 -; CHECK-NEXT: vsll.vi v24, v24, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v24 -; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 4 +; CHECK-NEXT: vsll.vi v24, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v24, v8 +; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: ret @@ -707,65 +707,65 @@ define @bitreverse_nxv1i64( %va) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a0, 1044480 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: vsetvli a4, zero, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 24 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: li a1, 40 ; RV32-NEXT: vsrl.vx v10, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v10, v10, a2 -; 
RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vsrl.vi v10, v8, 24 -; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v11, (a3), zero -; RV32-NEXT: lui a3, 4080 -; RV32-NEXT: vand.vx v10, v10, a3 +; RV32-NEXT: vsrl.vx v11, v8, a2 +; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: vsll.vx v12, v8, a1 +; RV32-NEXT: vand.vx v11, v11, a0 +; RV32-NEXT: vlse64.v v13, (a5), zero +; RV32-NEXT: vor.vv v10, v11, v10 +; RV32-NEXT: vand.vx v11, v8, a0 +; RV32-NEXT: vsll.vx v11, v11, a2 +; RV32-NEXT: vor.vv v11, v12, v11 ; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vand.vv v12, v12, v11 -; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vsll.vx v10, v8, a0 -; RV32-NEXT: vand.vx v12, v8, a2 -; RV32-NEXT: vsll.vx v12, v12, a1 -; RV32-NEXT: vor.vv v10, v10, v12 -; RV32-NEXT: vand.vx v12, v8, a3 -; RV32-NEXT: vsll.vi v12, v12, 24 -; RV32-NEXT: vand.vv v8, v8, v11 -; RV32-NEXT: vsll.vi v8, v8, 8 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vand.vx v9, v9, a4 +; RV32-NEXT: vand.vv v12, v12, v13 +; RV32-NEXT: vor.vv v9, v12, v9 ; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: lui a2, 349525 +; RV32-NEXT: vand.vv v12, v8, v13 +; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v9, v9, v10 +; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsll.vi v12, v12, 8 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32-NEXT: vor.vv v8, v11, v8 +; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v11, a2 +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 ; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v9, v9, v12 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vand.vv v9, v9, v11 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -774,60 +774,60 @@ define @bitreverse_nxv1i64( %va) { ; ; RV64-LABEL: bitreverse_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV64-NEXT: vsrl.vx v9, v8, a0 -; RV64-NEXT: li a1, 40 -; RV64-NEXT: vsrl.vx v10, v8, a1 +; RV64-NEXT: li a1, 56 +; RV64-NEXT: li a0, 40 ; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, -256 -; RV64-NEXT: vand.vx v10, v10, a2 -; RV64-NEXT: vor.vv v9, v10, v9 -; RV64-NEXT: vsrl.vi v10, v8, 24 
+; RV64-NEXT: vsetvli a3, zero, e64, m1, ta, ma +; RV64-NEXT: vsrl.vi v9, v8, 24 ; RV64-NEXT: lui a3, 4080 -; RV64-NEXT: vand.vx v10, v10, a3 +; RV64-NEXT: vsrl.vx v10, v8, a1 +; RV64-NEXT: vsrl.vx v11, v8, a0 +; RV64-NEXT: addiw a2, a2, -256 +; RV64-NEXT: vand.vx v11, v11, a2 +; RV64-NEXT: vor.vv v10, v11, v10 ; RV64-NEXT: vsrl.vi v11, v8, 8 ; RV64-NEXT: li a4, 255 +; RV64-NEXT: vand.vx v9, v9, a3 ; RV64-NEXT: slli a4, a4, 24 ; RV64-NEXT: vand.vx v11, v11, a4 +; RV64-NEXT: vor.vv v9, v11, v9 +; RV64-NEXT: vand.vx v11, v8, a3 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: vor.vv v9, v9, v10 +; RV64-NEXT: vand.vx v10, v8, a4 +; RV64-NEXT: lui a4, 209715 +; RV64-NEXT: vsll.vi v11, v11, 24 +; RV64-NEXT: vsll.vi v10, v10, 8 ; RV64-NEXT: vor.vv v10, v11, v10 -; RV64-NEXT: vor.vv v9, v10, v9 -; RV64-NEXT: vand.vx v10, v8, a3 -; RV64-NEXT: vsll.vi v10, v10, 24 -; RV64-NEXT: vand.vx v11, v8, a4 -; RV64-NEXT: vsll.vi v11, v11, 8 -; RV64-NEXT: vor.vv v10, v10, v11 -; RV64-NEXT: vsll.vx v11, v8, a0 +; RV64-NEXT: vsll.vx v11, v8, a1 +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 819 +; RV64-NEXT: addiw a1, a1, 1365 ; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vsll.vx v8, v8, a1 +; RV64-NEXT: slli a2, a3, 32 +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: slli a0, a4, 32 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: slli a3, a1, 32 +; RV64-NEXT: add a0, a4, a0 +; RV64-NEXT: add a1, a1, a3 ; RV64-NEXT: vor.vv v8, v11, v8 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v9, v9, a2 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v9, v9, a1 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: ret @@ -848,65 +848,65 @@ define @bitreverse_nxv2i64( %va) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a0, 1044480 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: vsetvli a4, zero, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV32-NEXT: vsrl.vx v10, v8, a0 -; RV32-NEXT: li a1, 40 -; RV32-NEXT: vsrl.vx v12, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v12, v12, a2 -; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vsrl.vi v12, v8, 24 -; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v14, (a3), zero -; RV32-NEXT: lui a3, 4080 -; RV32-NEXT: vand.vx v12, v12, a3 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v16, v16, v14 -; RV32-NEXT: vor.vv v12, v16, v12 -; RV32-NEXT: vor.vv v10, 
v12, v10 -; RV32-NEXT: vsll.vx v12, v8, a0 -; RV32-NEXT: vand.vx v16, v8, a2 -; RV32-NEXT: vsll.vx v16, v16, a1 -; RV32-NEXT: vor.vv v12, v12, v16 -; RV32-NEXT: vand.vx v16, v8, a3 -; RV32-NEXT: vsll.vi v16, v16, 24 -; RV32-NEXT: vand.vv v8, v8, v14 -; RV32-NEXT: vsll.vi v8, v8, 8 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vsrl.vx v10, v8, a1 +; RV32-NEXT: vsrl.vx v12, v8, a2 +; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: vsll.vx v18, v8, a1 +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vlse64.v v14, (a5), zero +; RV32-NEXT: vor.vv v12, v12, v10 +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsll.vx v10, v10, a2 +; RV32-NEXT: vor.vv v10, v18, v10 +; RV32-NEXT: vsrl.vi v18, v8, 8 +; RV32-NEXT: vand.vx v16, v16, a4 +; RV32-NEXT: vand.vv v18, v18, v14 +; RV32-NEXT: vor.vv v16, v18, v16 ; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: lui a2, 349525 +; RV32-NEXT: vand.vv v14, v8, v14 +; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v12, v16, v12 +; RV32-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v16, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 2 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vsll.vi v14, v14, 8 +; RV32-NEXT: vor.vv v8, v8, v14 +; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v14, a1 ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a2 ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsll.vi v8, v8, 4 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vand.vv v12, v12, v14 +; RV32-NEXT: vsll.vi v8, v8, 2 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v12, v10 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -915,60 +915,60 @@ define @bitreverse_nxv2i64( %va) { ; ; RV64-LABEL: bitreverse_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV64-NEXT: vsrl.vx v10, v8, a0 -; RV64-NEXT: li a1, 40 -; RV64-NEXT: vsrl.vx v12, v8, a1 +; RV64-NEXT: li a1, 56 +; RV64-NEXT: li a0, 40 ; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, -256 -; RV64-NEXT: vand.vx v12, v12, a2 -; RV64-NEXT: vor.vv v10, v12, v10 -; RV64-NEXT: vsrl.vi v12, v8, 24 +; RV64-NEXT: vsetvli a3, zero, e64, m2, ta, ma +; RV64-NEXT: vsrl.vi v10, v8, 24 ; 
RV64-NEXT: lui a3, 4080 -; RV64-NEXT: vand.vx v12, v12, a3 +; RV64-NEXT: vsrl.vx v12, v8, a1 +; RV64-NEXT: vsrl.vx v14, v8, a0 +; RV64-NEXT: addiw a2, a2, -256 +; RV64-NEXT: vand.vx v14, v14, a2 +; RV64-NEXT: vor.vv v12, v14, v12 ; RV64-NEXT: vsrl.vi v14, v8, 8 ; RV64-NEXT: li a4, 255 +; RV64-NEXT: vand.vx v10, v10, a3 ; RV64-NEXT: slli a4, a4, 24 ; RV64-NEXT: vand.vx v14, v14, a4 +; RV64-NEXT: vor.vv v10, v14, v10 +; RV64-NEXT: vand.vx v14, v8, a3 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: vor.vv v10, v10, v12 +; RV64-NEXT: vand.vx v12, v8, a4 +; RV64-NEXT: lui a4, 209715 +; RV64-NEXT: vsll.vi v14, v14, 24 +; RV64-NEXT: vsll.vi v12, v12, 8 ; RV64-NEXT: vor.vv v12, v14, v12 -; RV64-NEXT: vor.vv v10, v12, v10 -; RV64-NEXT: vand.vx v12, v8, a3 -; RV64-NEXT: vsll.vi v12, v12, 24 -; RV64-NEXT: vand.vx v14, v8, a4 -; RV64-NEXT: vsll.vi v14, v14, 8 -; RV64-NEXT: vor.vv v12, v12, v14 -; RV64-NEXT: vsll.vx v14, v8, a0 +; RV64-NEXT: vsll.vx v14, v8, a1 +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 819 +; RV64-NEXT: addiw a1, a1, 1365 ; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vsll.vx v8, v8, a1 +; RV64-NEXT: slli a2, a3, 32 +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: slli a0, a4, 32 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: slli a3, a1, 32 +; RV64-NEXT: add a0, a4, a0 +; RV64-NEXT: add a1, a1, a3 ; RV64-NEXT: vor.vv v8, v14, v8 ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v10, v10, a2 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v10, v10, a0 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v10, v10, a1 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: ret @@ -989,65 +989,65 @@ define @bitreverse_nxv4i64( %va) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a0, 1044480 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: vsetvli a4, zero, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v24, v8, 24 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV32-NEXT: vsrl.vx v12, v8, a0 -; RV32-NEXT: li a1, 40 -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v16, v16, a2 -; RV32-NEXT: vor.vv v12, v16, v12 -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v20, (a3), zero -; RV32-NEXT: lui a3, 4080 -; RV32-NEXT: vand.vx v16, v16, a3 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v20 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vor.vv v12, v16, v12 -; RV32-NEXT: vsll.vx v16, v8, a0 -; RV32-NEXT: 
vand.vx v24, v8, a2 -; RV32-NEXT: vsll.vx v24, v24, a1 -; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: vand.vx v24, v8, a3 -; RV32-NEXT: vsll.vi v24, v24, 24 -; RV32-NEXT: vand.vv v8, v8, v20 -; RV32-NEXT: vsll.vi v8, v8, 8 -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vsrl.vx v12, v8, a1 +; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: vsll.vx v28, v8, a1 +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vlse64.v v20, (a5), zero +; RV32-NEXT: vor.vv v16, v16, v12 +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsll.vx v12, v12, a2 +; RV32-NEXT: vor.vv v12, v28, v12 +; RV32-NEXT: vsrl.vi v28, v8, 8 +; RV32-NEXT: vand.vx v24, v24, a4 +; RV32-NEXT: vand.vv v28, v28, v20 +; RV32-NEXT: vor.vv v24, v28, v24 ; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: lui a2, 349525 +; RV32-NEXT: vand.vv v20, v8, v20 +; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a0 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vsetvli a3, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v24, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 2 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a0 +; RV32-NEXT: vsll.vi v20, v20, 8 +; RV32-NEXT: vor.vv v8, v8, v20 +; RV32-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v20, a1 ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a0 +; RV32-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v12, a2 ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsll.vi v8, v8, 4 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v20 +; RV32-NEXT: vand.vv v16, v16, v20 +; RV32-NEXT: vsll.vi v8, v8, 2 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v12, v16, v12 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1056,60 +1056,60 @@ define @bitreverse_nxv4i64( %va) { ; ; RV64-LABEL: bitreverse_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV64-NEXT: vsrl.vx v12, v8, a0 -; RV64-NEXT: li a1, 40 -; RV64-NEXT: vsrl.vx v16, v8, a1 +; RV64-NEXT: li a1, 56 +; RV64-NEXT: li a0, 40 ; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, -256 -; RV64-NEXT: vand.vx v16, v16, a2 -; RV64-NEXT: vor.vv v12, v16, v12 +; RV64-NEXT: vsetvli a3, zero, e64, m4, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 24 ; RV64-NEXT: lui a3, 4080 -; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vsrl.vx v12, v8, a1 +; 
RV64-NEXT: vsrl.vx v20, v8, a0 +; RV64-NEXT: addiw a2, a2, -256 +; RV64-NEXT: vand.vx v20, v20, a2 +; RV64-NEXT: vor.vv v12, v20, v12 ; RV64-NEXT: vsrl.vi v20, v8, 8 ; RV64-NEXT: li a4, 255 +; RV64-NEXT: vand.vx v16, v16, a3 ; RV64-NEXT: slli a4, a4, 24 ; RV64-NEXT: vand.vx v20, v20, a4 -; RV64-NEXT: vor.vv v16, v20, v16 -; RV64-NEXT: vor.vv v12, v16, v12 +; RV64-NEXT: vor.vv v20, v20, v16 ; RV64-NEXT: vand.vx v16, v8, a3 -; RV64-NEXT: vsll.vi v16, v16, 24 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: vor.vv v12, v20, v12 ; RV64-NEXT: vand.vx v20, v8, a4 +; RV64-NEXT: lui a4, 209715 +; RV64-NEXT: vsll.vi v16, v16, 24 ; RV64-NEXT: vsll.vi v20, v20, 8 ; RV64-NEXT: vor.vv v16, v16, v20 -; RV64-NEXT: vsll.vx v20, v8, a0 +; RV64-NEXT: vsll.vx v20, v8, a1 +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 819 +; RV64-NEXT: addiw a1, a1, 1365 ; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vsll.vx v8, v8, a1 +; RV64-NEXT: slli a2, a3, 32 +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: slli a0, a4, 32 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: slli a3, a1, 32 +; RV64-NEXT: add a0, a4, a0 +; RV64-NEXT: add a1, a1, a3 ; RV64-NEXT: vor.vv v8, v20, v8 ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v12, v12, a2 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v12, v12, a1 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: ret @@ -1130,80 +1130,87 @@ define @bitreverse_nxv8i64( %va) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: sub sp, sp, a0 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a0, 1044480 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v8, a0 -; RV32-NEXT: li a1, 40 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v24, a2 +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: vsll.vx v0, v8, a1 +; RV32-NEXT: vand.vx v24, v24, a0 ; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: addi a3, sp, 16 
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v0, v8, 24 -; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: lui a3, 4080 -; RV32-NEXT: vand.vx v0, v0, a3 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsll.vx v16, v16, a2 +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vlse64.v v0, (a5), zero +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a4 ; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a2 -; RV32-NEXT: vsll.vx v0, v0, a1 -; RV32-NEXT: vsll.vx v24, v8, a0 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vand.vx v8, v8, a3 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v16, v24 +; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v16, v16, 8 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: lui a2, 349525 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a0 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a0 +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 ; 
RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -1212,60 +1219,60 @@ define @bitreverse_nxv8i64( %va) { ; ; RV64-LABEL: bitreverse_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vsrl.vx v16, v8, a0 -; RV64-NEXT: li a1, 40 -; RV64-NEXT: vsrl.vx v24, v8, a1 +; RV64-NEXT: li a1, 56 +; RV64-NEXT: li a0, 40 ; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, -256 -; RV64-NEXT: vand.vx v24, v24, a2 -; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v8, 24 ; RV64-NEXT: lui a3, 4080 -; RV64-NEXT: vand.vx v24, v24, a3 +; RV64-NEXT: vsrl.vx v16, v8, a1 +; RV64-NEXT: vsrl.vx v0, v8, a0 +; RV64-NEXT: addiw a2, a2, -256 +; RV64-NEXT: vand.vx v0, v0, a2 +; RV64-NEXT: vor.vv v16, v0, v16 ; RV64-NEXT: vsrl.vi v0, v8, 8 ; RV64-NEXT: li a4, 255 +; RV64-NEXT: vand.vx v24, v24, a3 ; RV64-NEXT: slli a4, a4, 24 ; RV64-NEXT: vand.vx v0, v0, a4 -; RV64-NEXT: vor.vv v24, v0, v24 -; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vor.vv v0, v0, v24 ; RV64-NEXT: vand.vx v24, v8, a3 -; RV64-NEXT: vsll.vi v24, v24, 24 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: vor.vv v16, v0, v16 ; RV64-NEXT: vand.vx v0, v8, a4 +; RV64-NEXT: lui a4, 209715 +; RV64-NEXT: vsll.vi v24, v24, 24 ; RV64-NEXT: vsll.vi v0, v0, 8 ; RV64-NEXT: vor.vv v24, v24, v0 -; RV64-NEXT: vsll.vx v0, v8, a0 +; RV64-NEXT: vsll.vx v0, v8, a1 +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 819 +; RV64-NEXT: addiw a1, a1, 1365 ; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vsll.vx v8, v8, a1 +; RV64-NEXT: slli a2, a3, 32 +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: slli a0, a4, 32 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: slli a3, a1, 32 +; RV64-NEXT: add a0, a4, a0 +; RV64-NEXT: add a1, a1, a3 ; RV64-NEXT: vor.vv v8, v0, v8 ; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v16, v16, a2 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v16, v16, a1 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll index 8abe35bf1d97e..66a1178cddb66 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll @@ -15,18 +15,18 @@ define @vp_bitreverse_nxv1i8( %va, @vp_bitreverse_nxv1i8_unmasked( %va, i ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; 
CHECK-NEXT: vand.vi v9, v8, 15 -; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -81,18 +81,18 @@ define @vp_bitreverse_nxv2i8( %va, @vp_bitreverse_nxv2i8_unmasked( %va, i ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vand.vi v9, v8, 15 -; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -147,18 +147,18 @@ define @vp_bitreverse_nxv4i8( %va, @vp_bitreverse_nxv4i8_unmasked( %va, i ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vand.vi v9, v8, 15 -; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -213,18 +213,18 @@ define @vp_bitreverse_nxv8i8( %va, @vp_bitreverse_nxv8i8_unmasked( %va, i ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vand.vi v9, v8, 15 -; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -279,18 +279,18 @@ define @vp_bitreverse_nxv16i8( %va, @vp_bitreverse_nxv16i8_unmasked( %va ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vand.vi v10, v8, 15 -; CHECK-NEXT: vsll.vi v10, v10, 4 ; 
CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsll.vi v10, v10, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret @@ -345,18 +345,18 @@ define @vp_bitreverse_nxv32i8( %va, @vp_bitreverse_nxv32i8_unmasked( %va ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; CHECK-NEXT: vand.vi v12, v8, 15 -; CHECK-NEXT: vsll.vi v12, v12, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsll.vi v12, v12, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vsrl.vi v12, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: ret @@ -411,18 +411,18 @@ define @vp_bitreverse_nxv64i8( %va, @vp_bitreverse_nxv64i8_unmasked( %va ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vand.vi v16, v8, 15 -; CHECK-NEXT: vsll.vi v16, v16, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsll.vi v16, v16, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vsrl.vi v16, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: ret @@ -477,25 +477,25 @@ define @vp_bitreverse_nxv1i16( %va, @vp_bitreverse_nxv1i16_unmasked( %va ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; 
CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -557,25 +557,25 @@ define @vp_bitreverse_nxv2i16( %va, @vp_bitreverse_nxv2i16_unmasked( %va ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -637,25 +637,25 @@ define @vp_bitreverse_nxv4i16( %va, @vp_bitreverse_nxv4i16_unmasked( %va ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -717,25 +717,25 @@ define @vp_bitreverse_nxv8i16( %va, @vp_bitreverse_nxv8i16_unmasked( %va ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v10, 
v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret @@ -797,25 +797,25 @@ define @vp_bitreverse_nxv16i16( %va, @vp_bitreverse_nxv16i16_unmasked( ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: ret @@ -877,25 +877,25 @@ define @vp_bitreverse_nxv32i16( %va, @vp_bitreverse_nxv32i16_unmasked( ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: ret @@ -963,27 +963,27 @@ define @vp_bitreverse_nxv1i32( %va, @vp_bitreverse_nxv1i32_unmasked( %va ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 -; CHECK-NEXT: vand.vx v10, v8, a0 -; CHECK-NEXT: vsll.vi v10, v10, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vsll.vi v10, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; 
CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -1059,27 +1059,27 @@ define @vp_bitreverse_nxv2i32( %va, @vp_bitreverse_nxv2i32_unmasked( %va ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 -; CHECK-NEXT: vand.vx v10, v8, a0 -; CHECK-NEXT: vsll.vi v10, v10, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vsll.vi v10, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -1155,27 +1155,27 @@ define @vp_bitreverse_nxv4i32( %va, @vp_bitreverse_nxv4i32_unmasked( %va ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: vor.vv v10, v10, v12 -; CHECK-NEXT: vand.vx v12, v8, a0 -; CHECK-NEXT: vsll.vi v12, v12, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 +; CHECK-NEXT: vsll.vi v12, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, 
a0, 819 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret @@ -1251,27 +1251,27 @@ define @vp_bitreverse_nxv8i32( %va, @vp_bitreverse_nxv8i32_unmasked( %va ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: vor.vv v12, v12, v16 -; CHECK-NEXT: vand.vx v16, v8, a0 -; CHECK-NEXT: vsll.vi v16, v16, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 4 +; CHECK-NEXT: vsll.vi v16, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v16, v8 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: ret @@ -1347,27 +1347,27 @@ define @vp_bitreverse_nxv16i32( %va, @vp_bitreverse_nxv16i32_unmasked( ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v24, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsrl.vi v24, v8, 24 ; CHECK-NEXT: vor.vv v16, v16, v24 -; CHECK-NEXT: vand.vx v24, v8, a0 -; CHECK-NEXT: vsll.vi v24, v24, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v24 -; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 4 +; CHECK-NEXT: vsll.vi v24, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v24, v8 +; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 
; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v16, v16, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: ret @@ -1437,68 +1437,67 @@ define @vp_bitreverse_nxv1i64( %va, @vp_bitreverse_nxv1i64( %va, @vp_bitreverse_nxv1i64_unmasked( %va ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsll.vx v9, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v10, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v10, v10, a3 -; RV32-NEXT: vor.vv v9, v9, v10 -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vsetvli a5, zero, e64, m1, ta, ma -; RV32-NEXT: vlse64.v v10, (a4), zero -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vx v11, v8, a4 -; RV32-NEXT: vsll.vi v11, v11, 24 -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vor.vv v11, v11, v12 -; RV32-NEXT: vor.vv v9, v9, v11 -; RV32-NEXT: vsrl.vx v11, v8, a1 -; RV32-NEXT: vsrl.vx v12, v8, a3 -; RV32-NEXT: vand.vx v12, v12, a2 +; RV32-NEXT: vsll.vx v10, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v11, v8, a2 +; RV32-NEXT: vsrl.vx v12, v8, a4 +; RV32-NEXT: vand.vx v13, v8, a1 +; RV32-NEXT: vand.vx v12, v12, a1 ; RV32-NEXT: vor.vv v11, v12, v11 -; RV32-NEXT: vsrl.vi v12, v8, 24 -; RV32-NEXT: vand.vx v12, v12, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v11 -; RV32-NEXT: vor.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v12, (a6), zero +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vx v13, v13, a4 +; RV32-NEXT: vor.vv v10, v10, v13 +; RV32-NEXT: vsrl.vi v13, v8, 8 +; RV32-NEXT: vand.vx v9, v9, a5 +; RV32-NEXT: vand.vv v13, v13, v12 +; RV32-NEXT: vor.vv v9, v13, v9 ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 349525 +; RV32-NEXT: vand.vv v12, v8, v12 +; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: addi a3, a3, 1365 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v12, v12, 8 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vor.vv v9, v9, v11 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v11, a2 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a3 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v9, v9, v12 ; RV32-NEXT: vsll.vi 
v8, v8, 4 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vand.vv v9, v9, v11 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1650,59 +1649,59 @@ define @vp_bitreverse_nxv1i64_unmasked( %va ; RV64-LABEL: vp_bitreverse_nxv1i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vand.vx v9, v8, a1 -; RV64-NEXT: vsll.vi v9, v9, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsll.vi v10, v10, 8 -; RV64-NEXT: vor.vv v9, v9, v10 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v10, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v11, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v11, v11, a4 -; RV64-NEXT: vor.vv v10, v10, v11 +; RV64-NEXT: vsrl.vi v9, v8, 24 +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v11, v8, a3 +; RV64-NEXT: vsrl.vx v12, v8, a5 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vor.vv v11, v12, v11 +; RV64-NEXT: vand.vx v12, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vsll.vi v12, v12, 24 +; RV64-NEXT: vand.vx v10, v10, a2 ; RV64-NEXT: vor.vv v9, v10, v9 -; RV64-NEXT: vsrl.vx v10, v8, a2 -; RV64-NEXT: vsrl.vx v11, v8, a4 -; RV64-NEXT: vand.vx v11, v11, a3 -; RV64-NEXT: vor.vv v10, v11, v10 -; RV64-NEXT: vsrl.vi v11, v8, 24 -; RV64-NEXT: vand.vx v11, v11, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v10, v8, a2 +; RV64-NEXT: vsll.vi v10, v10, 8 +; RV64-NEXT: vor.vv v10, v12, v10 +; RV64-NEXT: vsll.vx v12, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v11 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vor.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: vor.vv v9, v9, v11 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: 
vand.vx v9, v9, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v9, v9, a1 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v9, v9, a2 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: ret @@ -1723,68 +1722,67 @@ define @vp_bitreverse_nxv2i64( %va, @vp_bitreverse_nxv2i64( %va, @vp_bitreverse_nxv2i64_unmasked( %va ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v14, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsll.vx v12, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v10, v8, a2 +; RV32-NEXT: vsrl.vx v16, v8, a4 +; RV32-NEXT: vand.vx v18, v8, a1 +; RV32-NEXT: vand.vx v16, v16, a1 +; RV32-NEXT: vor.vv v10, v16, v10 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v16, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsll.vx v10, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v12, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v12, v12, a3 -; RV32-NEXT: vor.vv v10, v10, v12 -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vsetvli a5, zero, e64, m2, ta, ma -; RV32-NEXT: vlse64.v v12, (a4), zero -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vx v14, v8, a4 -; RV32-NEXT: vsll.vi v14, v14, 24 -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vor.vv v14, v14, v16 -; RV32-NEXT: vor.vv v10, v10, v14 -; RV32-NEXT: vsrl.vx v14, v8, a1 -; RV32-NEXT: vsrl.vx v16, v8, a3 -; RV32-NEXT: vand.vx v16, v16, a2 -; RV32-NEXT: vor.vv v14, v16, v14 -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vor.vv v8, v8, v14 -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vsll.vx v18, v18, a4 +; RV32-NEXT: vor.vv v12, v12, v18 +; RV32-NEXT: vsrl.vi v18, v8, 8 +; RV32-NEXT: vand.vx v14, v14, a5 +; RV32-NEXT: vand.vv v18, v18, v16 +; RV32-NEXT: vor.vv v14, v18, v14 ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 349525 +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: addi a3, a3, 1365 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsetvli a4, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vor.vv v10, v14, v10 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v14, a2 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a3 +; 
RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v10, v10, v16 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v10, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vand.vv v10, v10, v14 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1936,59 +1934,59 @@ define @vp_bitreverse_nxv2i64_unmasked( %va ; RV64-LABEL: vp_bitreverse_nxv2i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vand.vx v10, v8, a1 -; RV64-NEXT: vsll.vi v10, v10, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsll.vi v12, v12, 8 -; RV64-NEXT: vor.vv v10, v10, v12 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v12, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v14, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v14, v14, a4 -; RV64-NEXT: vor.vv v12, v12, v14 -; RV64-NEXT: vor.vv v10, v12, v10 -; RV64-NEXT: vsrl.vx v12, v8, a2 -; RV64-NEXT: vsrl.vx v14, v8, a4 -; RV64-NEXT: vand.vx v14, v14, a3 +; RV64-NEXT: vsrl.vi v12, v8, 24 +; RV64-NEXT: vsrl.vi v14, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v10, v8, a3 +; RV64-NEXT: vsrl.vx v16, v8, a5 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vor.vv v10, v16, v10 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v12, v12, a1 +; RV64-NEXT: vsll.vi v16, v16, 24 +; RV64-NEXT: vand.vx v14, v14, a2 ; RV64-NEXT: vor.vv v12, v14, v12 -; RV64-NEXT: vsrl.vi v14, v8, 24 -; RV64-NEXT: vand.vx v14, v14, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v14, v8, a2 +; RV64-NEXT: vsll.vi v14, v14, 8 +; RV64-NEXT: vor.vv v14, v16, v14 +; RV64-NEXT: vsll.vx v16, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v14 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vor.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: vor.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v14 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v10, v10, 
a0 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v10, v10, a1 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v10, v10, a2 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: ret @@ -2009,70 +2007,69 @@ define @vp_bitreverse_nxv4i64( %va, @vp_bitreverse_nxv4i64( %va, @vp_bitreverse_nxv4i64_unmasked( %va ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v20, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsll.vx v16, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v12, v8, a2 +; RV32-NEXT: vsrl.vx v24, v8, a4 +; RV32-NEXT: vand.vx v28, v8, a1 +; RV32-NEXT: vand.vx v24, v24, a1 +; RV32-NEXT: vor.vv v12, v24, v12 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v24, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsll.vx v12, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v16, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v16, v16, a3 -; RV32-NEXT: vor.vv v12, v12, v16 -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vsetvli a5, zero, e64, m4, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vx v20, v8, a4 -; RV32-NEXT: vsll.vi v20, v20, 24 -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v20, v20, v24 -; RV32-NEXT: vor.vv v12, v12, v20 -; RV32-NEXT: vsrl.vx v20, v8, a1 -; RV32-NEXT: vsrl.vx v24, v8, a3 -; RV32-NEXT: vand.vx v24, v24, a2 -; RV32-NEXT: vor.vv v20, v24, v20 -; RV32-NEXT: vsrl.vi v24, v8, 24 -; RV32-NEXT: vand.vx v24, v24, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v8, v20 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vsll.vx v28, v28, a4 +; RV32-NEXT: vor.vv v16, v16, v28 +; RV32-NEXT: vsrl.vi v28, v8, 8 +; RV32-NEXT: vand.vx v20, v20, a5 +; RV32-NEXT: vand.vv v28, v28, v24 +; RV32-NEXT: vor.vv v20, v28, v20 ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 349525 +; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: addi a3, a3, 1365 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsetvli a4, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vor.vv v12, v20, 
v12 +; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v20, a2 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v12, v12, v24 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v12, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v20 +; RV32-NEXT: vand.vv v12, v12, v20 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -2222,59 +2219,59 @@ define @vp_bitreverse_nxv4i64_unmasked( %va ; RV64-LABEL: vp_bitreverse_nxv4i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vand.vx v12, v8, a1 -; RV64-NEXT: vsll.vi v12, v12, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsll.vi v16, v16, 8 -; RV64-NEXT: vor.vv v12, v12, v16 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v16, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v20, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v20, v20, a4 -; RV64-NEXT: vor.vv v16, v16, v20 -; RV64-NEXT: vor.vv v12, v16, v12 -; RV64-NEXT: vsrl.vx v16, v8, a2 -; RV64-NEXT: vsrl.vx v20, v8, a4 -; RV64-NEXT: vand.vx v20, v20, a3 +; RV64-NEXT: vsrl.vi v16, v8, 24 +; RV64-NEXT: vsrl.vi v20, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v12, v8, a3 +; RV64-NEXT: vsrl.vx v24, v8, a5 +; RV64-NEXT: vand.vx v24, v24, a0 +; RV64-NEXT: vor.vv v12, v24, v12 +; RV64-NEXT: vand.vx v24, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vsll.vi v24, v24, 24 +; RV64-NEXT: vand.vx v20, v20, a2 ; RV64-NEXT: vor.vv v16, v20, v16 -; RV64-NEXT: vsrl.vi v20, v8, 24 -; RV64-NEXT: vand.vx v20, v20, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v20, v8, a2 +; RV64-NEXT: vsll.vi v20, v20, 8 +; RV64-NEXT: vor.vv v20, v24, v20 +; RV64-NEXT: vsll.vx v24, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v20 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vor.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v24, v8 ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: 
add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: vor.vv v12, v16, v12 +; RV64-NEXT: vor.vv v8, v8, v20 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v12, v12, a1 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v12, v12, a2 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: ret @@ -2301,33 +2298,33 @@ define @vp_bitreverse_nxv7i64( %va, @vp_bitreverse_nxv7i64( %va, @vp_bitreverse_nxv7i64( %va, @vp_bitreverse_nxv7i64( %va, @vp_bitreverse_nxv7i64_unmasked( %va ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3 +; RV32-NEXT: vsll.vx v16, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: vsrl.vx v0, v8, a4 +; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v24, v8, a1 +; RV32-NEXT: vsll.vx v24, v24, a4 ; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a5, sp, 16 -; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v0, v8, a3 -; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 
+; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a5 ; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vor.vv v24, v8, v24 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 349525 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: addi a3, a3, 1365 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -2588,62 +2591,78 @@ define @vp_bitreverse_nxv7i64_unmasked( %va ; ; RV64-LABEL: vp_bitreverse_nxv7i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsll.vi v16, v16, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0 -; RV64-NEXT: vsll.vi v24, v24, 8 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: 
li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v0, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v0, v0, a4 -; RV64-NEXT: vor.vv v24, v24, v0 -; RV64-NEXT: vor.vv v16, v24, v16 -; RV64-NEXT: vsrl.vx v24, v8, a2 -; RV64-NEXT: vsrl.vx v0, v8, a4 -; RV64-NEXT: vand.vx v0, v0, a3 +; RV64-NEXT: vsrl.vi v24, v8, 24 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v16, v8, a3 +; RV64-NEXT: vsrl.vx v0, v8, a5 +; RV64-NEXT: vand.vx v0, v0, a0 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v0, v8, 8 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vand.vx v0, v0, a2 ; RV64-NEXT: vor.vv v24, v0, v24 -; RV64-NEXT: vsrl.vi v0, v8, 24 -; RV64-NEXT: vand.vx v0, v0, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v0, v8, a1 +; RV64-NEXT: vsll.vi v0, v0, 24 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v0, v0, v16 +; RV64-NEXT: vsll.vx v16, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v0 -; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsll.vx v8, v8, a5 ; RV64-NEXT: vor.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vor.vv v8, v8, v0 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v16, v16, a1 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v16, v16, a2 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_bitreverse_nxv7i64_unmasked: @@ -2668,33 +2687,33 @@ define @vp_bitreverse_nxv8i64( %va, @vp_bitreverse_nxv8i64( %va, @vp_bitreverse_nxv8i64( %va, @vp_bitreverse_nxv8i64( %va, @vp_bitreverse_nxv8i64_unmasked( %va ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 
# sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3 +; RV32-NEXT: vsll.vx v16, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: vsrl.vx v0, v8, a4 +; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v24, v8, a1 +; RV32-NEXT: vsll.vx v24, v24, a4 ; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a5, sp, 16 -; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v0, v8, a3 -; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a5 ; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vor.vv v24, v8, v24 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 349525 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: addi a3, a3, 1365 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; 
RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -2955,62 +2980,78 @@ define @vp_bitreverse_nxv8i64_unmasked( %va ; ; RV64-LABEL: vp_bitreverse_nxv8i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsll.vi v16, v16, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0 -; RV64-NEXT: vsll.vi v24, v24, 8 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v0, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v0, v0, a4 -; RV64-NEXT: vor.vv v24, v24, v0 -; RV64-NEXT: vor.vv v16, v24, v16 -; RV64-NEXT: vsrl.vx v24, v8, a2 -; RV64-NEXT: vsrl.vx v0, v8, a4 -; RV64-NEXT: vand.vx v0, v0, a3 +; RV64-NEXT: vsrl.vi v24, v8, 24 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v16, v8, a3 +; RV64-NEXT: vsrl.vx v0, v8, a5 +; RV64-NEXT: vand.vx v0, v0, a0 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v0, v8, 8 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vand.vx v0, v0, a2 ; RV64-NEXT: vor.vv v24, v0, v24 -; RV64-NEXT: vsrl.vi v0, v8, 24 -; RV64-NEXT: vand.vx v0, v0, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v0, v8, a1 +; RV64-NEXT: vsll.vi v0, v0, 24 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v0, v0, v16 +; RV64-NEXT: vsll.vx v16, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v0 -; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsll.vx v8, v8, a5 ; RV64-NEXT: vor.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: 
vand.vx v16, v16, a0 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vor.vv v8, v8, v0 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v16, v16, a1 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v16, v16, a2 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_bitreverse_nxv8i64_unmasked: @@ -3040,69 +3081,69 @@ define @vp_bitreverse_nxv64i16( %va, @vp_bitreverse_nxv64i16( %va, @vp_bitreverse_nxv64i16( %va, @vp_bitreverse_nxv64i16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_bitreverse_nxv64i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: lui a1, 1 +; CHECK-NEXT: lui a2, 3 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: sub a4, a0, a3 +; CHECK-NEXT: sltu a5, a0, a4 +; CHECK-NEXT: addi a5, a5, -1 +; CHECK-NEXT: and a5, a5, a4 +; CHECK-NEXT: lui a6, 5 +; CHECK-NEXT: addi a4, a1, -241 +; CHECK-NEXT: addi a2, a2, 819 +; CHECK-NEXT: addi a1, a6, 1365 +; CHECK-NEXT: vsetvli zero, a5, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v24, v16, 8 ; CHECK-NEXT: vsll.vi v16, v16, 8 ; CHECK-NEXT: vor.vv v16, v16, v24 ; CHECK-NEXT: vsrl.vi v24, v16, 4 -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: addi a2, a2, -241 -; CHECK-NEXT: vand.vx v24, v24, a2 -; CHECK-NEXT: vand.vx v16, v16, a2 +; CHECK-NEXT: vand.vx v16, v16, a4 +; CHECK-NEXT: vand.vx v24, v24, a4 ; CHECK-NEXT: vsll.vi v16, v16, 4 ; CHECK-NEXT: vor.vv v16, v24, v16 ; CHECK-NEXT: vsrl.vi v24, v16, 2 -; CHECK-NEXT: lui a3, 3 -; CHECK-NEXT: addi a3, a3, 819 -; CHECK-NEXT: vand.vx v24, v24, a3 -; CHECK-NEXT: vand.vx v16, v16, a3 +; CHECK-NEXT: vand.vx v16, v16, a2 +; CHECK-NEXT: vand.vx v24, v24, a2 ; CHECK-NEXT: vsll.vi v16, v16, 2 ; CHECK-NEXT: vor.vv v16, v24, v16 ; CHECK-NEXT: vsrl.vi v24, v16, 1 -; CHECK-NEXT: lui a4, 5 -; CHECK-NEXT: addi a4, a4, 1365 -; CHECK-NEXT: vand.vx v24, v24, a4 -; CHECK-NEXT: vand.vx v16, v16, a4 +; CHECK-NEXT: vand.vx v16, v16, a1 +; CHECK-NEXT: vand.vx v24, v24, a1 ; CHECK-NEXT: vadd.vv v16, v16, v16 ; CHECK-NEXT: vor.vv v16, v24, v16 -; CHECK-NEXT: bltu a0, a1, .LBB47_2 +; CHECK-NEXT: bltu a0, a3, .LBB47_2 ; 
CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: mv a0, a3 ; CHECK-NEXT: .LBB47_2: ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v24, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 ; CHECK-NEXT: vor.vv v8, v8, v24 ; CHECK-NEXT: vsrl.vi v24, v8, 4 -; CHECK-NEXT: vand.vx v24, v24, a2 -; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vand.vx v8, v8, a4 +; CHECK-NEXT: vand.vx v24, v24, a4 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v24, v8 ; CHECK-NEXT: vsrl.vi v24, v8, 2 -; CHECK-NEXT: vand.vx v24, v24, a3 -; CHECK-NEXT: vand.vx v8, v8, a3 +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vand.vx v24, v24, a2 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v24, v8 ; CHECK-NEXT: vsrl.vi v24, v8, 1 -; CHECK-NEXT: vand.vx v24, v24, a4 -; CHECK-NEXT: vand.vx v8, v8, a4 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v24, v24, a1 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v24, v8 ; CHECK-NEXT: ret @@ -3228,25 +3269,25 @@ define @vp_bitreverse_nxv1i9( %va, @bswap_nxv1i32( %va) { ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsll.vi v10, v10, 8 @@ -151,9 +151,9 @@ define @bswap_nxv2i32( %va) { ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsll.vi v10, v10, 8 @@ -178,9 +178,9 @@ define @bswap_nxv4i32( %va) { ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: vor.vv v10, v10, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsll.vi v12, v12, 8 @@ -205,9 +205,9 @@ define @bswap_nxv8i32( %va) { ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: vor.vv v12, v12, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsll.vi v16, v16, 8 @@ -232,9 +232,9 @@ define @bswap_nxv16i32( %va) { ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v24, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsrl.vi v24, v8, 24 ; CHECK-NEXT: vor.vv v16, v16, v24 ; CHECK-NEXT: vand.vx v24, v8, a0 ; CHECK-NEXT: vsll.vi v24, v24, 8 @@ -259,36 +259,36 @@ define @bswap_nxv1i64( %va) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a0, 1044480 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: vsetvli a4, zero, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 24 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: li a1, 40 ; RV32-NEXT: vsrl.vx v10, v8, a1 -; 
RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v10, v10, a2 -; RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vsrl.vi v10, v8, 24 -; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v11, (a3), zero -; RV32-NEXT: lui a3, 4080 -; RV32-NEXT: vand.vx v10, v10, a3 +; RV32-NEXT: vsrl.vx v11, v8, a2 +; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: vsll.vx v12, v8, a1 +; RV32-NEXT: vand.vx v11, v11, a0 +; RV32-NEXT: vlse64.v v13, (a5), zero +; RV32-NEXT: vor.vv v10, v11, v10 +; RV32-NEXT: vand.vx v11, v8, a0 +; RV32-NEXT: vsll.vx v11, v11, a2 +; RV32-NEXT: vor.vv v11, v12, v11 ; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vand.vv v12, v12, v11 -; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vsll.vx v10, v8, a0 -; RV32-NEXT: vand.vx v12, v8, a2 -; RV32-NEXT: vsll.vx v12, v12, a1 -; RV32-NEXT: vor.vv v10, v10, v12 -; RV32-NEXT: vand.vx v12, v8, a3 -; RV32-NEXT: vsll.vi v12, v12, 24 -; RV32-NEXT: vand.vv v8, v8, v11 -; RV32-NEXT: vsll.vi v8, v8, 8 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vand.vx v9, v9, a4 +; RV32-NEXT: vand.vv v12, v12, v13 +; RV32-NEXT: vor.vv v9, v12, v9 +; RV32-NEXT: vand.vv v12, v8, v13 +; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v12, v12, 8 +; RV32-NEXT: vor.vv v9, v9, v10 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v11, v8 ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -297,28 +297,28 @@ define @bswap_nxv1i64( %va) { ; RV64-LABEL: bswap_nxv1i64: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV64-NEXT: vsrl.vx v9, v8, a0 ; RV64-NEXT: li a1, 40 -; RV64-NEXT: vsrl.vx v10, v8, a1 ; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, -256 -; RV64-NEXT: vand.vx v10, v10, a2 -; RV64-NEXT: vor.vv v9, v10, v9 -; RV64-NEXT: vsrl.vi v10, v8, 24 +; RV64-NEXT: vsetvli a3, zero, e64, m1, ta, ma +; RV64-NEXT: vsrl.vi v9, v8, 24 ; RV64-NEXT: lui a3, 4080 -; RV64-NEXT: vand.vx v10, v10, a3 +; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: vsrl.vx v11, v8, a1 +; RV64-NEXT: addiw a2, a2, -256 +; RV64-NEXT: vand.vx v11, v11, a2 +; RV64-NEXT: vor.vv v10, v11, v10 ; RV64-NEXT: vsrl.vi v11, v8, 8 ; RV64-NEXT: li a4, 255 +; RV64-NEXT: vand.vx v9, v9, a3 ; RV64-NEXT: slli a4, a4, 24 ; RV64-NEXT: vand.vx v11, v11, a4 +; RV64-NEXT: vor.vv v9, v11, v9 +; RV64-NEXT: vand.vx v11, v8, a3 +; RV64-NEXT: vsll.vi v11, v11, 24 +; RV64-NEXT: vor.vv v9, v9, v10 +; RV64-NEXT: vand.vx v10, v8, a4 +; RV64-NEXT: vsll.vi v10, v10, 8 ; RV64-NEXT: vor.vv v10, v11, v10 -; RV64-NEXT: vor.vv v9, v10, v9 -; RV64-NEXT: vand.vx v10, v8, a3 -; RV64-NEXT: vsll.vi v10, v10, 24 -; RV64-NEXT: vand.vx v11, v8, a4 -; RV64-NEXT: vsll.vi v11, v11, 8 -; RV64-NEXT: vor.vv v10, v10, v11 ; RV64-NEXT: vsll.vx v11, v8, a0 ; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vsll.vx v8, v8, a1 @@ -343,36 +343,36 @@ define @bswap_nxv2i64( %va) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a0, 1044480 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: vsetvli a4, zero, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v10, v8, 24 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV32-NEXT: vsrl.vx v10, v8, a0 -; RV32-NEXT: li a1, 40 ; RV32-NEXT: vsrl.vx v12, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, 
-256 -; RV32-NEXT: vand.vx v12, v12, a2 -; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vsrl.vi v12, v8, 24 -; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v14, (a3), zero -; RV32-NEXT: lui a3, 4080 -; RV32-NEXT: vand.vx v12, v12, a3 +; RV32-NEXT: vsrl.vx v14, v8, a2 +; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: vsll.vx v16, v8, a1 +; RV32-NEXT: vand.vx v14, v14, a0 +; RV32-NEXT: vlse64.v v18, (a5), zero +; RV32-NEXT: vor.vv v12, v14, v12 +; RV32-NEXT: vand.vx v14, v8, a0 +; RV32-NEXT: vsll.vx v14, v14, a2 +; RV32-NEXT: vor.vv v14, v16, v14 ; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v16, v16, v14 -; RV32-NEXT: vor.vv v12, v16, v12 -; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vsll.vx v12, v8, a0 -; RV32-NEXT: vand.vx v16, v8, a2 -; RV32-NEXT: vsll.vx v16, v16, a1 -; RV32-NEXT: vor.vv v12, v12, v16 -; RV32-NEXT: vand.vx v16, v8, a3 -; RV32-NEXT: vsll.vi v16, v16, 24 -; RV32-NEXT: vand.vv v8, v8, v14 -; RV32-NEXT: vsll.vi v8, v8, 8 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vand.vx v10, v10, a4 +; RV32-NEXT: vand.vv v16, v16, v18 +; RV32-NEXT: vor.vv v10, v16, v10 +; RV32-NEXT: vand.vv v16, v8, v18 +; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v10, v10, v12 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v14, v8 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -381,28 +381,28 @@ define @bswap_nxv2i64( %va) { ; RV64-LABEL: bswap_nxv2i64: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV64-NEXT: vsrl.vx v10, v8, a0 ; RV64-NEXT: li a1, 40 -; RV64-NEXT: vsrl.vx v12, v8, a1 ; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, -256 -; RV64-NEXT: vand.vx v12, v12, a2 -; RV64-NEXT: vor.vv v10, v12, v10 -; RV64-NEXT: vsrl.vi v12, v8, 24 +; RV64-NEXT: vsetvli a3, zero, e64, m2, ta, ma +; RV64-NEXT: vsrl.vi v10, v8, 24 ; RV64-NEXT: lui a3, 4080 -; RV64-NEXT: vand.vx v12, v12, a3 +; RV64-NEXT: vsrl.vx v12, v8, a0 +; RV64-NEXT: vsrl.vx v14, v8, a1 +; RV64-NEXT: addiw a2, a2, -256 +; RV64-NEXT: vand.vx v14, v14, a2 +; RV64-NEXT: vor.vv v12, v14, v12 ; RV64-NEXT: vsrl.vi v14, v8, 8 ; RV64-NEXT: li a4, 255 +; RV64-NEXT: vand.vx v10, v10, a3 ; RV64-NEXT: slli a4, a4, 24 ; RV64-NEXT: vand.vx v14, v14, a4 +; RV64-NEXT: vor.vv v10, v14, v10 +; RV64-NEXT: vand.vx v14, v8, a3 +; RV64-NEXT: vsll.vi v14, v14, 24 +; RV64-NEXT: vor.vv v10, v10, v12 +; RV64-NEXT: vand.vx v12, v8, a4 +; RV64-NEXT: vsll.vi v12, v12, 8 ; RV64-NEXT: vor.vv v12, v14, v12 -; RV64-NEXT: vor.vv v10, v12, v10 -; RV64-NEXT: vand.vx v12, v8, a3 -; RV64-NEXT: vsll.vi v12, v12, 24 -; RV64-NEXT: vand.vx v14, v8, a4 -; RV64-NEXT: vsll.vi v14, v14, 8 -; RV64-NEXT: vor.vv v12, v12, v14 ; RV64-NEXT: vsll.vx v14, v8, a0 ; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vsll.vx v8, v8, a1 @@ -427,36 +427,36 @@ define @bswap_nxv4i64( %va) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a0, 1044480 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: vsetvli a4, zero, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV32-NEXT: vsrl.vx v12, v8, a0 -; RV32-NEXT: li a1, 40 ; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: 
vand.vx v16, v16, a2 -; RV32-NEXT: vor.vv v12, v16, v12 -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v20, (a3), zero -; RV32-NEXT: lui a3, 4080 -; RV32-NEXT: vand.vx v16, v16, a3 +; RV32-NEXT: vsrl.vx v20, v8, a2 +; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: vsll.vx v24, v8, a1 +; RV32-NEXT: vand.vx v20, v20, a0 +; RV32-NEXT: vlse64.v v28, (a5), zero +; RV32-NEXT: vor.vv v16, v20, v16 +; RV32-NEXT: vand.vx v20, v8, a0 +; RV32-NEXT: vsll.vx v20, v20, a2 +; RV32-NEXT: vor.vv v20, v24, v20 ; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v20 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vor.vv v12, v16, v12 -; RV32-NEXT: vsll.vx v16, v8, a0 -; RV32-NEXT: vand.vx v24, v8, a2 -; RV32-NEXT: vsll.vx v24, v24, a1 -; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: vand.vx v24, v8, a3 -; RV32-NEXT: vsll.vi v24, v24, 24 -; RV32-NEXT: vand.vv v8, v8, v20 -; RV32-NEXT: vsll.vi v8, v8, 8 -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vand.vx v12, v12, a4 +; RV32-NEXT: vand.vv v24, v24, v28 +; RV32-NEXT: vor.vv v12, v24, v12 +; RV32-NEXT: vand.vv v24, v8, v28 +; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v8, v20, v8 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -465,28 +465,28 @@ define @bswap_nxv4i64( %va) { ; RV64-LABEL: bswap_nxv4i64: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV64-NEXT: vsrl.vx v12, v8, a0 ; RV64-NEXT: li a1, 40 -; RV64-NEXT: vsrl.vx v16, v8, a1 ; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, -256 -; RV64-NEXT: vand.vx v16, v16, a2 -; RV64-NEXT: vor.vv v12, v16, v12 -; RV64-NEXT: vsrl.vi v16, v8, 24 +; RV64-NEXT: vsetvli a3, zero, e64, m4, ta, ma +; RV64-NEXT: vsrl.vi v12, v8, 24 ; RV64-NEXT: lui a3, 4080 -; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: vsrl.vx v20, v8, a1 +; RV64-NEXT: addiw a2, a2, -256 +; RV64-NEXT: vand.vx v20, v20, a2 +; RV64-NEXT: vor.vv v16, v20, v16 ; RV64-NEXT: vsrl.vi v20, v8, 8 ; RV64-NEXT: li a4, 255 +; RV64-NEXT: vand.vx v12, v12, a3 ; RV64-NEXT: slli a4, a4, 24 ; RV64-NEXT: vand.vx v20, v20, a4 +; RV64-NEXT: vor.vv v12, v20, v12 +; RV64-NEXT: vand.vx v20, v8, a3 +; RV64-NEXT: vsll.vi v20, v20, 24 +; RV64-NEXT: vor.vv v12, v12, v16 +; RV64-NEXT: vand.vx v16, v8, a4 +; RV64-NEXT: vsll.vi v16, v16, 8 ; RV64-NEXT: vor.vv v16, v20, v16 -; RV64-NEXT: vor.vv v12, v16, v12 -; RV64-NEXT: vand.vx v16, v8, a3 -; RV64-NEXT: vsll.vi v16, v16, 24 -; RV64-NEXT: vand.vx v20, v8, a4 -; RV64-NEXT: vsll.vi v20, v20, 8 -; RV64-NEXT: vor.vv v16, v16, v20 ; RV64-NEXT: vsll.vx v20, v8, a0 ; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vsll.vx v8, v8, a1 @@ -511,50 +511,57 @@ define @bswap_nxv8i64( %va) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: sub sp, sp, a0 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a0, 1044480 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: addi a5, sp, 8 ; 
RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v8, a0 -; RV32-NEXT: li a1, 40 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v24, a2 +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: vsll.vx v0, v8, a1 +; RV32-NEXT: vand.vx v24, v24, a0 ; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v0, v8, 24 -; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: lui a3, 4080 -; RV32-NEXT: vand.vx v0, v0, a3 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsll.vx v16, v16, a2 +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vlse64.v v0, (a5), zero +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a4 ; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a2 -; RV32-NEXT: vsll.vx v0, v0, a1 -; RV32-NEXT: vsll.vx v24, v8, a0 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vand.vx v8, v8, a3 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v16, v24 +; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v16, v16, 8 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -564,28 +571,28 @@ define @bswap_nxv8i64( %va) { ; RV64-LABEL: bswap_nxv8i64: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vsrl.vx v16, v8, a0 ; RV64-NEXT: li a1, 40 -; RV64-NEXT: vsrl.vx v24, v8, a1 ; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, -256 -; RV64-NEXT: vand.vx v24, v24, a2 -; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v8, 24 ; RV64-NEXT: lui a3, 4080 -; RV64-NEXT: vand.vx v24, v24, a3 +; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: vsrl.vx v0, v8, a1 +; RV64-NEXT: addiw a2, a2, -256 +; RV64-NEXT: vand.vx v0, v0, a2 +; RV64-NEXT: vor.vv v16, v0, v16 ; RV64-NEXT: vsrl.vi v0, v8, 8 ; RV64-NEXT: li a4, 255 +; RV64-NEXT: vand.vx v24, v24, a3 ; RV64-NEXT: slli a4, a4, 24 ; RV64-NEXT: vand.vx v0, v0, a4 ; RV64-NEXT: vor.vv v24, v0, v24 
+; RV64-NEXT: vand.vx v0, v8, a3 +; RV64-NEXT: vsll.vi v0, v0, 24 ; RV64-NEXT: vor.vv v16, v24, v16 -; RV64-NEXT: vand.vx v24, v8, a3 -; RV64-NEXT: vsll.vi v24, v24, 24 -; RV64-NEXT: vand.vx v0, v8, a4 -; RV64-NEXT: vsll.vi v0, v0, 8 -; RV64-NEXT: vor.vv v24, v24, v0 +; RV64-NEXT: vand.vx v24, v8, a4 +; RV64-NEXT: vsll.vi v24, v24, 8 +; RV64-NEXT: vor.vv v24, v0, v24 ; RV64-NEXT: vsll.vx v0, v8, a0 ; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vsll.vx v8, v8, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll index 2c5b7f160d192..1c95ec8fafd4f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll @@ -270,9 +270,9 @@ define @vp_bswap_nxv1i32_unmasked( %va, i32 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsll.vi v10, v10, 8 @@ -324,9 +324,9 @@ define @vp_bswap_nxv2i32_unmasked( %va, i32 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsll.vi v10, v10, 8 @@ -378,9 +378,9 @@ define @vp_bswap_nxv4i32_unmasked( %va, i32 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: vor.vv v10, v10, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsll.vi v12, v12, 8 @@ -432,9 +432,9 @@ define @vp_bswap_nxv8i32_unmasked( %va, i32 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: vor.vv v12, v12, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsll.vi v16, v16, 8 @@ -486,9 +486,9 @@ define @vp_bswap_nxv16i32_unmasked( %va, ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v24, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsrl.vi v24, v8, 24 ; CHECK-NEXT: vor.vv v16, v16, v24 ; CHECK-NEXT: vand.vx v24, v8, a0 ; CHECK-NEXT: vsll.vi v24, v24, 8 @@ -514,38 +514,38 @@ define @vp_bswap_nxv1i64( %va, @vp_bswap_nxv1i64( %va, @vp_bswap_nxv1i64_unmasked( %va, i32 ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsll.vx v9, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v10, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v10, v10, a3 -; RV32-NEXT: vor.vv v9, v9, v10 -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vsetvli a5, 
zero, e64, m1, ta, ma -; RV32-NEXT: vlse64.v v10, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vsll.vx v10, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v11, v8, a2 +; RV32-NEXT: vsrl.vx v12, v8, a4 +; RV32-NEXT: vand.vx v13, v8, a1 +; RV32-NEXT: vand.vx v12, v12, a1 +; RV32-NEXT: vor.vv v11, v12, v11 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v12, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vx v11, v8, a4 -; RV32-NEXT: vsll.vi v11, v11, 24 -; RV32-NEXT: vand.vv v12, v8, v10 +; RV32-NEXT: vsll.vx v13, v13, a4 +; RV32-NEXT: vor.vv v10, v10, v13 +; RV32-NEXT: vsrl.vi v13, v8, 8 +; RV32-NEXT: vand.vx v9, v9, a5 +; RV32-NEXT: vand.vv v13, v13, v12 +; RV32-NEXT: vor.vv v9, v13, v9 +; RV32-NEXT: vand.vv v12, v8, v12 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vor.vv v11, v11, v12 -; RV32-NEXT: vor.vv v9, v9, v11 -; RV32-NEXT: vsrl.vx v11, v8, a1 -; RV32-NEXT: vsrl.vx v12, v8, a3 -; RV32-NEXT: vand.vx v12, v12, a2 -; RV32-NEXT: vor.vv v11, v12, v11 -; RV32-NEXT: vsrl.vi v12, v8, 24 -; RV32-NEXT: vand.vx v12, v12, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v11 -; RV32-NEXT: vor.vv v8, v9, v8 +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vor.vv v9, v9, v11 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -639,34 +639,34 @@ define @vp_bswap_nxv1i64_unmasked( %va, i32 ; RV64-LABEL: vp_bswap_nxv1i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vand.vx v9, v8, a1 -; RV64-NEXT: vsll.vi v9, v9, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsll.vi v10, v10, 8 -; RV64-NEXT: vor.vv v9, v9, v10 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v10, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v11, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v11, v11, a4 -; RV64-NEXT: vor.vv v10, v10, v11 +; RV64-NEXT: vsrl.vi v9, v8, 24 +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v11, v8, a3 +; RV64-NEXT: vsrl.vx v12, v8, a5 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vor.vv v11, v12, v11 +; RV64-NEXT: vand.vx v12, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vsll.vi v12, v12, 24 +; RV64-NEXT: vand.vx v10, v10, a2 ; RV64-NEXT: vor.vv v9, v10, v9 -; RV64-NEXT: vsrl.vx v10, v8, a2 -; RV64-NEXT: vsrl.vx v11, v8, a4 -; RV64-NEXT: vand.vx v11, v11, a3 -; RV64-NEXT: vor.vv v10, v11, v10 -; RV64-NEXT: vsrl.vi v11, v8, 24 -; RV64-NEXT: vand.vx v11, v11, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v10, v8, a2 +; RV64-NEXT: vsll.vi v10, v10, 8 +; RV64-NEXT: vor.vv v10, v12, v10 +; RV64-NEXT: vsll.vx v12, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v11 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vor.vv v8, v9, v8 +; RV64-NEXT: vor.vv v9, v9, v11 +; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: ret ; ; CHECK-ZVKB-LABEL: vp_bswap_nxv1i64_unmasked: @@ -686,38 +686,38 @@ define @vp_bswap_nxv2i64( %va, @vp_bswap_nxv2i64( %va, @vp_bswap_nxv2i64_unmasked( %va, i32 ; RV32-NEXT: addi sp, 
sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v10, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsll.vx v10, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v12, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v12, v12, a3 -; RV32-NEXT: vor.vv v10, v10, v12 -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vsetvli a5, zero, e64, m2, ta, ma -; RV32-NEXT: vlse64.v v12, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vsll.vx v12, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v14, v8, a2 +; RV32-NEXT: vsrl.vx v16, v8, a4 +; RV32-NEXT: vand.vx v18, v8, a1 +; RV32-NEXT: vand.vx v16, v16, a1 +; RV32-NEXT: vor.vv v14, v16, v14 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v16, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vx v14, v8, a4 -; RV32-NEXT: vsll.vi v14, v14, 24 -; RV32-NEXT: vand.vv v16, v8, v12 +; RV32-NEXT: vsll.vx v18, v18, a4 +; RV32-NEXT: vor.vv v12, v12, v18 +; RV32-NEXT: vsrl.vi v18, v8, 8 +; RV32-NEXT: vand.vx v10, v10, a5 +; RV32-NEXT: vand.vv v18, v18, v16 +; RV32-NEXT: vor.vv v10, v18, v10 +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vor.vv v14, v14, v16 -; RV32-NEXT: vor.vv v10, v10, v14 -; RV32-NEXT: vsrl.vx v14, v8, a1 -; RV32-NEXT: vsrl.vx v16, v8, a3 -; RV32-NEXT: vand.vx v16, v16, a2 -; RV32-NEXT: vor.vv v14, v16, v14 -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vor.vv v8, v8, v14 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vor.vv v10, v10, v14 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -811,34 +811,34 @@ define @vp_bswap_nxv2i64_unmasked( %va, i32 ; RV64-LABEL: vp_bswap_nxv2i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vand.vx v10, v8, a1 -; RV64-NEXT: vsll.vi v10, v10, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsll.vi v12, v12, 8 -; RV64-NEXT: vor.vv v10, v10, v12 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v12, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v14, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v14, v14, a4 -; RV64-NEXT: vor.vv v12, v12, v14 +; RV64-NEXT: vsrl.vi v10, v8, 24 +; RV64-NEXT: vsrl.vi v12, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v14, v8, a3 +; RV64-NEXT: vsrl.vx v16, v8, a5 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vor.vv v14, v16, v14 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v10, v10, a1 +; RV64-NEXT: vsll.vi v16, v16, 24 +; RV64-NEXT: vand.vx v12, v12, a2 ; RV64-NEXT: vor.vv v10, v12, v10 -; RV64-NEXT: vsrl.vx v12, v8, a2 -; RV64-NEXT: vsrl.vx v14, v8, a4 -; RV64-NEXT: vand.vx v14, v14, a3 -; RV64-NEXT: vor.vv v12, 
v14, v12 -; RV64-NEXT: vsrl.vi v14, v8, 24 -; RV64-NEXT: vand.vx v14, v14, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v12, v8, a2 +; RV64-NEXT: vsll.vi v12, v12, 8 +; RV64-NEXT: vor.vv v12, v16, v12 +; RV64-NEXT: vsll.vx v16, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v14 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vor.vv v8, v10, v8 +; RV64-NEXT: vor.vv v10, v10, v14 +; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: ret ; ; CHECK-ZVKB-LABEL: vp_bswap_nxv2i64_unmasked: @@ -858,34 +858,34 @@ define @vp_bswap_nxv4i64( %va, @vp_bswap_nxv4i64( %va, @vp_bswap_nxv4i64_unmasked( %va, i32 ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v12, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsll.vx v12, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v16, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v16, v16, a3 -; RV32-NEXT: vor.vv v12, v12, v16 -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vsetvli a5, zero, e64, m4, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vsll.vx v16, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v20, v8, a2 +; RV32-NEXT: vsrl.vx v24, v8, a4 +; RV32-NEXT: vand.vx v28, v8, a1 +; RV32-NEXT: vand.vx v24, v24, a1 +; RV32-NEXT: vor.vv v20, v24, v20 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v24, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vx v20, v8, a4 -; RV32-NEXT: vsll.vi v20, v20, 24 -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsll.vx v28, v28, a4 +; RV32-NEXT: vor.vv v16, v16, v28 +; RV32-NEXT: vsrl.vi v28, v8, 8 +; RV32-NEXT: vand.vx v12, v12, a5 +; RV32-NEXT: vand.vv v28, v28, v24 +; RV32-NEXT: vor.vv v12, v28, v12 +; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v20, v20, v24 -; RV32-NEXT: vor.vv v12, v12, v20 -; RV32-NEXT: vsrl.vx v20, v8, a1 -; RV32-NEXT: vsrl.vx v24, v8, a3 -; RV32-NEXT: vand.vx v24, v24, a2 -; RV32-NEXT: vor.vv v20, v24, v20 -; RV32-NEXT: vsrl.vi v24, v8, 24 -; RV32-NEXT: vand.vx v24, v24, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v8, v20 -; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v12, v12, v20 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -983,34 +983,34 @@ define @vp_bswap_nxv4i64_unmasked( %va, i32 ; RV64-LABEL: vp_bswap_nxv4i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vand.vx v12, v8, a1 -; RV64-NEXT: vsll.vi v12, v12, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsll.vi v16, v16, 8 -; RV64-NEXT: vor.vv v12, v12, v16 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v16, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw 
a3, a3, -256 -; RV64-NEXT: vand.vx v20, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v20, v20, a4 -; RV64-NEXT: vor.vv v16, v16, v20 +; RV64-NEXT: vsrl.vi v12, v8, 24 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v20, v8, a3 +; RV64-NEXT: vsrl.vx v24, v8, a5 +; RV64-NEXT: vand.vx v24, v24, a0 +; RV64-NEXT: vor.vv v20, v24, v20 +; RV64-NEXT: vand.vx v24, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v12, v12, a1 +; RV64-NEXT: vsll.vi v24, v24, 24 +; RV64-NEXT: vand.vx v16, v16, a2 ; RV64-NEXT: vor.vv v12, v16, v12 -; RV64-NEXT: vsrl.vx v16, v8, a2 -; RV64-NEXT: vsrl.vx v20, v8, a4 -; RV64-NEXT: vand.vx v20, v20, a3 -; RV64-NEXT: vor.vv v16, v20, v16 -; RV64-NEXT: vsrl.vi v20, v8, 24 -; RV64-NEXT: vand.vx v20, v20, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vsll.vx v24, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v20 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v24, v8 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: vor.vv v12, v12, v20 +; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: ret ; ; CHECK-ZVKB-LABEL: vp_bswap_nxv4i64_unmasked: @@ -1035,33 +1035,33 @@ define @vp_bswap_nxv7i64( %va, @vp_bswap_nxv7i64( %va, @vp_bswap_nxv7i64( %va, @vp_bswap_nxv7i64_unmasked( %va, i32 ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3 -; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vsll.vx v24, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: vsrl.vx v0, v8, a4 +; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a1 +; RV32-NEXT: vsll.vx v0, v0, a4 +; RV32-NEXT: vor.vv v16, v24, v0 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a5 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vand.vv v24, v24, v0 
+; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v0, v8, a3 -; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -1234,35 +1241,51 @@ define @vp_bswap_nxv7i64_unmasked( %va, i32 ; ; RV64-LABEL: vp_bswap_nxv7i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsll.vi v16, v16, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0 -; RV64-NEXT: vsll.vi v24, v24, 8 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v0, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v0, v0, a4 -; RV64-NEXT: vor.vv v24, v24, v0 -; RV64-NEXT: vor.vv v16, v24, v16 -; RV64-NEXT: vsrl.vx v24, v8, a2 -; RV64-NEXT: vsrl.vx v0, v8, a4 -; RV64-NEXT: vand.vx v0, v0, a3 +; RV64-NEXT: vsrl.vi v24, v8, 24 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v16, v8, a3 +; RV64-NEXT: vsrl.vx v0, v8, a5 +; RV64-NEXT: vand.vx v0, v0, a0 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v0, v8, 8 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vand.vx v0, v0, a2 ; RV64-NEXT: vor.vv v24, v0, v24 -; RV64-NEXT: vsrl.vi v0, v8, 24 -; RV64-NEXT: vand.vx v0, v0, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v0, v8, a1 +; RV64-NEXT: vsll.vi v0, v0, 24 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: vsll.vx v0, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v0 -; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v0, v8 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, 
(a0) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret ; ; CHECK-ZVKB-LABEL: vp_bswap_nxv7i64_unmasked: @@ -1287,33 +1310,33 @@ define @vp_bswap_nxv8i64( %va, @vp_bswap_nxv8i64( %va, @vp_bswap_nxv8i64( %va, @vp_bswap_nxv8i64_unmasked( %va, i32 ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3 -; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vsll.vx v24, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: vsrl.vx v0, v8, a4 +; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a1 +; RV32-NEXT: vsll.vx v0, v0, a4 +; RV32-NEXT: vor.vv v16, v24, v0 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a5 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v0, v8, a3 -; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v24, v8 ; 
RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -1486,35 +1516,51 @@ define @vp_bswap_nxv8i64_unmasked( %va, i32 ; ; RV64-LABEL: vp_bswap_nxv8i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsll.vi v16, v16, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0 -; RV64-NEXT: vsll.vi v24, v24, 8 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v0, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v0, v0, a4 -; RV64-NEXT: vor.vv v24, v24, v0 -; RV64-NEXT: vor.vv v16, v24, v16 -; RV64-NEXT: vsrl.vx v24, v8, a2 -; RV64-NEXT: vsrl.vx v0, v8, a4 -; RV64-NEXT: vand.vx v0, v0, a3 +; RV64-NEXT: vsrl.vi v24, v8, 24 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v16, v8, a3 +; RV64-NEXT: vsrl.vx v0, v8, a5 +; RV64-NEXT: vand.vx v0, v0, a0 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v0, v8, 8 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vand.vx v0, v0, a2 ; RV64-NEXT: vor.vv v24, v0, v24 -; RV64-NEXT: vsrl.vi v0, v8, 24 -; RV64-NEXT: vand.vx v0, v0, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v0, v8, a1 +; RV64-NEXT: vsll.vi v0, v0, 24 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: vsll.vx v0, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v0 -; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v0, v8 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret ; ; CHECK-ZVKB-LABEL: vp_bswap_nxv8i64_unmasked: @@ -1546,9 +1592,9 @@ define @vp_bswap_nxv64i16( %va, @vp_bswap_nxv64i16( %va, @vp_bswap_nxv1i48( %va, @vp_bswap_nxv1i48( %va, @ret_split_nxv64i32(ptr %x) { ; CHECK-LABEL: ret_split_nxv64i32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vl8re32.v v8, (a1) ; CHECK-NEXT: slli a3, a2, 3 ; CHECK-NEXT: slli a4, a2, 5 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub a4, a4, a3 -; CHECK-NEXT: add a5, a1, a4 -; CHECK-NEXT: vl8re32.v v8, (a5) +; CHECK-NEXT: add a5, a1, a2 +; CHECK-NEXT: vl8re32.v v16, (a5) ; CHECK-NEXT: add a5, a1, a3 -; CHECK-NEXT: slli a2, a2, 4 -; 
CHECK-NEXT: vl8re32.v v16, (a1) -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: vl8re32.v v24, (a1) -; CHECK-NEXT: vl8re32.v v0, (a5) -; CHECK-NEXT: vs8r.v v16, (a0) ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vs8r.v v24, (a2) ; CHECK-NEXT: add a3, a0, a3 -; CHECK-NEXT: vs8r.v v0, (a3) -; CHECK-NEXT: add a0, a0, a4 +; CHECK-NEXT: add a1, a1, a4 +; CHECK-NEXT: vl8re32.v v24, (a5) +; CHECK-NEXT: vl8re32.v v0, (a1) ; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vs8r.v v16, (a2) +; CHECK-NEXT: vs8r.v v24, (a3) +; CHECK-NEXT: add a0, a0, a4 +; CHECK-NEXT: vs8r.v v0, (a0) ; CHECK-NEXT: ret %v = load , ptr %x ret %v @@ -100,87 +100,99 @@ define fastcc @ret_split_nxv128i32(ptr %x) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 5 +; CHECK-NEXT: li a3, 40 +; CHECK-NEXT: mul a2, a2, a3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a3, a2, 3 -; CHECK-NEXT: slli a4, a2, 5 -; CHECK-NEXT: sub a5, a4, a3 -; CHECK-NEXT: add a6, a1, a5 -; CHECK-NEXT: vl8re32.v v8, (a6) -; CHECK-NEXT: csrr a6, vlenb -; CHECK-NEXT: li a7, 24 -; CHECK-NEXT: mul a6, a6, a7 -; CHECK-NEXT: add a6, sp, a6 -; CHECK-NEXT: addi a6, a6, 16 -; CHECK-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; CHECK-NEXT: li a3, 40 +; CHECK-NEXT: vl8re32.v v8, (a1) +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: slli a4, a2, 3 +; CHECK-NEXT: slli a5, a2, 5 ; CHECK-NEXT: slli a6, a2, 4 ; CHECK-NEXT: slli a7, a2, 6 +; CHECK-NEXT: mul a2, a2, a3 +; CHECK-NEXT: sub a3, a5, a4 ; CHECK-NEXT: sub t0, a7, a6 -; CHECK-NEXT: add t1, a1, t0 +; CHECK-NEXT: sub a7, a7, a4 +; CHECK-NEXT: add t1, a1, a4 +; CHECK-NEXT: add t2, a1, a6 +; CHECK-NEXT: add t3, a1, a5 ; CHECK-NEXT: vl8re32.v v8, (t1) ; CHECK-NEXT: csrr t1, vlenb -; CHECK-NEXT: slli t1, t1, 4 +; CHECK-NEXT: li t4, 24 +; CHECK-NEXT: mul t1, t1, t4 ; CHECK-NEXT: add t1, sp, t1 ; CHECK-NEXT: addi t1, t1, 16 ; CHECK-NEXT: vs8r.v v8, (t1) # Unknown-size Folded Spill -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add t1, a1, a7 +; CHECK-NEXT: add t1, a1, a2 +; CHECK-NEXT: vl8re32.v v8, (t2) +; CHECK-NEXT: csrr t2, vlenb +; CHECK-NEXT: slli t2, t2, 3 +; CHECK-NEXT: add t2, sp, t2 +; CHECK-NEXT: addi t2, t2, 16 +; CHECK-NEXT: vs8r.v v8, (t2) # Unknown-size Folded Spill +; CHECK-NEXT: add t2, a1, a3 +; CHECK-NEXT: vl8re32.v v16, (t3) +; CHECK-NEXT: add t3, a1, t0 +; CHECK-NEXT: add a1, a1, a7 ; CHECK-NEXT: vl8re32.v v8, (t1) +; CHECK-NEXT: vl8re32.v v24, (t2) ; CHECK-NEXT: csrr t1, vlenb -; CHECK-NEXT: slli t1, t1, 3 +; CHECK-NEXT: slli t1, t1, 4 ; CHECK-NEXT: add t1, sp, t1 ; CHECK-NEXT: addi t1, t1, 16 -; CHECK-NEXT: vs8r.v v8, (t1) # Unknown-size Folded Spill -; CHECK-NEXT: add t1, a1, a3 -; CHECK-NEXT: vl8re32.v v8, (t1) +; CHECK-NEXT: vs8r.v v24, (t1) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re32.v v24, (t3) ; CHECK-NEXT: addi t1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (t1) # Unknown-size Folded Spill -; CHECK-NEXT: add t1, a1, a6 -; CHECK-NEXT: add t2, a1, a4 -; CHECK-NEXT: li t3, 40 -; CHECK-NEXT: mul a2, a2, t3 -; CHECK-NEXT: add t3, a1, a2 -; CHECK-NEXT: vl8re32.v v8, (a1) -; 
CHECK-NEXT: vl8re32.v v0, (t1) -; CHECK-NEXT: vl8re32.v v16, (t3) -; CHECK-NEXT: vl8re32.v v24, (t2) -; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vs8r.v v24, (t1) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re32.v v24, (a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vs8r.v v0, (a0) ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vs8r.v v16, (a2) -; CHECK-NEXT: add a4, a0, a4 -; CHECK-NEXT: vs8r.v v24, (a4) +; CHECK-NEXT: vs8r.v v8, (a2) +; CHECK-NEXT: add a5, a0, a5 +; CHECK-NEXT: vs8r.v v16, (a5) ; CHECK-NEXT: add a6, a0, a6 -; CHECK-NEXT: vs8r.v v0, (a6) -; CHECK-NEXT: add a3, a0, a3 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vs8r.v v8, (a3) -; CHECK-NEXT: add a7, a0, a7 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vs8r.v v8, (a7) -; CHECK-NEXT: add t0, a0, t0 +; CHECK-NEXT: vs8r.v v8, (a6) +; CHECK-NEXT: add a4, a0, a4 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vs8r.v v8, (a4) +; CHECK-NEXT: add a7, a0, a7 +; CHECK-NEXT: vs8r.v v24, (a7) +; CHECK-NEXT: add t0, a0, t0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vs8r.v v8, (t0) -; CHECK-NEXT: add a0, a0, a5 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 24 -; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -236,40 +248,52 @@ define fastcc @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: add a1, a0, a1 -; CHECK-NEXT: vl8re32.v v24, (a0) -; CHECK-NEXT: vl8re32.v v0, (a1) -; CHECK-NEXT: vl8re32.v v16, (a3) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vl8re32.v v8, (a2) +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size 
Folded Spill +; CHECK-NEXT: vl8re32.v v0, (a0) +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl8re32.v v8, (a0) ; CHECK-NEXT: vl8re32.v v16, (a2) -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; CHECK-NEXT: vadd.vv v24, v8, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v0, v8, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vadd.vv v0, v24, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v24, v0, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vadd.vv v8, v0, v8 -; CHECK-NEXT: vadd.vv v24, v24, v16 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: vadd.vx v16, v8, a4 ; CHECK-NEXT: vadd.vx v8, v24, a4 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -306,10 +330,10 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_i32( @ret_nxv32i32_call_nxv32i32_nxv32i32_i32( @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: andi sp, sp, -128 +; RV32-NEXT: addi a1, sp, 128 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a3, a2, a1 -; RV32-NEXT: vl8re32.v v24, (a3) +; RV32-NEXT: vl8re32.v v16, (a2) ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 128 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a2, a2, a1 ; RV32-NEXT: add a3, a0, a1 -; RV32-NEXT: vl8re32.v v24, (a3) -; RV32-NEXT: addi a3, sp, 128 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vl8re32.v v0, (a2) -; RV32-NEXT: vl8re32.v v24, (a0) +; RV32-NEXT: vl8re32.v v24, (a3) +; RV32-NEXT: vl8re32.v v16, (a0) ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vs8r.v v8, (a0) -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 128 -; RV32-NEXT: vs8r.v v24, (a2) +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 128 +; RV32-NEXT: vs8r.v v16, (a3) ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vs8r.v v16, (a0) -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: addi a2, sp, 128 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 @@ -418,16 +443,15 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 128 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: li a5, 42 -; RV32-NEXT: addi a3, sp, 128 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vs8r.v v8, (a1) -; 
RV32-NEXT: vmv8r.v v8, v0 +; RV32-NEXT: vs8r.v v24, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 128 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv8r.v v16, v0 ; RV32-NEXT: call ext3 ; RV32-NEXT: addi sp, s0, -144 ; RV32-NEXT: .cfi_def_cfa sp, 144 @@ -454,34 +478,35 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: andi sp, sp, -128 +; RV64-NEXT: addi a1, sp, 128 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a3, a2, a1 -; RV64-NEXT: vl8re32.v v24, (a3) +; RV64-NEXT: vl8re32.v v16, (a2) ; RV64-NEXT: csrr a3, vlenb ; RV64-NEXT: slli a3, a3, 3 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 128 -; RV64-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a2, a2, a1 ; RV64-NEXT: add a3, a0, a1 -; RV64-NEXT: vl8re32.v v24, (a3) -; RV64-NEXT: addi a3, sp, 128 -; RV64-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vl8re32.v v0, (a2) -; RV64-NEXT: vl8re32.v v24, (a0) +; RV64-NEXT: vl8re32.v v24, (a3) +; RV64-NEXT: vl8re32.v v16, (a0) ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vs8r.v v8, (a0) -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 5 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 128 -; RV64-NEXT: vs8r.v v24, (a2) +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 128 +; RV64-NEXT: vs8r.v v16, (a3) ; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vs8r.v v16, (a0) -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: addi a2, sp, 128 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vs8r.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 5 ; RV64-NEXT: add a0, sp, a0 @@ -490,16 +515,15 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV64-NEXT: slli a2, a2, 4 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 128 +; RV64-NEXT: add a1, a3, a1 ; RV64-NEXT: li a5, 42 -; RV64-NEXT: addi a3, sp, 128 -; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV64-NEXT: vs8r.v v8, (a1) -; RV64-NEXT: vmv8r.v v8, v0 +; RV64-NEXT: vs8r.v v24, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 128 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv8r.v v16, v0 ; RV64-NEXT: call ext3 ; RV64-NEXT: addi sp, s0, -144 ; RV64-NEXT: .cfi_def_cfa sp, 144 @@ -557,29 +581,29 @@ define fastcc @pass_vector_arg_indirect_stack( @pass_vector_arg_indirect_stack( @pass_vector_arg_indirect_stack_no_gpr( @pass_vector_arg_indirect_stack_no_gpr( @pass_vector_arg_indirect_stack_no_gpr( @pass_vector_arg_indirect_stack_no_gpr( @caller_scalable_vector_split_indirect( @caller_scalable_vector_split_indirect( @vp_ceil_vv_nxv1bf16( %va, @vp_ceil_vv_nxv1bf16_unmasked( ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, 
a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -69,10 +69,10 @@ define @vp_ceil_vv_nxv2bf16( %va, @vp_ceil_vv_nxv2bf16_unmasked( ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -120,10 +120,10 @@ define @vp_ceil_vv_nxv4bf16( %va, @vp_ceil_vv_nxv4bf16_unmasked( ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -172,10 +172,10 @@ define @vp_ceil_vv_nxv8bf16( %va, @vp_ceil_vv_nxv8bf16_unmasked( ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -224,10 +224,10 @@ define @vp_ceil_vv_nxv16bf16( %va, ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 @@ -250,10 +250,10 @@ define @vp_ceil_vv_nxv16bf16_unmasked( @vp_ceil_vv_nxv32bf16( %va, ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: fmv.w.x fa5, a3 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v17, v0, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vmv1r.v v0, v17 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24, v0.t -; CHECK-NEXT: lui a2, 307200 -; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vslidedown.vx v12, v0, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v16, v24, v0.t ; 
CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 3 -; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: bltu a0, a1, .LBB10_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB10_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 -; CHECK-NEXT: vmv1r.v v8, v16 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v24, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t @@ -354,51 +346,41 @@ define @vp_ceil_vv_nxv32bf16( %va, define @vp_ceil_vv_nxv32bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv32bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: fmv.w.x fa5, a3 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v16, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24, v0.t -; CHECK-NEXT: 
lui a2, 307200 -; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vslidedown.vx v12, v24, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 3 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: bltu a0, a1, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 @@ -410,12 +392,6 @@ define @vp_ceil_vv_nxv32bf16_unmasked( @llvm.vp.ceil.nxv32bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -444,10 +420,10 @@ define @vp_ceil_vv_nxv1f16( %va, @vp_ceil_vv_nxv1f16_unmasked( %va, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -526,10 +502,10 @@ define @vp_ceil_vv_nxv2f16( %va, @vp_ceil_vv_nxv2f16_unmasked( %va, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -609,10 +585,10 @@ define @vp_ceil_vv_nxv4f16( %va, @vp_ceil_vv_nxv4f16_unmasked( %va, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -673,9 +649,9 @@ declare @llvm.vp.ceil.nxv8f16(, @vp_ceil_vv_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv8f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -695,10 +671,10 @@ define 
@vp_ceil_vv_nxv8f16( %va, @vp_ceil_vv_nxv8f16_unmasked( %va, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -759,9 +735,9 @@ declare @llvm.vp.ceil.nxv16f16(, @vp_ceil_vv_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) -; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -781,10 +757,10 @@ define @vp_ceil_vv_nxv16f16( %va, @vp_ceil_vv_nxv16f16_unmasked( % ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t @@ -845,9 +821,9 @@ declare @llvm.vp.ceil.nxv32f16(, @vp_ceil_vv_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv32f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) -; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu @@ -870,62 +846,54 @@ define @vp_ceil_vv_nxv32f16( %va, @vp_ceil_vv_nxv32f16_unmasked( % ; ; ZVFHMIN-LABEL: vp_ceil_vv_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: lui a3, 307200 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v16 +; ZVFHMIN-NEXT: fmv.w.x fa5, a3 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t -; ZVFHMIN-NEXT: lui a2, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a2 +; ZVFHMIN-NEXT: vslidedown.vx v12, v24, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vsetvli 
zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v16, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a2, 3 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a2 -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 @@ -1016,12 +974,6 @@ define @vp_ceil_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.ceil.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v @@ -1290,9 +1242,9 @@ declare @llvm.vp.ceil.nxv2f64(, @vp_ceil_vv_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv2f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI36_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -1334,9 +1286,9 @@ declare @llvm.vp.ceil.nxv4f64(, @vp_ceil_vv_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI38_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -1378,9 +1330,9 @@ declare @llvm.vp.ceil.nxv7f64(, @vp_ceil_vv_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv7f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI40_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1422,9 +1374,9 @@ declare @llvm.vp.ceil.nxv8f64(, @vp_ceil_vv_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI42_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1475,12 +1427,12 @@ define @vp_ceil_vv_nxv16f64( %va, < ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 
0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 3 -; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v6, v0, a2 +; CHECK-NEXT: lui a2, %hi(.LCPI44_0) +; CHECK-NEXT: srli a3, a1, 3 +; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI44_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3) +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v6, v0, a3 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 @@ -1501,23 +1453,26 @@ define @vp_ceil_vv_nxv16f64( %va, < ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -1533,12 +1488,12 @@ define @vp_ceil_vv_nxv16f64_unmasked( %mask, <256 x i8> %data) { ; RV64-LABEL: test_compresstore_v256i8: ; RV64: # %bb.0: # %entry -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 4 -; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v7, v8 ; RV64-NEXT: li a2, 128 -; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; RV64-NEXT: vle8.v v16, (a1) -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-NEXT: vslidedown.vi v9, v0, 1 -; RV64-NEXT: vmv.x.s a1, v9 ; RV64-NEXT: vmv.x.s a3, v0 -; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: slli a4, a4, 3 -; RV64-NEXT: add a4, sp, a4 -; RV64-NEXT: addi a4, a4, 16 -; RV64-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; RV64-NEXT: vcompress.vm v16, v24, v0 +; RV64-NEXT: vle8.v v24, (a1) +; RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV64-NEXT: vmv.x.s a1, v9 +; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV64-NEXT: vcompress.vm v8, v16, v0 ; RV64-NEXT: vcpop.m a4, v0 ; RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma -; RV64-NEXT: vse8.v v16, (a0) -; RV64-NEXT: addi a4, sp, 16 -; RV64-NEXT: vl8r.v v24, (a4) # Unknown-size 
Folded Reload +; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; RV64-NEXT: vcompress.vm v16, v24, v8 -; RV64-NEXT: vcpop.m a2, v8 +; RV64-NEXT: vcpop.m a2, v7 ; RV64-NEXT: cpop a3, a3 ; RV64-NEXT: cpop a1, a1 ; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: vcompress.vm v8, v24, v7 ; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; RV64-NEXT: vse8.v v16, (a0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 16 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: ret ; ; RV32-LABEL: test_compresstore_v256i8: ; RV32: # %bb.0: # %entry -; RV32-NEXT: vmv1r.v v7, v8 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: vmv8r.v v24, v16 ; RV32-NEXT: li a2, 128 -; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; RV32-NEXT: vle8.v v24, (a1) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vslidedown.vi v9, v0, 1 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v10, v9, a1 -; RV32-NEXT: vmv.x.s a3, v10 -; RV32-NEXT: vsrl.vx v10, v0, a1 -; RV32-NEXT: vmv.x.s a1, v10 -; RV32-NEXT: vmv.x.s a4, v9 -; RV32-NEXT: vmv.x.s a5, v0 +; RV32-NEXT: li a3, 32 +; RV32-NEXT: vmv.x.s a4, v0 +; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV32-NEXT: vle8.v v16, (a1) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vsrl.vx v10, v9, a3 +; RV32-NEXT: vmv.x.s a1, v9 +; RV32-NEXT: vsrl.vx v9, v0, a3 ; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; RV32-NEXT: vcompress.vm v8, v16, v0 -; RV32-NEXT: vcpop.m a6, v0 -; RV32-NEXT: vsetvli zero, a6, e8, m8, ta, ma -; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: vcompress.vm v16, v24, v0 +; RV32-NEXT: vcpop.m a3, v0 +; RV32-NEXT: cpop a4, a4 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vmv.x.s a5, v10 +; RV32-NEXT: vmv.x.s a6, v9 +; RV32-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV32-NEXT: vse8.v v16, (a0) ; RV32-NEXT: cpop a1, a1 +; RV32-NEXT: cpop a3, a6 ; RV32-NEXT: cpop a5, a5 -; RV32-NEXT: add a1, a5, a1 -; RV32-NEXT: cpop a3, a3 -; RV32-NEXT: cpop a4, a4 ; RV32-NEXT: add a3, a4, a3 -; RV32-NEXT: add a1, a1, a3 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a1, a1, a5 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; RV32-NEXT: vcompress.vm v8, v24, v7 -; RV32-NEXT: vcpop.m a1, v7 +; RV32-NEXT: vcompress.vm v16, v24, v8 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: vcpop.m a1, v8 ; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: vse8.v v16, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret entry: tail call void @llvm.masked.compressstore.v256i8(<256 x i8> %data, ptr align 1 %p, <256 x i1> %mask) @@ -463,43 +457,45 @@ define void @test_compresstore_v128i16(ptr %p, <128 x i1> %mask, <128 x i16> %da ; RV64-NEXT: vse16.v v24, (a0) ; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64-NEXT: vslidedown.vi v8, v0, 8 +; RV64-NEXT: 
vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.x.s a2, v0 ; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; RV64-NEXT: vcompress.vm v24, v16, v8 -; RV64-NEXT: vcpop.m a2, v8 -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: cpop a1, a1 -; RV64-NEXT: slli a1, a1, 1 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64-NEXT: vcpop.m a1, v8 +; RV64-NEXT: cpop a2, a2 +; RV64-NEXT: slli a2, a2, 1 +; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; RV64-NEXT: vse16.v v24, (a0) ; RV64-NEXT: ret ; ; RV32-LABEL: test_compresstore_v128i16: ; RV32: # %bb.0: # %entry ; RV32-NEXT: li a1, 64 -; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; RV32-NEXT: vcompress.vm v24, v8, v0 -; RV32-NEXT: vcpop.m a2, v0 -; RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; RV32-NEXT: vse16.v v24, (a0) ; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV32-NEXT: vslidedown.vi v24, v0, 8 +; RV32-NEXT: vslidedown.vi v7, v0, 8 +; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV32-NEXT: vcompress.vm v24, v16, v7 +; RV32-NEXT: vcpop.m a2, v7 +; RV32-NEXT: li a3, 32 +; RV32-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV32-NEXT: vmv.x.s a4, v0 ; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; RV32-NEXT: vcompress.vm v8, v16, v24 -; RV32-NEXT: vcpop.m a1, v24 -; RV32-NEXT: li a2, 32 +; RV32-NEXT: vcompress.vm v16, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v16, v0, a2 -; RV32-NEXT: vmv.x.s a2, v16 -; RV32-NEXT: cpop a2, a2 -; RV32-NEXT: vmv.x.s a3, v0 -; RV32-NEXT: cpop a3, a3 -; RV32-NEXT: add a2, a3, a2 -; RV32-NEXT: slli a2, a2, 1 -; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: vsrl.vx v8, v0, a3 ; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; RV32-NEXT: vse16.v v8, (a0) +; RV32-NEXT: vse16.v v16, (a0) +; RV32-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: cpop a1, a1 +; RV32-NEXT: cpop a3, a4 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV32-NEXT: vse16.v v24, (a0) ; RV32-NEXT: ret entry: tail call void @llvm.masked.compressstore.v128i16(<128 x i16> %data, ptr align 2 %p, <128 x i1> %mask) @@ -659,10 +655,11 @@ define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data) ; RV64-NEXT: vse32.v v24, (a0) ; RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64-NEXT: vslidedown.vi v8, v0, 4 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.x.s a2, v0 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV64-NEXT: vcompress.vm v24, v16, v8 ; RV64-NEXT: vcpop.m a1, v8 -; RV64-NEXT: vmv.x.s a2, v0 ; RV64-NEXT: cpopw a2, a2 ; RV64-NEXT: slli a2, a2, 2 ; RV64-NEXT: add a0, a0, a2 @@ -680,10 +677,11 @@ define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data) ; RV32-NEXT: vse32.v v24, (a0) ; RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV32-NEXT: vslidedown.vi v8, v0, 4 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.x.s a2, v0 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vcompress.vm v24, v16, v8 ; RV32-NEXT: vcpop.m a1, v8 -; RV32-NEXT: vmv.x.s a2, v0 ; RV32-NEXT: cpop a2, a2 ; RV32-NEXT: slli a2, a2, 2 ; RV32-NEXT: add a0, a0, a2 @@ -822,10 +820,10 @@ define void @test_compresstore_v32i64(ptr %p, <32 x i1> %mask, <32 x i64> %data) ; RV64-NEXT: vse64.v v24, (a0) ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v8, v0, 2 +; 
RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64-NEXT: vmv.x.s a1, v0 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vcompress.vm v24, v16, v8 -; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 ; RV64-NEXT: zext.h a1, a1 ; RV64-NEXT: cpopw a1, a1 ; RV64-NEXT: slli a1, a1, 3 @@ -844,10 +842,10 @@ define void @test_compresstore_v32i64(ptr %p, <32 x i1> %mask, <32 x i64> %data) ; RV32-NEXT: vse64.v v24, (a0) ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v8, v0, 2 +; RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV32-NEXT: vmv.x.s a1, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vcompress.vm v24, v16, v8 -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 ; RV32-NEXT: zext.h a1, a1 ; RV32-NEXT: cpop a1, a1 ; RV32-NEXT: slli a1, a1, 3 diff --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll index 7839b602706db..ad176df71397e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll @@ -18,50 +18,52 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lanes.b, <4 x i1> %sel) { ; RV32-LABEL: constant_folding_crash: ; RV32: # %bb.0: # %entry +; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: lw a0, 8(a0) ; RV32-NEXT: andi a0, a0, 1 ; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vmsne.vi v10, v10, 0 -; RV32-NEXT: vmv1r.v v11, v0 -; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vmv.v.x v11, a0 +; RV32-NEXT: vmsne.vi v0, v11, 0 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32-NEXT: vmerge.vvm v8, v9, v8, v0 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmv1r.v v0, v11 -; RV32-NEXT: vmerge.vim v8, v8, 1, v0 -; RV32-NEXT: vrgather.vi v9, v8, 0 -; RV32-NEXT: vmsne.vi v0, v9, 0 +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v11, 10 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; RV32-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vmsne.vi v0, v10, 0 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v8, 10 -; RV32-NEXT: vse32.v v8, (a0), v0.t +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vse32.v v11, (a0), v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: constant_folding_crash: ; RV64: # %bb.0: # %entry +; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: ld a0, 8(a0) ; RV64-NEXT: andi a0, a0, 1 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64-NEXT: vmv.v.x v12, a0 -; RV64-NEXT: vmsne.vi v12, v12, 0 -; RV64-NEXT: vmv1r.v v13, v0 -; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vmv.v.x v13, a0 +; RV64-NEXT: vmsne.vi v0, v13, 0 ; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 -; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmv1r.v v0, v13 -; RV64-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64-NEXT: vrgather.vi v9, v8, 0 -; RV64-NEXT: vmsne.vi v0, v9, 0 +; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vmv.v.i v8, 10 -; RV64-NEXT: vse32.v v8, (a0), v0.t +; RV64-NEXT: vmv.v.i v10, 10 +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; RV64-NEXT: vmerge.vim v9, v9, 1, v0 +; 
RV64-NEXT: vrgather.vi v11, v9, 0 +; RV64-NEXT: vmsne.vi v0, v11, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vse32.v v10, (a0), v0.t ; RV64-NEXT: ret entry: %sunkaddr = getelementptr i8, ptr %v54, i64 8 diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll index d51f5eacd7d91..208735b18cbab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll @@ -13,6 +13,7 @@ define @ctlz_nxv1i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 @@ -20,10 +21,9 @@ define @ctlz_nxv1i8( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -37,11 +37,11 @@ define @ctlz_nxv1i8( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: li a0, 134 ; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9 ; CHECK-F-NEXT: vnsrl.wi v8, v8, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-F-NEXT: vnsrl.wi v8, v8, 0 -; CHECK-F-NEXT: li a0, 134 ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 8 ; CHECK-F-NEXT: vminu.vx v8, v8, a0 @@ -51,11 +51,11 @@ define @ctlz_nxv1i8( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-D-NEXT: vzext.vf2 v9, v8 +; CHECK-D-NEXT: li a0, 134 ; CHECK-D-NEXT: vfwcvt.f.xu.v v8, v9 ; CHECK-D-NEXT: vnsrl.wi v8, v8, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-D-NEXT: vnsrl.wi v8, v8, 0 -; CHECK-D-NEXT: li a0, 134 ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 8 ; CHECK-D-NEXT: vminu.vx v8, v8, a0 @@ -76,6 +76,7 @@ define @ctlz_nxv2i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 @@ -83,10 +84,9 @@ define @ctlz_nxv2i8( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -100,11 +100,11 @@ define @ctlz_nxv2i8( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: li a0, 134 ; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9 ; CHECK-F-NEXT: vnsrl.wi v8, v8, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-F-NEXT: vnsrl.wi v8, v8, 0 -; CHECK-F-NEXT: li a0, 134 ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 8 ; CHECK-F-NEXT: vminu.vx v8, v8, a0 @@ -114,11 +114,11 @@ define @ctlz_nxv2i8( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, 
e16, mf2, ta, ma ; CHECK-D-NEXT: vzext.vf2 v9, v8 +; CHECK-D-NEXT: li a0, 134 ; CHECK-D-NEXT: vfwcvt.f.xu.v v8, v9 ; CHECK-D-NEXT: vnsrl.wi v8, v8, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-D-NEXT: vnsrl.wi v8, v8, 0 -; CHECK-D-NEXT: li a0, 134 ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 8 ; CHECK-D-NEXT: vminu.vx v8, v8, a0 @@ -139,6 +139,7 @@ define @ctlz_nxv4i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 @@ -146,10 +147,9 @@ define @ctlz_nxv4i8( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -163,11 +163,11 @@ define @ctlz_nxv4i8( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: li a0, 134 ; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9 ; CHECK-F-NEXT: vnsrl.wi v8, v10, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-F-NEXT: vnsrl.wi v8, v8, 0 -; CHECK-F-NEXT: li a0, 134 ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 8 ; CHECK-F-NEXT: vminu.vx v8, v8, a0 @@ -177,11 +177,11 @@ define @ctlz_nxv4i8( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-D-NEXT: vzext.vf2 v9, v8 +; CHECK-D-NEXT: li a0, 134 ; CHECK-D-NEXT: vfwcvt.f.xu.v v10, v9 ; CHECK-D-NEXT: vnsrl.wi v8, v10, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-D-NEXT: vnsrl.wi v8, v8, 0 -; CHECK-D-NEXT: li a0, 134 ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 8 ; CHECK-D-NEXT: vminu.vx v8, v8, a0 @@ -202,6 +202,7 @@ define @ctlz_nxv8i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 @@ -209,10 +210,9 @@ define @ctlz_nxv8i8( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -226,11 +226,11 @@ define @ctlz_nxv8i8( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-F-NEXT: vzext.vf2 v10, v8 +; CHECK-F-NEXT: li a0, 134 ; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10 ; CHECK-F-NEXT: vnsrl.wi v8, v12, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-F-NEXT: vnsrl.wi v10, v8, 0 -; CHECK-F-NEXT: li a0, 134 ; CHECK-F-NEXT: vrsub.vx v8, v10, a0 ; CHECK-F-NEXT: li a0, 8 ; CHECK-F-NEXT: vminu.vx v8, v8, a0 @@ -240,11 +240,11 @@ define @ctlz_nxv8i8( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-D-NEXT: vzext.vf2 v10, v8 +; CHECK-D-NEXT: li a0, 134 ; CHECK-D-NEXT: vfwcvt.f.xu.v v12, v10 
; CHECK-D-NEXT: vnsrl.wi v8, v12, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-D-NEXT: vnsrl.wi v10, v8, 0 -; CHECK-D-NEXT: li a0, 134 ; CHECK-D-NEXT: vrsub.vx v8, v10, a0 ; CHECK-D-NEXT: li a0, 8 ; CHECK-D-NEXT: vminu.vx v8, v8, a0 @@ -265,6 +265,7 @@ define @ctlz_nxv16i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 @@ -272,10 +273,9 @@ define @ctlz_nxv16i8( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 -; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -289,11 +289,11 @@ define @ctlz_nxv16i8( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-F-NEXT: vzext.vf2 v12, v8 +; CHECK-F-NEXT: li a0, 134 ; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12 ; CHECK-F-NEXT: vnsrl.wi v8, v16, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-F-NEXT: vnsrl.wi v12, v8, 0 -; CHECK-F-NEXT: li a0, 134 ; CHECK-F-NEXT: vrsub.vx v8, v12, a0 ; CHECK-F-NEXT: li a0, 8 ; CHECK-F-NEXT: vminu.vx v8, v8, a0 @@ -303,11 +303,11 @@ define @ctlz_nxv16i8( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-D-NEXT: vzext.vf2 v12, v8 +; CHECK-D-NEXT: li a0, 134 ; CHECK-D-NEXT: vfwcvt.f.xu.v v16, v12 ; CHECK-D-NEXT: vnsrl.wi v8, v16, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-D-NEXT: vnsrl.wi v12, v8, 0 -; CHECK-D-NEXT: li a0, 134 ; CHECK-D-NEXT: vrsub.vx v8, v12, a0 ; CHECK-D-NEXT: li a0, 8 ; CHECK-D-NEXT: vminu.vx v8, v8, a0 @@ -328,6 +328,7 @@ define @ctlz_nxv32i8( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vsrl.vi v12, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v12 @@ -335,10 +336,9 @@ define @ctlz_nxv32i8( %va) { ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -363,6 +363,7 @@ define @ctlz_nxv64i8( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vsrl.vi v16, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v16 @@ -370,10 +371,9 @@ define @ctlz_nxv64i8( %va) { ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -398,7 +398,9 @@ define @ctlz_nxv1i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: vor.vv 
v8, v8, v9 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 @@ -407,20 +409,18 @@ define @ctlz_nxv1i16( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 5 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -431,8 +431,8 @@ define @ctlz_nxv1i16( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8 -; CHECK-F-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 16 ; CHECK-F-NEXT: vminu.vx v8, v8, a0 @@ -442,8 +442,8 @@ define @ctlz_nxv1i16( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v9, v8 -; CHECK-D-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-D-NEXT: li a0, 142 +; CHECK-D-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 16 ; CHECK-D-NEXT: vminu.vx v8, v8, a0 @@ -464,7 +464,9 @@ define @ctlz_nxv2i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 @@ -473,20 +475,18 @@ define @ctlz_nxv2i16( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 5 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -497,8 +497,8 @@ define @ctlz_nxv2i16( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8 -; CHECK-F-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 16 ; CHECK-F-NEXT: vminu.vx v8, v8, a0 @@ -508,8 +508,8 @@ define @ctlz_nxv2i16( %va) { ; CHECK-D: # %bb.0: ; 
CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v9, v8 -; CHECK-D-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-D-NEXT: li a0, 142 +; CHECK-D-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 16 ; CHECK-D-NEXT: vminu.vx v8, v8, a0 @@ -530,7 +530,9 @@ define @ctlz_nxv4i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 @@ -539,20 +541,18 @@ define @ctlz_nxv4i16( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 5 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -563,8 +563,8 @@ define @ctlz_nxv4i16( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v8 -; CHECK-F-NEXT: vnsrl.wi v8, v10, 23 ; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vnsrl.wi v8, v10, 23 ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 16 ; CHECK-F-NEXT: vminu.vx v8, v8, a0 @@ -574,8 +574,8 @@ define @ctlz_nxv4i16( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v10, v8 -; CHECK-D-NEXT: vnsrl.wi v8, v10, 23 ; CHECK-D-NEXT: li a0, 142 +; CHECK-D-NEXT: vnsrl.wi v8, v10, 23 ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 16 ; CHECK-D-NEXT: vminu.vx v8, v8, a0 @@ -596,7 +596,9 @@ define @ctlz_nxv8i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 @@ -605,20 +607,18 @@ define @ctlz_nxv8i16( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 5 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v10, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; 
CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -629,8 +629,8 @@ define @ctlz_nxv8i16( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v8 -; CHECK-F-NEXT: vnsrl.wi v8, v12, 23 ; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vnsrl.wi v8, v12, 23 ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 16 ; CHECK-F-NEXT: vminu.vx v8, v8, a0 @@ -640,8 +640,8 @@ define @ctlz_nxv8i16( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v12, v8 -; CHECK-D-NEXT: vnsrl.wi v8, v12, 23 ; CHECK-D-NEXT: li a0, 142 +; CHECK-D-NEXT: vnsrl.wi v8, v12, 23 ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 16 ; CHECK-D-NEXT: vminu.vx v8, v8, a0 @@ -662,7 +662,9 @@ define @ctlz_nxv16i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 @@ -671,20 +673,18 @@ define @ctlz_nxv16i16( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 5 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v12, v12, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vand.vx v12, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v12, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -695,8 +695,8 @@ define @ctlz_nxv16i16( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v8 -; CHECK-F-NEXT: vnsrl.wi v8, v16, 23 ; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vnsrl.wi v8, v16, 23 ; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 16 ; CHECK-F-NEXT: vminu.vx v8, v8, a0 @@ -706,8 +706,8 @@ define @ctlz_nxv16i16( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-D-NEXT: vfwcvt.f.xu.v v16, v8 -; CHECK-D-NEXT: vnsrl.wi v8, v16, 23 ; CHECK-D-NEXT: li a0, 142 +; CHECK-D-NEXT: vnsrl.wi v8, v16, 23 ; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 16 ; CHECK-D-NEXT: vminu.vx v8, v8, a0 @@ -728,7 +728,9 @@ define @ctlz_nxv32i16( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v16, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vsrl.vi v16, v8, 4 @@ -737,20 +739,18 @@ define @ctlz_nxv32i16( %va) { ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; 
CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -772,7 +772,9 @@ define @ctlz_nxv1i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 @@ -783,20 +785,18 @@ define @ctlz_nxv1i32( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 349525 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -809,8 +809,8 @@ define @ctlz_nxv1i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: vrsub.vx v8, v8, a1 ; CHECK-F-NEXT: li a1, 32 ; CHECK-F-NEXT: vminu.vx v8, v8, a1 @@ -844,7 +844,9 @@ define @ctlz_nxv2i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 @@ -855,20 +857,18 @@ define @ctlz_nxv2i32( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 349525 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -881,8 +881,8 @@ define @ctlz_nxv2i32( %va) { ; CHECK-F-NEXT: fsrmi 
a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: vrsub.vx v8, v8, a1 ; CHECK-F-NEXT: li a1, 32 ; CHECK-F-NEXT: vminu.vx v8, v8, a1 @@ -916,7 +916,9 @@ define @ctlz_nxv4i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 @@ -927,20 +929,18 @@ define @ctlz_nxv4i32( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 349525 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v10, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -953,8 +953,8 @@ define @ctlz_nxv4i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: vrsub.vx v8, v8, a1 ; CHECK-F-NEXT: li a1, 32 ; CHECK-F-NEXT: vminu.vx v8, v8, a1 @@ -988,7 +988,9 @@ define @ctlz_nxv8i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 @@ -999,20 +1001,18 @@ define @ctlz_nxv8i32( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 349525 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v12, v12, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vand.vx v12, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v12, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -1025,8 +1025,8 @@ define @ctlz_nxv8i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, 
v8, 23 ; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: vrsub.vx v8, v8, a1 ; CHECK-F-NEXT: li a1, 32 ; CHECK-F-NEXT: vminu.vx v8, v8, a1 @@ -1060,7 +1060,9 @@ define @ctlz_nxv16i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v16 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 4 @@ -1071,20 +1073,18 @@ define @ctlz_nxv16i32( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 349525 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v16, v16, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: vand.vx v16, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v16, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v16 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -1097,8 +1097,8 @@ define @ctlz_nxv16i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: vrsub.vx v8, v8, a1 ; CHECK-F-NEXT: li a1, 32 ; CHECK-F-NEXT: vminu.vx v8, v8, a1 @@ -1110,8 +1110,8 @@ define @ctlz_nxv16i32( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: vsrl.vi v8, v8, 23 ; CHECK-D-NEXT: li a1, 158 +; CHECK-D-NEXT: vsrl.vi v8, v8, 23 ; CHECK-D-NEXT: vrsub.vx v8, v8, a1 ; CHECK-D-NEXT: li a1, 32 ; CHECK-D-NEXT: vminu.vx v8, v8, a1 @@ -1133,6 +1133,12 @@ define @ctlz_nxv1i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32I-NEXT: vmv.v.x v10, a0 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; RV32I-NEXT: vor.vv v8, v8, v9 ; RV32I-NEXT: vsrl.vi v9, v8, 2 ; RV32I-NEXT: vor.vv v8, v8, v9 @@ -1142,40 +1148,34 @@ define @ctlz_nxv1i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v9 ; RV32I-NEXT: vsrl.vi v9, v8, 16 ; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: li a0, 32 ; RV32I-NEXT: vsrl.vx v9, v8, a0 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vor.vv v8, v8, v9 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv 
v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1185,6 +1185,23 @@ define @ctlz_nxv1i64( %va) { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: lui a1, 209715 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: lui a3, 4112 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: addiw a3, a3, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: li a4, 32 ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 2 ; RV64I-NEXT: vor.vv v8, v8, v9 @@ -1194,37 +1211,20 @@ define @ctlz_nxv1i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v9 -; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: vsrl.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vx v9, v8, a4 ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v9, v9, a0 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vand.vx v9, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1232,16 +1232,16 @@ define @ctlz_nxv1i64( %va) { ; CHECK-F-LABEL: ctlz_nxv1i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; CHECK-F-NEXT: vmv.v.x v9, a0 -; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 -; CHECK-F-NEXT: vsrl.vi 
v8, v10, 23 -; CHECK-F-NEXT: vwsubu.vv v10, v9, v8 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: fsrmi a1, 1 +; CHECK-F-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8 +; CHECK-F-NEXT: vmv.v.x v8, a0 +; CHECK-F-NEXT: vsrl.vi v9, v9, 23 +; CHECK-F-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-F-NEXT: vminu.vx v8, v10, a1 -; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vminu.vx v8, v10, a0 +; CHECK-F-NEXT: fsrm a1 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv1i64: @@ -1273,6 +1273,12 @@ define @ctlz_nxv2i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32I-NEXT: vmv.v.x v12, a0 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma ; RV32I-NEXT: vor.vv v8, v8, v10 ; RV32I-NEXT: vsrl.vi v10, v8, 2 ; RV32I-NEXT: vor.vv v8, v8, v10 @@ -1282,40 +1288,34 @@ define @ctlz_nxv2i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v10 ; RV32I-NEXT: vsrl.vi v10, v8, 16 ; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: li a0, 32 ; RV32I-NEXT: vsrl.vx v10, v8, a0 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vor.vv v8, v8, v10 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1325,6 +1325,23 @@ define @ctlz_nxv2i64( %va) { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: lui a1, 209715 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: lui a3, 4112 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: addiw a3, a3, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: 
add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: li a4, 32 ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 2 ; RV64I-NEXT: vor.vv v8, v8, v10 @@ -1334,37 +1351,20 @@ define @ctlz_nxv2i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v10 -; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: vsrl.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vx v10, v8, a4 ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v10, v10, a0 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vand.vx v10, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1372,16 +1372,16 @@ define @ctlz_nxv2i64( %va) { ; CHECK-F-LABEL: ctlz_nxv2i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-F-NEXT: vmv.v.x v10, a0 -; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v11, v8 -; CHECK-F-NEXT: vsrl.vi v8, v11, 23 -; CHECK-F-NEXT: vwsubu.vv v12, v10, v8 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: fsrmi a1, 1 +; CHECK-F-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 +; CHECK-F-NEXT: vmv.v.x v8, a0 +; CHECK-F-NEXT: vsrl.vi v9, v10, 23 +; CHECK-F-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-F-NEXT: vminu.vx v8, v12, a1 -; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vminu.vx v8, v10, a0 +; CHECK-F-NEXT: fsrm a1 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv2i64: @@ -1413,6 +1413,12 @@ define @ctlz_nxv4i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; RV32I-NEXT: vmv.v.x v16, a0 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; RV32I-NEXT: vor.vv v8, v8, v12 ; RV32I-NEXT: vsrl.vi v12, v8, 2 ; RV32I-NEXT: vor.vv v8, v8, v12 @@ -1422,40 +1428,34 @@ define @ctlz_nxv4i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v12 ; RV32I-NEXT: vsrl.vi v12, v8, 16 ; RV32I-NEXT: vor.vv v8, v8, v12 -; RV32I-NEXT: li a0, 32 ; RV32I-NEXT: vsrl.vx v12, v8, a0 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vor.vv v8, v8, v12 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v12, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; 
RV32I-NEXT: vand.vv v12, v12, v16 -; RV32I-NEXT: vsub.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v16, v8, v12 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vadd.vv v8, v16, v8 -; RV32I-NEXT: vsrl.vi v12, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: vand.vv v12, v8, v16 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: vmul.vv v8, v8, v12 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1465,6 +1465,23 @@ define @ctlz_nxv4i64( %va) { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: lui a1, 209715 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: lui a3, 4112 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: addiw a3, a3, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: li a4, 32 ; RV64I-NEXT: vor.vv v8, v8, v12 ; RV64I-NEXT: vsrl.vi v12, v8, 2 ; RV64I-NEXT: vor.vv v8, v8, v12 @@ -1474,37 +1491,20 @@ define @ctlz_nxv4i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v12 ; RV64I-NEXT: vsrl.vi v12, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v12 -; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: vsrl.vx v12, v8, a0 +; RV64I-NEXT: vsrl.vx v12, v8, a4 ; RV64I-NEXT: vor.vv v8, v8, v12 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v12, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v12, v12, a0 ; RV64I-NEXT: vsub.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vand.vx v12, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v12, v8 ; RV64I-NEXT: vsrl.vi v12, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; 
RV64I-NEXT: ret @@ -1512,16 +1512,16 @@ define @ctlz_nxv4i64( %va) { ; CHECK-F-LABEL: ctlz_nxv4i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; CHECK-F-NEXT: vmv.v.x v12, a0 -; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v14, v8 -; CHECK-F-NEXT: vsrl.vi v8, v14, 23 -; CHECK-F-NEXT: vwsubu.vv v16, v12, v8 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: fsrmi a1, 1 +; CHECK-F-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 +; CHECK-F-NEXT: vmv.v.x v8, a0 +; CHECK-F-NEXT: vsrl.vi v10, v12, 23 +; CHECK-F-NEXT: vwsubu.vv v12, v8, v10 +; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-F-NEXT: vminu.vx v8, v16, a1 -; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vminu.vx v8, v12, a0 +; CHECK-F-NEXT: fsrm a1 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv4i64: @@ -1553,6 +1553,12 @@ define @ctlz_nxv8i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32I-NEXT: vmv.v.x v24, a0 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32I-NEXT: vor.vv v8, v8, v16 ; RV32I-NEXT: vsrl.vi v16, v8, 2 ; RV32I-NEXT: vor.vv v8, v8, v16 @@ -1562,41 +1568,35 @@ define @ctlz_nxv8i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v16 ; RV32I-NEXT: vsrl.vi v16, v8, 16 ; RV32I-NEXT: vor.vv v8, v8, v16 -; RV32I-NEXT: li a0, 32 ; RV32I-NEXT: vsrl.vx v16, v8, a0 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vor.vv v8, v8, v16 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v16, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v24, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v16, v16, v24 -; RV32I-NEXT: vsub.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vv v24, v16, v24 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v24 ; RV32I-NEXT: vand.vv v24, v8, v16 ; RV32I-NEXT: vsrl.vi v8, v8, 2 ; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vadd.vv v8, v24, v8 -; RV32I-NEXT: vsrl.vi v16, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 61681 -; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vadd.vv v8, v24, v8 +; RV32I-NEXT: vsrl.vi v24, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v24 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 +; RV32I-NEXT: vmv.v.x v24, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vmul.vv v8, v8, v16 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vmul.vv v8, v8, v24 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret @@ -1605,6 +1605,23 @@ define @ctlz_nxv8i64( %va) { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: lui a1, 209715 +; 
RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: lui a3, 4112 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: addiw a3, a3, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: li a4, 32 ; RV64I-NEXT: vor.vv v8, v8, v16 ; RV64I-NEXT: vsrl.vi v16, v8, 2 ; RV64I-NEXT: vor.vv v8, v8, v16 @@ -1614,37 +1631,20 @@ define @ctlz_nxv8i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v16 ; RV64I-NEXT: vsrl.vi v16, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v16 -; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: vsrl.vx v16, v8, a0 +; RV64I-NEXT: vsrl.vx v16, v8, a4 ; RV64I-NEXT: vor.vv v8, v8, v16 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v16, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v16, v16, a0 ; RV64I-NEXT: vsub.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v8, a0 +; RV64I-NEXT: vand.vx v16, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v16, v8 ; RV64I-NEXT: vsrl.vi v16, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1652,16 +1652,16 @@ define @ctlz_nxv8i64( %va) { ; CHECK-F-LABEL: ctlz_nxv8i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; CHECK-F-NEXT: vmv.v.x v16, a0 -; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v20, v8 -; CHECK-F-NEXT: vsrl.vi v8, v20, 23 -; CHECK-F-NEXT: vwsubu.vv v24, v16, v8 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: fsrmi a1, 1 +; CHECK-F-NEXT: vsetvli a2, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 +; CHECK-F-NEXT: vmv.v.x v8, a0 +; CHECK-F-NEXT: vsrl.vi v12, v16, 23 +; CHECK-F-NEXT: vwsubu.vv v16, v8, v12 +; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-F-NEXT: vminu.vx v8, v24, a1 -; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vminu.vx v8, v16, a0 +; CHECK-F-NEXT: fsrm a1 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv8i64: @@ -1693,6 +1693,7 @@ define @ctlz_zero_undef_nxv1i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 @@ -1700,10 +1701,9 @@ define @ctlz_zero_undef_nxv1i8( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; 
CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -1751,6 +1751,7 @@ define @ctlz_zero_undef_nxv2i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 @@ -1758,10 +1759,9 @@ define @ctlz_zero_undef_nxv2i8( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -1809,6 +1809,7 @@ define @ctlz_zero_undef_nxv4i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 @@ -1816,10 +1817,9 @@ define @ctlz_zero_undef_nxv4i8( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -1867,6 +1867,7 @@ define @ctlz_zero_undef_nxv8i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 @@ -1874,10 +1875,9 @@ define @ctlz_zero_undef_nxv8i8( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -1925,6 +1925,7 @@ define @ctlz_zero_undef_nxv16i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 @@ -1932,10 +1933,9 @@ define @ctlz_zero_undef_nxv16i8( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 -; CHECK-ZVE64X-NEXT: li a0, 85 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -1983,6 +1983,7 @@ define @ctlz_zero_undef_nxv32i8( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: 
vsrl.vi v12, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v12 @@ -1990,10 +1991,9 @@ define @ctlz_zero_undef_nxv32i8( %va) { ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -2017,6 +2017,7 @@ define @ctlz_zero_undef_nxv64i8( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vsrl.vi v16, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v16 @@ -2024,10 +2025,9 @@ define @ctlz_zero_undef_nxv64i8( %va) { ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -2051,7 +2051,9 @@ define @ctlz_zero_undef_nxv1i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 @@ -2060,20 +2062,18 @@ define @ctlz_zero_undef_nxv1i16( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 5 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -2112,7 +2112,9 @@ define @ctlz_zero_undef_nxv2i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 @@ -2121,20 +2123,18 @@ define @ctlz_zero_undef_nxv2i16( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 5 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; 
CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -2173,7 +2173,9 @@ define @ctlz_zero_undef_nxv4i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 @@ -2182,20 +2184,18 @@ define @ctlz_zero_undef_nxv4i16( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 5 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -2234,7 +2234,9 @@ define @ctlz_zero_undef_nxv8i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 @@ -2243,20 +2245,18 @@ define @ctlz_zero_undef_nxv8i16( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 5 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v10, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -2295,7 +2295,9 @@ define @ctlz_zero_undef_nxv16i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 @@ -2304,20 +2306,18 @@ define @ctlz_zero_undef_nxv16i16( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vnot.v v8, 
v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 5 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v12, v12, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vand.vx v12, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v12, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -2356,7 +2356,9 @@ define @ctlz_zero_undef_nxv32i16( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v16, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vsrl.vi v16, v8, 4 @@ -2365,20 +2367,18 @@ define @ctlz_zero_undef_nxv32i16( %va) { ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -2399,7 +2399,9 @@ define @ctlz_zero_undef_nxv1i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 @@ -2410,20 +2412,18 @@ define @ctlz_zero_undef_nxv1i32( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 349525 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -2466,7 +2466,9 @@ define @ctlz_zero_undef_nxv2i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: 
vor.vv v8, v8, v9 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 @@ -2477,20 +2479,18 @@ define @ctlz_zero_undef_nxv2i32( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 349525 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -2533,7 +2533,9 @@ define @ctlz_zero_undef_nxv4i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 @@ -2544,20 +2546,18 @@ define @ctlz_zero_undef_nxv4i32( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 349525 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v10, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -2600,7 +2600,9 @@ define @ctlz_zero_undef_nxv8i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 @@ -2611,20 +2613,18 @@ define @ctlz_zero_undef_nxv8i32( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 349525 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v12, v12, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vand.vx v12, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; 
CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v12, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -2667,7 +2667,9 @@ define @ctlz_zero_undef_nxv16i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 1 +; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v16 +; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 2 ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 4 @@ -2678,20 +2680,18 @@ define @ctlz_zero_undef_nxv16i32( %va) { ; CHECK-ZVE64X-NEXT: vor.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: vnot.v v8, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 1 -; CHECK-ZVE64X-NEXT: lui a0, 349525 -; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 ; CHECK-ZVE64X-NEXT: vand.vx v16, v16, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: vand.vx v16, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v16, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v16 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -2735,6 +2735,12 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32I-NEXT: vmv.v.x v10, a0 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; RV32I-NEXT: vor.vv v8, v8, v9 ; RV32I-NEXT: vsrl.vi v9, v8, 2 ; RV32I-NEXT: vor.vv v8, v8, v9 @@ -2744,40 +2750,34 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v9 ; RV32I-NEXT: vsrl.vi v9, v8, 16 ; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: li a0, 32 ; RV32I-NEXT: vsrl.vx v9, v8, a0 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vor.vv v8, v8, v9 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma 
-; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -2787,6 +2787,23 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: lui a1, 209715 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: lui a3, 4112 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: addiw a3, a3, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: li a4, 32 ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 2 ; RV64I-NEXT: vor.vv v8, v8, v9 @@ -2796,37 +2813,20 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v9 -; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: vsrl.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vx v9, v8, a4 ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v9, v9, a0 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vand.vx v9, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -2869,6 +2869,12 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32I-NEXT: vmv.v.x v12, a0 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma ; RV32I-NEXT: vor.vv v8, v8, v10 ; RV32I-NEXT: vsrl.vi v10, v8, 2 ; RV32I-NEXT: vor.vv v8, v8, v10 @@ -2878,40 +2884,34 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v10 ; RV32I-NEXT: vsrl.vi v10, v8, 16 ; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: li a0, 32 ; RV32I-NEXT: vsrl.vx v10, v8, a0 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vor.vv v8, v8, 
v10 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -2921,6 +2921,23 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: lui a1, 209715 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: lui a3, 4112 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: addiw a3, a3, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: li a4, 32 ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 2 ; RV64I-NEXT: vor.vv v8, v8, v10 @@ -2930,37 +2947,20 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v10 -; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: vsrl.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vx v10, v8, a4 ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v10, v10, a0 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vand.vx v10, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; 
RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -3003,6 +3003,12 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; RV32I-NEXT: vmv.v.x v16, a0 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; RV32I-NEXT: vor.vv v8, v8, v12 ; RV32I-NEXT: vsrl.vi v12, v8, 2 ; RV32I-NEXT: vor.vv v8, v8, v12 @@ -3012,40 +3018,34 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v12 ; RV32I-NEXT: vsrl.vi v12, v8, 16 ; RV32I-NEXT: vor.vv v8, v8, v12 -; RV32I-NEXT: li a0, 32 ; RV32I-NEXT: vsrl.vx v12, v8, a0 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vor.vv v8, v8, v12 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v12, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32I-NEXT: vand.vv v12, v12, v16 -; RV32I-NEXT: vsub.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v16, v8, v12 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vadd.vv v8, v16, v8 -; RV32I-NEXT: vsrl.vi v12, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: vand.vv v12, v8, v16 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: vmul.vv v8, v8, v12 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -3055,6 +3055,23 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: lui a1, 209715 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: lui a3, 4112 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: addiw a3, a3, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: li a4, 32 ; RV64I-NEXT: vor.vv v8, v8, v12 ; RV64I-NEXT: vsrl.vi v12, v8, 2 ; RV64I-NEXT: vor.vv v8, v8, v12 @@ -3064,37 +3081,20 @@ 
define @ctlz_zero_undef_nxv4i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v12 ; RV64I-NEXT: vsrl.vi v12, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v12 -; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: vsrl.vx v12, v8, a0 +; RV64I-NEXT: vsrl.vx v12, v8, a4 ; RV64I-NEXT: vor.vv v8, v8, v12 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v12, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v12, v12, a0 ; RV64I-NEXT: vsub.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vand.vx v12, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v12, v8 ; RV64I-NEXT: vsrl.vi v12, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -3137,6 +3137,12 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32I-NEXT: vmv.v.x v24, a0 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32I-NEXT: vor.vv v8, v8, v16 ; RV32I-NEXT: vsrl.vi v16, v8, 2 ; RV32I-NEXT: vor.vv v8, v8, v16 @@ -3146,41 +3152,35 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v16 ; RV32I-NEXT: vsrl.vi v16, v8, 16 ; RV32I-NEXT: vor.vv v8, v8, v16 -; RV32I-NEXT: li a0, 32 ; RV32I-NEXT: vsrl.vx v16, v8, a0 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 ; RV32I-NEXT: vor.vv v8, v8, v16 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v16, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v24, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v16, v16, v24 -; RV32I-NEXT: vsub.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vv v24, v16, v24 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v24 ; RV32I-NEXT: vand.vv v24, v8, v16 ; RV32I-NEXT: vsrl.vi v8, v8, 2 ; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vadd.vv v8, v24, v8 -; RV32I-NEXT: vsrl.vi v16, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 61681 -; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vadd.vv v8, v24, v8 +; RV32I-NEXT: vsrl.vi v24, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v24 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, 
ma -; RV32I-NEXT: vmv.v.x v16, a0 +; RV32I-NEXT: vmv.v.x v24, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vmul.vv v8, v8, v16 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vmul.vv v8, v8, v24 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret @@ -3189,6 +3189,23 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: lui a1, 209715 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: lui a3, 4112 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: addiw a3, a3, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: li a4, 32 ; RV64I-NEXT: vor.vv v8, v8, v16 ; RV64I-NEXT: vsrl.vi v16, v8, 2 ; RV64I-NEXT: vor.vv v8, v8, v16 @@ -3198,37 +3215,20 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v16 ; RV64I-NEXT: vsrl.vi v16, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v16 -; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: vsrl.vx v16, v8, a0 +; RV64I-NEXT: vsrl.vx v16, v8, a4 ; RV64I-NEXT: vor.vv v8, v8, v16 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v16, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v16, v16, a0 ; RV64I-NEXT: vsub.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v8, a0 +; RV64I-NEXT: vand.vx v16, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v16, v8 ; RV64I-NEXT: vsrl.vi v16, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll index 2c9f633b89014..f56a792fdef6a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll @@ -15,6 +15,7 @@ define @vp_ctlz_nxv1i8( %va, @vp_ctlz_nxv1i8( %va, @vp_ctlz_nxv1i8_unmasked( %va, i32 zer ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: li a0, 134 ; CHECK-NEXT: vfwcvt.f.xu.v v8, v9 ; CHECK-NEXT: vnsrl.wi v8, v8, 23 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0 -; CHECK-NEXT: li a0, 134 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 8 ; CHECK-NEXT: vminu.vx v8, v8, a0 @@ -68,6 +68,7 @@ define @vp_ctlz_nxv2i8( %va, @vp_ctlz_nxv2i8( %va, @vp_ctlz_nxv2i8_unmasked( %va, i32 zer ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: li a0, 134 ; CHECK-NEXT: vfwcvt.f.xu.v v8, v9 ; CHECK-NEXT: vnsrl.wi v8, v8, 23 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; 
CHECK-NEXT: vnsrl.wi v8, v8, 0 -; CHECK-NEXT: li a0, 134 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 8 ; CHECK-NEXT: vminu.vx v8, v8, a0 @@ -121,6 +121,7 @@ define @vp_ctlz_nxv4i8( %va, @vp_ctlz_nxv4i8( %va, @vp_ctlz_nxv4i8_unmasked( %va, i32 zer ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: li a0, 134 ; CHECK-NEXT: vfwcvt.f.xu.v v10, v9 ; CHECK-NEXT: vnsrl.wi v8, v10, 23 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0 -; CHECK-NEXT: li a0, 134 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 8 ; CHECK-NEXT: vminu.vx v8, v8, a0 @@ -174,6 +174,7 @@ define @vp_ctlz_nxv8i8( %va, @vp_ctlz_nxv8i8( %va, @vp_ctlz_nxv8i8_unmasked( %va, i32 zer ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: li a0, 134 ; CHECK-NEXT: vfwcvt.f.xu.v v12, v10 ; CHECK-NEXT: vnsrl.wi v8, v12, 23 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vnsrl.wi v10, v8, 0 -; CHECK-NEXT: li a0, 134 ; CHECK-NEXT: vrsub.vx v8, v10, a0 ; CHECK-NEXT: li a0, 8 ; CHECK-NEXT: vminu.vx v8, v8, a0 @@ -227,6 +227,7 @@ define @vp_ctlz_nxv16i8( %va, @vp_ctlz_nxv16i8( %va, @vp_ctlz_nxv16i8_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vzext.vf2 v12, v8 +; CHECK-NEXT: li a0, 134 ; CHECK-NEXT: vfwcvt.f.xu.v v16, v12 ; CHECK-NEXT: vnsrl.wi v8, v16, 23 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-NEXT: vnsrl.wi v12, v8, 0 -; CHECK-NEXT: li a0, 134 ; CHECK-NEXT: vrsub.vx v8, v12, a0 ; CHECK-NEXT: li a0, 8 ; CHECK-NEXT: vminu.vx v8, v8, a0 @@ -280,6 +280,7 @@ define @vp_ctlz_nxv32i8( %va, @vp_ctlz_nxv32i8( %va, @vp_ctlz_nxv32i8_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vsrl.vi v12, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v12 @@ -321,10 +322,9 @@ define @vp_ctlz_nxv32i8_unmasked( %va, i32 ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -350,6 +350,7 @@ define @vp_ctlz_nxv64i8( %va, @vp_ctlz_nxv64i8( %va, @vp_ctlz_nxv64i8_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vsrl.vi v16, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v16 @@ -391,10 +392,9 @@ define @vp_ctlz_nxv64i8_unmasked( %va, i32 ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -420,11 +420,11 @@ define @vp_ctlz_nxv1i16( %va, @vp_ctlz_nxv1i16_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 -; CHECK-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-NEXT: li a0, 142 +; CHECK-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: vminu.vx v8, v8, a0 @@ -467,11 +467,11 @@ define 
@vp_ctlz_nxv2i16( %va, @vp_ctlz_nxv2i16_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 -; CHECK-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-NEXT: li a0, 142 +; CHECK-NEXT: vnsrl.wi v8, v9, 23 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: vminu.vx v8, v8, a0 @@ -514,11 +514,11 @@ define @vp_ctlz_nxv4i16( %va, @vp_ctlz_nxv4i16_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v10, v8 -; CHECK-NEXT: vnsrl.wi v8, v10, 23 ; CHECK-NEXT: li a0, 142 +; CHECK-NEXT: vnsrl.wi v8, v10, 23 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: vminu.vx v8, v8, a0 @@ -561,11 +561,11 @@ define @vp_ctlz_nxv8i16( %va, @vp_ctlz_nxv8i16_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v12, v8 -; CHECK-NEXT: vnsrl.wi v8, v12, 23 ; CHECK-NEXT: li a0, 142 +; CHECK-NEXT: vnsrl.wi v8, v12, 23 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: vminu.vx v8, v8, a0 @@ -608,11 +608,11 @@ define @vp_ctlz_nxv16i16( %va, @vp_ctlz_nxv16i16_unmasked( %va, i ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v16, v8 -; CHECK-NEXT: vnsrl.wi v8, v16, 23 ; CHECK-NEXT: li a0, 142 +; CHECK-NEXT: vnsrl.wi v8, v16, 23 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: vminu.vx v8, v8, a0 @@ -655,7 +655,9 @@ define @vp_ctlz_nxv32i16( %va, @vp_ctlz_nxv32i16( %va, @vp_ctlz_nxv32i16_unmasked( %va, i ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v16, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vsrl.vi v16, v8, 4 @@ -707,20 +709,18 @@ define @vp_ctlz_nxv32i16_unmasked( %va, i ; CHECK-NEXT: vor.vv v8, v8, v16 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -746,9 +746,9 @@ define @vp_ctlz_nxv1i32( %va, @vp_ctlz_nxv2i32( %va, @vp_ctlz_nxv4i32( %va, @vp_ctlz_nxv8i32( %va, @vp_ctlz_nxv16i32( %va, @vp_ctlz_nxv16i32_unmasked( %va, i ; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-NEXT: vsrl.vi v8, v8, 23 ; CHECK-NEXT: li a0, 158 +; CHECK-NEXT: vsrl.vi v8, v8, 23 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vminu.vx v8, v8, a0 @@ -1237,20 +1237,20 @@ define @vp_ctlz_nxv16i64( %va, @vp_ctlz_nxv16i64( %va, @vp_ctlz_nxv16i64_unmasked( %va, i ; CHECK-LABEL: vp_ctlz_nxv16i64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: fsrmi a3, 1 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: 
vfcvt.f.xu.v v16, v16 -; CHECK-NEXT: fsrm a3 +; CHECK-NEXT: fsrmi a4, 1 ; CHECK-NEXT: li a2, 52 -; CHECK-NEXT: vsrl.vx v16, v16, a2 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a5, a0, a3 +; CHECK-NEXT: addi a5, a5, -1 +; CHECK-NEXT: and a5, a5, a3 ; CHECK-NEXT: li a3, 1086 +; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, ma +; CHECK-NEXT: vfcvt.f.xu.v v16, v16 +; CHECK-NEXT: fsrm a4 +; CHECK-NEXT: vsrl.vx v16, v16, a2 ; CHECK-NEXT: vrsub.vx v16, v16, a3 ; CHECK-NEXT: li a4, 64 ; CHECK-NEXT: vminu.vx v16, v16, a4 @@ -1349,6 +1349,7 @@ define @vp_ctlz_zero_undef_nxv1i8( %va, @vp_ctlz_zero_undef_nxv1i8( %va, @vp_ctlz_zero_undef_nxv2i8( %va, @vp_ctlz_zero_undef_nxv2i8( %va, @vp_ctlz_zero_undef_nxv4i8( %va, @vp_ctlz_zero_undef_nxv4i8( %va, @vp_ctlz_zero_undef_nxv8i8( %va, @vp_ctlz_zero_undef_nxv8i8( %va, @vp_ctlz_zero_undef_nxv16i8( %va, @vp_ctlz_zero_undef_nxv16i8( %va, @vp_ctlz_zero_undef_nxv32i8( %va, @vp_ctlz_zero_undef_nxv32i8( %va, @vp_ctlz_zero_undef_nxv32i8_unmasked( @vp_ctlz_zero_undef_nxv32i8_unmasked( @vp_ctlz_zero_undef_nxv64i8( %va, @vp_ctlz_zero_undef_nxv64i8( %va, @vp_ctlz_zero_undef_nxv64i8_unmasked( @vp_ctlz_zero_undef_nxv64i8_unmasked( @vp_ctlz_zero_undef_nxv1i16( %va, @vp_ctlz_zero_undef_nxv2i16( %va, @vp_ctlz_zero_undef_nxv4i16( %va, @vp_ctlz_zero_undef_nxv8i16( %va, @vp_ctlz_zero_undef_nxv16i16( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvt.f.xu.v v16, v8, v0.t +; CHECK-NEXT: li a0, 142 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vsrl.vi v8, v16, 23, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t -; CHECK-NEXT: li a0, 142 ; CHECK-NEXT: vrsub.vx v8, v16, a0, v0.t ; CHECK-NEXT: ret ; @@ -1937,7 +1937,9 @@ define @vp_ctlz_zero_undef_nxv32i16( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v16, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v16, v0.t ; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t @@ -1946,20 +1948,18 @@ define @vp_ctlz_zero_undef_nxv32i16( %va, ; CHECK-NEXT: vor.vv v8, v8, v16, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v16, v16, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v16, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v16, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v16, v8, v0.t -; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: vsub.vv v16, v8, v16, v0.t +; CHECK-NEXT: vand.vx v8, v16, a0, v0.t +; CHECK-NEXT: vsrl.vi v16, v16, 2, v0.t +; CHECK-NEXT: vand.vx v16, v16, a0, v0.t ; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: addi a0, a0, -241 +; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -1980,7 +1980,9 @@ define @vp_ctlz_zero_undef_nxv32i16_unmasked( @vp_ctlz_zero_undef_nxv32i16_unmasked( @vp_ctlz_zero_undef_nxv1i32( %va, @vp_ctlz_zero_undef_nxv2i32( %va, @vp_ctlz_zero_undef_nxv4i32( %va, @vp_ctlz_zero_undef_nxv8i32( %va, @vp_ctlz_zero_undef_nxv16i32( %va, ; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: 
vfcvt.f.xu.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 23, v0.t ; CHECK-NEXT: li a0, 158 +; CHECK-NEXT: vsrl.vi v8, v8, 23, v0.t ; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: ret @@ -2467,18 +2467,18 @@ define @vp_ctlz_zero_undef_nxv16i64( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: fsrmi a3, 1 ; CHECK-NEXT: srli a2, a1, 3 -; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; CHECK-NEXT: sub a4, a0, a1 +; CHECK-NEXT: vsetvli a5, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: fsrmi a3, 1 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: sltu a2, a0, a4 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a4, a2, a4 +; CHECK-NEXT: li a2, 52 +; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v16, v16, v0.t ; CHECK-NEXT: fsrm a3 -; CHECK-NEXT: li a2, 52 ; CHECK-NEXT: vsrl.vx v16, v16, a2, v0.t ; CHECK-NEXT: li a3, 1086 ; CHECK-NEXT: vrsub.vx v16, v16, a3, v0.t @@ -2500,12 +2500,12 @@ define @vp_ctlz_zero_undef_nxv16i64( %va, ; CHECK-ZVBB-NEXT: vmv1r.v v24, v0 ; CHECK-ZVBB-NEXT: csrr a1, vlenb ; CHECK-ZVBB-NEXT: srli a2, a1, 3 -; CHECK-ZVBB-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; CHECK-ZVBB-NEXT: sub a3, a0, a1 +; CHECK-ZVBB-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-ZVBB-NEXT: sub a2, a0, a1 -; CHECK-ZVBB-NEXT: sltu a3, a0, a2 -; CHECK-ZVBB-NEXT: addi a3, a3, -1 -; CHECK-ZVBB-NEXT: and a2, a3, a2 +; CHECK-ZVBB-NEXT: sltu a2, a0, a3 +; CHECK-ZVBB-NEXT: addi a2, a2, -1 +; CHECK-ZVBB-NEXT: and a2, a2, a3 ; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-ZVBB-NEXT: vclz.v v16, v16, v0.t ; CHECK-ZVBB-NEXT: bltu a0, a1, .LBB94_2 @@ -2524,15 +2524,15 @@ define @vp_ctlz_zero_undef_nxv16i64_unmasked( @vp_ctlz_nxv1i9( %va, @vp_ctlz_zero_undef_nxv1i9( %va, @vp_ctlo_nxv1i9( %va, @vp_ctlo_nxv1i9_vp_xor( %va, @vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor( @ctpop_nxv1i8( %va) { ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -39,8 +39,8 @@ define @ctpop_nxv2i8( %va) { ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -67,8 +67,8 @@ define @ctpop_nxv4i8( %va) { ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -95,8 +95,8 @@ define @ctpop_nxv8i8( %va) { ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -123,8 +123,8 @@ define @ctpop_nxv16i8( %va) { ; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, 
v8, v10 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -151,8 +151,8 @@ define @ctpop_nxv32i8( %va) { ; CHECK-NEXT: vsrl.vi v12, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -179,8 +179,8 @@ define @ctpop_nxv64i8( %va) { ; CHECK-NEXT: vsrl.vi v16, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -208,17 +208,17 @@ define @ctpop_nxv1i16( %va) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -243,17 +243,17 @@ define @ctpop_nxv2i16( %va) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -278,17 +278,17 @@ define @ctpop_nxv4i16( %va) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -313,17 +313,17 @@ define @ctpop_nxv8i16( %va) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -348,17 +348,17 @@ define 
@ctpop_nxv16i16( %va) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -383,17 +383,17 @@ define @ctpop_nxv32i16( %va) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -418,17 +418,17 @@ define @ctpop_nxv1i32( %va) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -454,17 +454,17 @@ define @ctpop_nxv2i32( %va) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -490,17 +490,17 @@ define @ctpop_nxv4i32( %va) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -526,17 +526,17 @@ define @ctpop_nxv8i32( %va) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: 
vsub.vv v8, v8, v12 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -562,17 +562,17 @@ define @ctpop_nxv16i32( %va) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -681,31 +681,31 @@ define @ctpop_nxv1i64( %va) { ; RV32-NEXT: addi a0, a0, 1365 ; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vmv.v.x v10, a0 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vmv.v.x v10, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -713,34 +713,34 @@ define @ctpop_nxv1i64( %va) { ; ; RV64-LABEL: ctpop_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; 
RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: vsetvli a4, zero, e64, m1, ta, ma +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v9, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -764,31 +764,31 @@ define @ctpop_nxv2i64( %va) { ; RV32-NEXT: addi a0, a0, 1365 ; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vmv.v.x v12, a0 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vmv.v.x v12, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -796,34 +796,34 @@ define @ctpop_nxv2i64( %va) { ; ; RV64-LABEL: ctpop_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: vsetvli a4, zero, e64, m2, ta, ma +; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: vand.vx v10, v10, a0 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: 
addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vx v10, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -847,31 +847,31 @@ define @ctpop_nxv4i64( %va) { ; RV32-NEXT: addi a0, a0, 1365 ; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: vmv.v.x v16, a0 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vmv.v.x v16, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -879,34 +879,34 @@ define @ctpop_nxv4i64( %va) { ; ; RV64-LABEL: ctpop_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1 ; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: vsetvli a4, zero, e64, m4, ta, ma +; RV64-NEXT: vsrl.vi v12, v8, 1 ; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vx v12, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 
+; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -930,66 +930,66 @@ define @ctpop_nxv8i64( %va) { ; RV32-NEXT: addi a0, a0, 1365 ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v16, v24 ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: addi a0, a0, -241 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a0 +; RV32-NEXT: vmv.v.x v24, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: ctpop_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, 
a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll index 093eb0ead313e..9e75dc9dccffd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll @@ -17,8 +17,8 @@ define @vp_ctpop_nxv1i8( %va, @vp_ctpop_nxv1i8_unmasked( %va, i32 ze ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -73,8 +73,8 @@ define @vp_ctpop_nxv2i8( %va, @vp_ctpop_nxv2i8_unmasked( %va, i32 ze ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -129,8 +129,8 @@ define @vp_ctpop_nxv4i8( %va, @vp_ctpop_nxv4i8_unmasked( %va, i32 ze ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -185,8 +185,8 @@ define @vp_ctpop_nxv8i8( %va, @vp_ctpop_nxv8i8_unmasked( %va, i32 ze ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -241,8 +241,8 @@ define @vp_ctpop_nxv16i8( %va, @vp_ctpop_nxv16i8_unmasked( %va, i32 ; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -297,8 +297,8 @@ define @vp_ctpop_nxv32i8( %va, @vp_ctpop_nxv32i8_unmasked( %va, i32 ; CHECK-NEXT: vsrl.vi v12, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -353,8 +353,8 @@ define @vp_ctpop_nxv64i8( %va, @vp_ctpop_nxv64i8_unmasked( %va, i32 ; CHECK-NEXT: vsrl.vi v16, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -410,17 +410,17 @@ define @vp_ctpop_nxv1i16( %va, @vp_ctpop_nxv1i16_unmasked( %va, i32 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi 
a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -480,17 +480,17 @@ define @vp_ctpop_nxv2i16( %va, @vp_ctpop_nxv2i16_unmasked( %va, i32 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -550,17 +550,17 @@ define @vp_ctpop_nxv4i16( %va, @vp_ctpop_nxv4i16_unmasked( %va, i32 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -620,17 +620,17 @@ define @vp_ctpop_nxv8i16( %va, @vp_ctpop_nxv8i16_unmasked( %va, i32 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -690,17 +690,17 @@ define @vp_ctpop_nxv16i16( %va, @vp_ctpop_nxv16i16_unmasked( %va, ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -760,17 +760,17 @@ define @vp_ctpop_nxv32i16( %va, @vp_ctpop_nxv32i16_unmasked( %va, ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 
; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -830,17 +830,17 @@ define @vp_ctpop_nxv1i32( %va, @vp_ctpop_nxv1i32_unmasked( %va, i32 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -902,17 +902,17 @@ define @vp_ctpop_nxv2i32( %va, @vp_ctpop_nxv2i32_unmasked( %va, i32 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -974,17 +974,17 @@ define @vp_ctpop_nxv4i32( %va, @vp_ctpop_nxv4i32_unmasked( %va, i32 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -1046,17 +1046,17 @@ define @vp_ctpop_nxv8i32( %va, @vp_ctpop_nxv8i32_unmasked( %va, i32 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -1118,17 +1118,17 @@ define @vp_ctpop_nxv16i32( %va, @vp_ctpop_nxv16i32_unmasked( %va, ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; 
CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -1191,67 +1191,67 @@ define @vp_ctpop_nxv1i64( %va, @vp_ctpop_nxv1i64_unmasked( %va, i32 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsub.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1305,34 +1305,34 @@ define @vp_ctpop_nxv1i64_unmasked( %va, i32 ; ; RV64-LABEL: vp_ctpop_nxv1i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vand.vx v9, v9, a1 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v9, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; 
RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1357,67 +1357,67 @@ define @vp_ctpop_nxv2i64( %va, @vp_ctpop_nxv2i64_unmasked( %va, i32 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1471,34 +1471,34 @@ define @vp_ctpop_nxv2i64_unmasked( %va, i32 ; ; RV64-LABEL: vp_ctpop_nxv2i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vand.vx v10, v10, a1 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vx v10, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: li a0, 56 
; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1523,67 +1523,67 @@ define @vp_ctpop_nxv4i64( %va, @vp_ctpop_nxv4i64_unmasked( %va, i32 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsub.vv v8, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1637,34 +1637,34 @@ define @vp_ctpop_nxv4i64_unmasked( %va, i32 ; ; RV64-LABEL: vp_ctpop_nxv4i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vand.vx v12, v12, a1 ; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vx v12, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1689,67 +1689,67 @@ define @vp_ctpop_nxv7i64( %va, 
@vp_ctpop_nxv7i64_unmasked( %va, i32 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v16, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_nxv7i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vx v16, v16, a1 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1855,67 +1855,67 @@ define @vp_ctpop_nxv8i64( %va, @vp_ctpop_nxv8i64_unmasked( %va, i32 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 
; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v16, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_nxv8i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vx v16, v16, a1 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -2018,54 +2018,53 @@ define @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64_unmasked( %va, ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: sub a2, a0, a1 -; RV32-NEXT: sltu a3, a0, a2 -; 
RV32-NEXT: addi a3, a3, -1 -; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: lui a2, 349525 +; RV32-NEXT: lui a3, 209715 +; RV32-NEXT: sub a4, a0, a1 +; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: addi a3, a3, 819 +; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v0, a2 +; RV32-NEXT: sltu a2, a0, a4 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a2, a2, a4 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v24, v16, 1 -; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v0, a3 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 24 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vv v24, v24, v0 ; RV32-NEXT: vsub.vv v16, v16, v24 -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: addi a3, a3, 819 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v0, a3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma @@ -2383,8 +2362,10 @@ define @vp_ctpop_nxv16i64_unmasked( %va, ; RV32-NEXT: vsrl.vi v24, v16, 4 ; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a3, a3, -241 -; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma +; RV32-NEXT: addi a4, a4, 257 +; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 @@ -2393,10 +2374,8 @@ define @vp_ctpop_nxv16i64_unmasked( %va, ; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: lui a3, 4112 -; RV32-NEXT: addi a3, a3, 257 -; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a4 ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma @@ -2448,59 +2427,69 @@ define @vp_ctpop_nxv16i64_unmasked( %va, ; ; RV64-LABEL: vp_ctpop_nxv16i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: mv a2, a0 -; RV64-NEXT: bltu a0, a1, .LBB47_2 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB47_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a2, a1 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB47_2: -; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v24, v24, a2 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v8, a3 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4 -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; 
RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5 -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6 -; RV64-NEXT: sub a1, a0, a1 -; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: lui a3, 349525 +; RV64-NEXT: lui a4, 209715 +; RV64-NEXT: lui a5, 61681 +; RV64-NEXT: lui a6, 4112 +; RV64-NEXT: addiw a3, a3, 1365 +; RV64-NEXT: addiw a4, a4, 819 +; RV64-NEXT: addiw a5, a5, -241 +; RV64-NEXT: addiw a6, a6, 257 +; RV64-NEXT: slli a7, a3, 32 +; RV64-NEXT: add a3, a3, a7 +; RV64-NEXT: slli a7, a4, 32 +; RV64-NEXT: add a4, a4, a7 +; RV64-NEXT: slli a7, a5, 32 +; RV64-NEXT: add a5, a5, a7 +; RV64-NEXT: slli a7, a6, 32 +; RV64-NEXT: add a6, a6, a7 +; RV64-NEXT: li a7, 56 +; RV64-NEXT: sub a2, a0, a2 +; RV64-NEXT: sltu a0, a0, a2 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vand.vx v24, v24, a3 +; RV64-NEXT: vsub.vv v8, v8, v24 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a2 +; RV64-NEXT: vand.vx v24, v24, a3 ; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx v24, v16, a3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v8, a4 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a4 +; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v16, a4 ; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vand.vx v16, v16, a4 ; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v8, v8, a5 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a4 -; RV64-NEXT: vmul.vx v16, v16, a5 -; RV64-NEXT: vsrl.vx v16, v16, a6 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a6 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vand.vx v16, v16, a5 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vx v8, v8, a7 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmul.vx v16, v16, a6 +; RV64-NEXT: vsrl.vx v16, v16, a7 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64_unmasked: @@ -2532,21 +2521,21 @@ define @vp_ctpop_nxv1i9( %va, @cttz_nxv1i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 85 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -34,6 +34,7 @@ define @cttz_nxv1i8( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: li a0, 127 ; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-F-NEXT: vzext.vf2 v10, v9 @@ -41,7 +42,6 @@ define @cttz_nxv1i8( 
%va) { ; CHECK-F-NEXT: vnsrl.wi v9, v9, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 -; CHECK-F-NEXT: li a0, 127 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 ; CHECK-F-NEXT: vsub.vx v8, v9, a0 ; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 @@ -51,6 +51,7 @@ define @cttz_nxv1i8( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 +; CHECK-D-NEXT: li a0, 127 ; CHECK-D-NEXT: vand.vv v9, v8, v9 ; CHECK-D-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-D-NEXT: vzext.vf2 v10, v9 @@ -58,7 +59,6 @@ define @cttz_nxv1i8( %va) { ; CHECK-D-NEXT: vnsrl.wi v9, v9, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-D-NEXT: vnsrl.wi v9, v9, 0 -; CHECK-D-NEXT: li a0, 127 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 ; CHECK-D-NEXT: vsub.vx v8, v9, a0 ; CHECK-D-NEXT: vmerge.vim v8, v8, 8, v0 @@ -79,14 +79,14 @@ define @cttz_nxv2i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 85 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -100,6 +100,7 @@ define @cttz_nxv2i8( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: li a0, 127 ; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-F-NEXT: vzext.vf2 v10, v9 @@ -107,7 +108,6 @@ define @cttz_nxv2i8( %va) { ; CHECK-F-NEXT: vnsrl.wi v9, v9, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 -; CHECK-F-NEXT: li a0, 127 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 ; CHECK-F-NEXT: vsub.vx v8, v9, a0 ; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 @@ -117,6 +117,7 @@ define @cttz_nxv2i8( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 +; CHECK-D-NEXT: li a0, 127 ; CHECK-D-NEXT: vand.vv v9, v8, v9 ; CHECK-D-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-D-NEXT: vzext.vf2 v10, v9 @@ -124,7 +125,6 @@ define @cttz_nxv2i8( %va) { ; CHECK-D-NEXT: vnsrl.wi v9, v9, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-D-NEXT: vnsrl.wi v9, v9, 0 -; CHECK-D-NEXT: li a0, 127 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 ; CHECK-D-NEXT: vsub.vx v8, v9, a0 ; CHECK-D-NEXT: vmerge.vim v8, v8, 8, v0 @@ -145,14 +145,14 @@ define @cttz_nxv4i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 85 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; 
CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -166,6 +166,7 @@ define @cttz_nxv4i8( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: li a0, 127 ; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-F-NEXT: vzext.vf2 v10, v9 @@ -173,7 +174,6 @@ define @cttz_nxv4i8( %va) { ; CHECK-F-NEXT: vnsrl.wi v9, v12, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 -; CHECK-F-NEXT: li a0, 127 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 ; CHECK-F-NEXT: vsub.vx v8, v9, a0 ; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 @@ -183,6 +183,7 @@ define @cttz_nxv4i8( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 +; CHECK-D-NEXT: li a0, 127 ; CHECK-D-NEXT: vand.vv v9, v8, v9 ; CHECK-D-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-D-NEXT: vzext.vf2 v10, v9 @@ -190,7 +191,6 @@ define @cttz_nxv4i8( %va) { ; CHECK-D-NEXT: vnsrl.wi v9, v12, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-D-NEXT: vnsrl.wi v9, v9, 0 -; CHECK-D-NEXT: li a0, 127 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 ; CHECK-D-NEXT: vsub.vx v8, v9, a0 ; CHECK-D-NEXT: vmerge.vim v8, v8, 8, v0 @@ -211,14 +211,14 @@ define @cttz_nxv8i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 85 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -232,6 +232,7 @@ define @cttz_nxv8i8( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: li a0, 127 ; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-F-NEXT: vzext.vf2 v10, v9 @@ -239,7 +240,6 @@ define @cttz_nxv8i8( %va) { ; CHECK-F-NEXT: vnsrl.wi v10, v12, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-F-NEXT: vnsrl.wi v9, v10, 0 -; CHECK-F-NEXT: li a0, 127 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 ; CHECK-F-NEXT: vsub.vx v8, v9, a0 ; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 @@ -249,6 +249,7 @@ define @cttz_nxv8i8( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 +; CHECK-D-NEXT: li a0, 127 ; CHECK-D-NEXT: vand.vv v9, v8, v9 ; CHECK-D-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-D-NEXT: vzext.vf2 v10, v9 @@ -256,7 +257,6 @@ define @cttz_nxv8i8( %va) { ; CHECK-D-NEXT: vnsrl.wi v10, v12, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-D-NEXT: vnsrl.wi v9, v10, 0 -; CHECK-D-NEXT: li a0, 127 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 ; CHECK-D-NEXT: vsub.vx v8, v9, a0 ; CHECK-D-NEXT: vmerge.vim v8, v8, 8, v0 @@ -277,14 +277,14 @@ define @cttz_nxv16i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v10, v8, a0 -; 
CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v10, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 85 +; CHECK-ZVE64X-NEXT: vand.vv v8, v10, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -298,6 +298,7 @@ define @cttz_nxv16i8( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 +; CHECK-F-NEXT: li a0, 127 ; CHECK-F-NEXT: vand.vv v10, v8, v10 ; CHECK-F-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-F-NEXT: vzext.vf2 v12, v10 @@ -305,7 +306,6 @@ define @cttz_nxv16i8( %va) { ; CHECK-F-NEXT: vnsrl.wi v12, v16, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-F-NEXT: vnsrl.wi v10, v12, 0 -; CHECK-F-NEXT: li a0, 127 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 ; CHECK-F-NEXT: vsub.vx v8, v10, a0 ; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 @@ -315,6 +315,7 @@ define @cttz_nxv16i8( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 +; CHECK-D-NEXT: li a0, 127 ; CHECK-D-NEXT: vand.vv v10, v8, v10 ; CHECK-D-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-D-NEXT: vzext.vf2 v12, v10 @@ -322,7 +323,6 @@ define @cttz_nxv16i8( %va) { ; CHECK-D-NEXT: vnsrl.wi v12, v16, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-D-NEXT: vnsrl.wi v10, v12, 0 -; CHECK-D-NEXT: li a0, 127 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 ; CHECK-D-NEXT: vsub.vx v8, v10, a0 ; CHECK-D-NEXT: vmerge.vim v8, v8, 8, v0 @@ -343,14 +343,14 @@ define @cttz_nxv32i8( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vsub.vx v12, v8, a0 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: vnot.v v12, v8 +; CHECK-NEXT: vsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v12, v8 +; CHECK-NEXT: vsrl.vi v12, v8, 1 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -375,14 +375,14 @@ define @cttz_nxv64i8( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma -; CHECK-NEXT: vsub.vx v16, v8, a0 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: vnot.v v16, v8 +; CHECK-NEXT: vsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v16, v8 +; CHECK-NEXT: vsrl.vi v16, v8, 1 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -407,24 +407,24 @@ define @cttz_nxv1i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; 
CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -435,28 +435,28 @@ define @cttz_nxv1i16( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 -; CHECK-F-NEXT: vand.vv v9, v8, v9 -; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9 -; CHECK-F-NEXT: vnsrl.wi v9, v10, 23 ; CHECK-F-NEXT: li a0, 127 -; CHECK-F-NEXT: vsub.vx v9, v9, a0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9 +; CHECK-F-NEXT: vnsrl.wi v8, v8, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 16 -; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv1i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 -; CHECK-D-NEXT: vand.vv v9, v8, v9 -; CHECK-D-NEXT: vfwcvt.f.xu.v v10, v9 -; CHECK-D-NEXT: vnsrl.wi v9, v10, 23 ; CHECK-D-NEXT: li a0, 127 -; CHECK-D-NEXT: vsub.vx v9, v9, a0 +; CHECK-D-NEXT: vand.vv v9, v8, v9 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vfwcvt.f.xu.v v8, v9 +; CHECK-D-NEXT: vnsrl.wi v8, v8, 23 +; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 16 -; CHECK-D-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv1i16: @@ -474,24 +474,24 @@ define @cttz_nxv2i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -502,28 +502,28 @@ define @cttz_nxv2i16( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 -; CHECK-F-NEXT: 
vand.vv v9, v8, v9 -; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9 -; CHECK-F-NEXT: vnsrl.wi v9, v10, 23 ; CHECK-F-NEXT: li a0, 127 -; CHECK-F-NEXT: vsub.vx v9, v9, a0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9 +; CHECK-F-NEXT: vnsrl.wi v8, v8, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 16 -; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv2i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 -; CHECK-D-NEXT: vand.vv v9, v8, v9 -; CHECK-D-NEXT: vfwcvt.f.xu.v v10, v9 -; CHECK-D-NEXT: vnsrl.wi v9, v10, 23 ; CHECK-D-NEXT: li a0, 127 -; CHECK-D-NEXT: vsub.vx v9, v9, a0 +; CHECK-D-NEXT: vand.vv v9, v8, v9 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vfwcvt.f.xu.v v8, v9 +; CHECK-D-NEXT: vnsrl.wi v8, v8, 23 +; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 16 -; CHECK-D-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv2i16: @@ -541,24 +541,24 @@ define @cttz_nxv4i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -569,28 +569,28 @@ define @cttz_nxv4i16( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 -; CHECK-F-NEXT: vand.vv v9, v8, v9 -; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9 -; CHECK-F-NEXT: vnsrl.wi v9, v10, 23 ; CHECK-F-NEXT: li a0, 127 -; CHECK-F-NEXT: vsub.vx v9, v9, a0 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v9 +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v10, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 16 -; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv4i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 -; CHECK-D-NEXT: vand.vv v9, v8, v9 -; CHECK-D-NEXT: vfwcvt.f.xu.v v10, v9 -; CHECK-D-NEXT: vnsrl.wi v9, v10, 23 ; CHECK-D-NEXT: li a0, 127 -; CHECK-D-NEXT: vsub.vx v9, v9, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vand.vv v8, v8, v9 +; CHECK-D-NEXT: vfwcvt.f.xu.v v10, v8 +; CHECK-D-NEXT: vnsrl.wi v8, v10, 23 +; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 16 -; 
CHECK-D-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv4i16: @@ -608,24 +608,24 @@ define @cttz_nxv8i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v10, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v10, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v10, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v10, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -636,28 +636,28 @@ define @cttz_nxv8i16( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 -; CHECK-F-NEXT: vand.vv v10, v8, v10 -; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10 -; CHECK-F-NEXT: vnsrl.wi v10, v12, 23 ; CHECK-F-NEXT: li a0, 127 -; CHECK-F-NEXT: vsub.vx v10, v10, a0 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v10 +; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v12, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 16 -; CHECK-F-NEXT: vmerge.vxm v8, v10, a0, v0 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv8i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 -; CHECK-D-NEXT: vand.vv v10, v8, v10 -; CHECK-D-NEXT: vfwcvt.f.xu.v v12, v10 -; CHECK-D-NEXT: vnsrl.wi v10, v12, 23 ; CHECK-D-NEXT: li a0, 127 -; CHECK-D-NEXT: vsub.vx v10, v10, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vand.vv v8, v8, v10 +; CHECK-D-NEXT: vfwcvt.f.xu.v v12, v8 +; CHECK-D-NEXT: vnsrl.wi v8, v12, 23 +; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 16 -; CHECK-D-NEXT: vmerge.vxm v8, v10, a0, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv8i16: @@ -675,24 +675,24 @@ define @cttz_nxv16i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v12, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v12, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v12, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v12, v12, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vand.vx v12, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; 
CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v12, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -703,28 +703,28 @@ define @cttz_nxv16i16( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-F-NEXT: vrsub.vi v12, v8, 0 -; CHECK-F-NEXT: vand.vv v12, v8, v12 -; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12 -; CHECK-F-NEXT: vnsrl.wi v12, v16, 23 ; CHECK-F-NEXT: li a0, 127 -; CHECK-F-NEXT: vsub.vx v12, v12, a0 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v12 +; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v16, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: li a0, 16 -; CHECK-F-NEXT: vmerge.vxm v8, v12, a0, v0 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv16i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-D-NEXT: vrsub.vi v12, v8, 0 -; CHECK-D-NEXT: vand.vv v12, v8, v12 -; CHECK-D-NEXT: vfwcvt.f.xu.v v16, v12 -; CHECK-D-NEXT: vnsrl.wi v12, v16, 23 ; CHECK-D-NEXT: li a0, 127 -; CHECK-D-NEXT: vsub.vx v12, v12, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vand.vv v8, v8, v12 +; CHECK-D-NEXT: vfwcvt.f.xu.v v16, v8 +; CHECK-D-NEXT: vnsrl.wi v8, v16, 23 +; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: li a0, 16 -; CHECK-D-NEXT: vmerge.vxm v8, v12, a0, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv16i16: @@ -742,24 +742,24 @@ define @cttz_nxv32i16( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vsub.vx v16, v8, a0 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: vnot.v v16, v8 +; CHECK-NEXT: vsub.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v16, v8 +; CHECK-NEXT: vsrl.vi v16, v8, 1 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -781,24 +781,24 @@ define @cttz_nxv1i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: 
vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -810,15 +810,15 @@ define @cttz_nxv1i32( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 -; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfcvt.f.xu.v v9, v9 -; CHECK-F-NEXT: vsrl.vi v9, v9, 23 ; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vsub.vx v9, v9, a1 +; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v9 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a1 ; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -826,15 +826,15 @@ define @cttz_nxv1i32( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 +; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vand.vv v9, v8, v9 ; CHECK-D-NEXT: vfwcvt.f.xu.v v10, v9 -; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vnsrl.wx v9, v10, a0 ; CHECK-D-NEXT: li a0, 1023 -; CHECK-D-NEXT: vsub.vx v9, v9, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v9, a0 ; CHECK-D-NEXT: li a0, 32 -; CHECK-D-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv1i32: @@ -852,24 +852,24 @@ define @cttz_nxv2i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -881,15 +881,15 @@ define @cttz_nxv2i32( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 -; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfcvt.f.xu.v v9, v9 -; CHECK-F-NEXT: vsrl.vi v9, v9, 23 ; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vsub.vx v9, v9, a1 +; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v9 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; 
CHECK-F-NEXT: vsub.vx v8, v8, a1 ; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -897,15 +897,15 @@ define @cttz_nxv2i32( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 +; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vand.vv v9, v8, v9 ; CHECK-D-NEXT: vfwcvt.f.xu.v v10, v9 -; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vnsrl.wx v9, v10, a0 ; CHECK-D-NEXT: li a0, 1023 -; CHECK-D-NEXT: vsub.vx v9, v9, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v9, a0 ; CHECK-D-NEXT: li a0, 32 -; CHECK-D-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv2i32: @@ -923,24 +923,24 @@ define @cttz_nxv4i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v10, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v10, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v10, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v10, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -952,15 +952,15 @@ define @cttz_nxv4i32( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 -; CHECK-F-NEXT: vand.vv v10, v8, v10 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfcvt.f.xu.v v10, v10 -; CHECK-F-NEXT: vsrl.vi v10, v10, 23 ; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vsub.vx v10, v10, a1 +; CHECK-F-NEXT: vand.vv v10, v8, v10 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v10 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a1 ; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vmerge.vxm v8, v10, a1, v0 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -968,15 +968,15 @@ define @cttz_nxv4i32( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 +; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vand.vv v10, v8, v10 ; CHECK-D-NEXT: vfwcvt.f.xu.v v12, v10 -; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vnsrl.wx v10, v12, a0 ; CHECK-D-NEXT: li a0, 1023 -; CHECK-D-NEXT: vsub.vx v10, v10, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v10, a0 ; CHECK-D-NEXT: li a0, 32 -; CHECK-D-NEXT: vmerge.vxm v8, v10, a0, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv4i32: @@ -994,24 +994,24 @@ define @cttz_nxv8i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli 
a1, zero, e32, m4, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v12, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v12, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v12, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v12, v12, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vand.vx v12, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v12, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -1023,15 +1023,15 @@ define @cttz_nxv8i32( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-F-NEXT: vrsub.vi v12, v8, 0 -; CHECK-F-NEXT: vand.vv v12, v8, v12 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfcvt.f.xu.v v12, v12 -; CHECK-F-NEXT: vsrl.vi v12, v12, 23 ; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vsub.vx v12, v12, a1 +; CHECK-F-NEXT: vand.vv v12, v8, v12 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v12 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a1 ; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vmerge.vxm v8, v12, a1, v0 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1039,15 +1039,15 @@ define @cttz_nxv8i32( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-D-NEXT: vrsub.vi v12, v8, 0 +; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vand.vv v12, v8, v12 ; CHECK-D-NEXT: vfwcvt.f.xu.v v16, v12 -; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vnsrl.wx v12, v16, a0 ; CHECK-D-NEXT: li a0, 1023 -; CHECK-D-NEXT: vsub.vx v12, v12, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v12, a0 ; CHECK-D-NEXT: li a0, 32 -; CHECK-D-NEXT: vmerge.vxm v8, v12, a0, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv8i32: @@ -1065,24 +1065,24 @@ define @cttz_nxv16i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v16, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v16 -; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v16, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v16, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v16, v16, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: vand.vx v16, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v16, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v16 -; 
CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -1094,15 +1094,15 @@ define @cttz_nxv16i32( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-F-NEXT: vrsub.vi v16, v8, 0 -; CHECK-F-NEXT: vand.vv v16, v8, v16 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfcvt.f.xu.v v16, v16 -; CHECK-F-NEXT: vsrl.vi v16, v16, 23 ; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vsub.vx v16, v16, a1 +; CHECK-F-NEXT: vand.vv v16, v8, v16 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v16 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a1 ; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1110,15 +1110,15 @@ define @cttz_nxv16i32( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-D-NEXT: vrsub.vi v16, v8, 0 -; CHECK-D-NEXT: vand.vv v16, v8, v16 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: vfcvt.f.xu.v v16, v16 -; CHECK-D-NEXT: vsrl.vi v16, v16, 23 ; CHECK-D-NEXT: li a1, 127 -; CHECK-D-NEXT: vsub.vx v16, v16, a1 +; CHECK-D-NEXT: vand.vv v16, v8, v16 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v16 +; CHECK-D-NEXT: vsrl.vi v8, v8, 23 +; CHECK-D-NEXT: vsub.vx v8, v8, a1 ; CHECK-D-NEXT: li a1, 32 -; CHECK-D-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; @@ -1137,39 +1137,39 @@ define @cttz_nxv1i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: li a0, 1 ; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV32I-NEXT: vsub.vx v9, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vnot.v v9, v8 +; RV32I-NEXT: vsub.vx v8, v8, a0 ; RV32I-NEXT: lui a0, 349525 ; RV32I-NEXT: addi a0, a0, 1365 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 ; RV32I-NEXT: lui a0, 209715 ; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vand.vv v9, v9, v10 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; 
RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1178,37 +1178,37 @@ define @cttz_nxv1i64( %va) { ; RV64I-LABEL: cttz_nxv1i64: ; RV64I: # %bb.0: ; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: vsetvli a5, zero, e64, m1, ta, ma ; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: addiw a0, a1, 1365 +; RV64I-NEXT: addiw a1, a2, 819 +; RV64I-NEXT: addiw a2, a3, -241 +; RV64I-NEXT: addiw a3, a4, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v9, v9, a0 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vand.vx v9, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1217,17 +1217,17 @@ define @cttz_nxv1i64( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 -; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v10, v9 -; CHECK-F-NEXT: vsrl.vi v9, v10, 23 ; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vwsubu.vx v10, v9, a1 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vand.vv v9, v8, v9 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v8, v9 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vwsubu.vx v9, v8, a1 ; CHECK-F-NEXT: li a1, 64 -; CHECK-F-NEXT: vmerge.vxm v8, v10, a1, v0 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1235,16 +1235,16 @@ define @cttz_nxv1i64( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 -; CHECK-D-NEXT: vand.vv v9, v8, v9 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: vfcvt.f.xu.v v9, v9 ; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vand.vv v9, v8, v9 +; CHECK-D-NEXT: vfcvt.f.xu.v v9, v9 ; CHECK-D-NEXT: vsrl.vx v9, v9, a1 ; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vsub.vx v9, v9, a1 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v9, a1 ; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vmerge.vxm v8, v9, a1, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, 
a1, v0 ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; @@ -1263,39 +1263,39 @@ define @cttz_nxv2i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: li a0, 1 ; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV32I-NEXT: vsub.vx v10, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vnot.v v10, v8 +; RV32I-NEXT: vsub.vx v8, v8, a0 ; RV32I-NEXT: lui a0, 349525 ; RV32I-NEXT: addi a0, a0, 1365 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 ; RV32I-NEXT: lui a0, 209715 ; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vand.vv v10, v10, v12 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1304,37 +1304,37 @@ define @cttz_nxv2i64( %va) { ; RV64I-LABEL: cttz_nxv2i64: ; RV64I: # %bb.0: ; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: vsetvli a5, zero, e64, m2, ta, ma ; RV64I-NEXT: vsub.vx v10, v8, a0 +; RV64I-NEXT: addiw a0, a1, 1365 +; RV64I-NEXT: addiw a1, a2, 819 +; RV64I-NEXT: addiw a2, a3, -241 +; RV64I-NEXT: addiw a3, a4, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v10, v10, a0 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vand.vx v10, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: 
vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1343,17 +1343,17 @@ define @cttz_nxv2i64( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 -; CHECK-F-NEXT: vand.vv v10, v8, v10 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v12, v10 -; CHECK-F-NEXT: vsrl.vi v10, v12, 23 ; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vwsubu.vx v12, v10, a1 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vand.vv v10, v8, v10 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v8, v10 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vwsubu.vx v10, v8, a1 ; CHECK-F-NEXT: li a1, 64 -; CHECK-F-NEXT: vmerge.vxm v8, v12, a1, v0 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vmerge.vxm v8, v10, a1, v0 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1361,16 +1361,16 @@ define @cttz_nxv2i64( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 -; CHECK-D-NEXT: vand.vv v10, v8, v10 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: vfcvt.f.xu.v v10, v10 ; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vand.vv v10, v8, v10 +; CHECK-D-NEXT: vfcvt.f.xu.v v10, v10 ; CHECK-D-NEXT: vsrl.vx v10, v10, a1 ; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vsub.vx v10, v10, a1 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v10, a1 ; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vmerge.vxm v8, v10, a1, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; @@ -1389,39 +1389,39 @@ define @cttz_nxv4i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: li a0, 1 ; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV32I-NEXT: vsub.vx v12, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vnot.v v12, v8 +; RV32I-NEXT: vsub.vx v8, v8, a0 ; RV32I-NEXT: lui a0, 349525 ; RV32I-NEXT: addi a0, a0, 1365 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v12, v12, v16 -; RV32I-NEXT: vsub.vv v8, v8, v12 ; RV32I-NEXT: lui a0, 209715 ; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vand.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vand.vv v12, v12, v16 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v16, v8, v12 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vadd.vv v8, v16, v8 -; RV32I-NEXT: vsrl.vi v12, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: vand.vv v12, v8, v16 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: 
vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: vmul.vv v8, v8, v12 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1430,37 +1430,37 @@ define @cttz_nxv4i64( %va) { ; RV64I-LABEL: cttz_nxv4i64: ; RV64I: # %bb.0: ; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: vsetvli a5, zero, e64, m4, ta, ma ; RV64I-NEXT: vsub.vx v12, v8, a0 +; RV64I-NEXT: addiw a0, a1, 1365 +; RV64I-NEXT: addiw a1, a2, 819 +; RV64I-NEXT: addiw a2, a3, -241 +; RV64I-NEXT: addiw a3, a4, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v12 ; RV64I-NEXT: vsrl.vi v12, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v12, v12, a0 ; RV64I-NEXT: vsub.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vand.vx v12, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v12, v8 ; RV64I-NEXT: vsrl.vi v12, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1469,17 +1469,17 @@ define @cttz_nxv4i64( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-F-NEXT: vrsub.vi v12, v8, 0 -; CHECK-F-NEXT: vand.vv v12, v8, v12 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v16, v12 -; CHECK-F-NEXT: vsrl.vi v12, v16, 23 ; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vwsubu.vx v16, v12, a1 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vand.vv v12, v8, v12 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v8, v12 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vwsubu.vx v12, v8, a1 ; CHECK-F-NEXT: li a1, 64 -; CHECK-F-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vmerge.vxm v8, v12, a1, v0 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1487,16 +1487,16 @@ define @cttz_nxv4i64( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: 
vsetvli a0, zero, e64, m4, ta, ma ; CHECK-D-NEXT: vrsub.vi v12, v8, 0 -; CHECK-D-NEXT: vand.vv v12, v8, v12 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: vfcvt.f.xu.v v12, v12 ; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vand.vv v12, v8, v12 +; CHECK-D-NEXT: vfcvt.f.xu.v v12, v12 ; CHECK-D-NEXT: vsrl.vx v12, v12, a1 ; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vsub.vx v12, v12, a1 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v12, a1 ; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vmerge.vxm v8, v12, a1, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; @@ -1515,40 +1515,40 @@ define @cttz_nxv8i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: li a0, 1 ; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32I-NEXT: vsub.vx v16, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vnot.v v16, v8 +; RV32I-NEXT: vsub.vx v8, v8, a0 ; RV32I-NEXT: lui a0, 349525 ; RV32I-NEXT: addi a0, a0, 1365 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v24, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v16, v16, v24 -; RV32I-NEXT: vsub.vv v8, v8, v16 ; RV32I-NEXT: lui a0, 209715 ; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vand.vv v8, v16, v8 +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vand.vv v24, v16, v24 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v24 ; RV32I-NEXT: vand.vv v24, v8, v16 ; RV32I-NEXT: vsrl.vi v8, v8, 2 ; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vadd.vv v8, v24, v8 -; RV32I-NEXT: vsrl.vi v16, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 61681 -; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vadd.vv v8, v24, v8 +; RV32I-NEXT: vsrl.vi v24, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v24 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 +; RV32I-NEXT: vmv.v.x v24, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vmul.vv v8, v8, v16 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vmul.vv v8, v8, v24 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret @@ -1556,37 +1556,37 @@ define @cttz_nxv8i64( %va) { ; RV64I-LABEL: cttz_nxv8i64: ; RV64I: # %bb.0: ; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: vsetvli a5, zero, e64, m8, ta, ma ; RV64I-NEXT: vsub.vx v16, v8, a0 +; RV64I-NEXT: addiw a0, a1, 1365 +; RV64I-NEXT: addiw a1, a2, 819 +; RV64I-NEXT: addiw a2, a3, -241 +; RV64I-NEXT: addiw a3, a4, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v16 ; RV64I-NEXT: vsrl.vi v16, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; 
RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v16, v16, a0 ; RV64I-NEXT: vsub.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v8, a0 +; RV64I-NEXT: vand.vx v16, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v16, v8 ; RV64I-NEXT: vsrl.vi v16, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1595,17 +1595,17 @@ define @cttz_nxv8i64( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-F-NEXT: vrsub.vi v16, v8, 0 -; CHECK-F-NEXT: vand.vv v16, v8, v16 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v24, v16 -; CHECK-F-NEXT: vsrl.vi v16, v24, 23 ; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vwsubu.vx v24, v16, a1 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vand.vv v16, v8, v16 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v8, v16 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vwsubu.vx v16, v8, a1 ; CHECK-F-NEXT: li a1, 64 -; CHECK-F-NEXT: vmerge.vxm v8, v24, a1, v0 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vmerge.vxm v8, v16, a1, v0 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1613,16 +1613,16 @@ define @cttz_nxv8i64( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-D-NEXT: vrsub.vi v16, v8, 0 -; CHECK-D-NEXT: vand.vv v16, v8, v16 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: vfcvt.f.xu.v v16, v16 ; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vand.vv v16, v8, v16 +; CHECK-D-NEXT: vfcvt.f.xu.v v16, v16 ; CHECK-D-NEXT: vsrl.vx v16, v16, a1 ; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vsub.vx v16, v16, a1 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v16, a1 ; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-D-NEXT: fsrm a0 ; CHECK-D-NEXT: ret ; @@ -1641,14 +1641,14 @@ define @cttz_zero_undef_nxv1i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 85 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -1702,14 +1702,14 @@ define @cttz_zero_undef_nxv2i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; 
CHECK-ZVE64X-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 85 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -1763,14 +1763,14 @@ define @cttz_zero_undef_nxv4i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 85 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -1824,14 +1824,14 @@ define @cttz_zero_undef_nxv8i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 85 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -1885,14 +1885,14 @@ define @cttz_zero_undef_nxv16i8( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v10, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v10, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 85 +; CHECK-ZVE64X-NEXT: vand.vv v8, v10, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: li a0, 51 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 @@ -1946,14 +1946,14 @@ define @cttz_zero_undef_nxv32i8( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vsub.vx v12, v8, a0 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: vnot.v v12, v8 +; CHECK-NEXT: vsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v12, v8 +; CHECK-NEXT: vsrl.vi v12, v8, 1 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: 
vsub.vv v8, v8, v12 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -1977,14 +1977,14 @@ define @cttz_zero_undef_nxv64i8( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma -; CHECK-NEXT: vsub.vx v16, v8, a0 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: vnot.v v16, v8 +; CHECK-NEXT: vsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v16, v8 +; CHECK-NEXT: vsrl.vi v16, v8, 1 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -2008,24 +2008,24 @@ define @cttz_zero_undef_nxv1i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -2068,24 +2068,24 @@ define @cttz_zero_undef_nxv2i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -2128,24 +2128,24 @@ define @cttz_zero_undef_nxv4i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 
-; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -2188,24 +2188,24 @@ define @cttz_zero_undef_nxv8i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v10, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v10, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v10, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v10, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ -2248,24 +2248,24 @@ define @cttz_zero_undef_nxv16i16( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v12, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v12, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 5 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v12, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v12, v12, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: lui a0, 3 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vand.vx v12, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 1 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v12, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: lui a0, 1 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: li a0, 257 ; CHECK-ZVE64X-NEXT: vmul.vx v8, v8, a0 @@ 
-2308,24 +2308,24 @@ define @cttz_zero_undef_nxv32i16( %va) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vsub.vx v16, v8, a0 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: vnot.v v16, v8 +; CHECK-NEXT: vsub.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v16, v8 +; CHECK-NEXT: vsrl.vi v16, v8, 1 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -2346,24 +2346,24 @@ define @cttz_zero_undef_nxv1i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -2375,8 +2375,8 @@ define @cttz_zero_undef_nxv1i32( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 -; CHECK-F-NEXT: vand.vv v8, v8, v9 ; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vand.vv v8, v8, v9 ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: li a1, 127 @@ -2388,9 +2388,9 @@ define @cttz_zero_undef_nxv1i32( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 +; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vand.vv v8, v8, v9 ; CHECK-D-NEXT: vfwcvt.f.xu.v v9, v8 -; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vnsrl.wx v8, v9, a0 ; CHECK-D-NEXT: li a0, 1023 ; CHECK-D-NEXT: vsub.vx v8, v8, a0 @@ -2410,24 +2410,24 @@ define @cttz_zero_undef_nxv2i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v9, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v9, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; 
CHECK-ZVE64X-NEXT: vand.vv v8, v9, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v9, v9, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v9 ; CHECK-ZVE64X-NEXT: vand.vx v9, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v9, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v9, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v9 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -2439,8 +2439,8 @@ define @cttz_zero_undef_nxv2i32( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 -; CHECK-F-NEXT: vand.vv v8, v8, v9 ; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vand.vv v8, v8, v9 ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: li a1, 127 @@ -2452,9 +2452,9 @@ define @cttz_zero_undef_nxv2i32( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 +; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vand.vv v8, v8, v9 ; CHECK-D-NEXT: vfwcvt.f.xu.v v10, v8 -; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vnsrl.wx v8, v10, a0 ; CHECK-D-NEXT: li a0, 1023 ; CHECK-D-NEXT: vsub.vx v8, v8, a0 @@ -2474,24 +2474,24 @@ define @cttz_zero_undef_nxv4i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v10, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v10, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v10, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v10, v10, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v10 ; CHECK-ZVE64X-NEXT: vand.vx v10, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v10, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v10, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v10 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -2503,8 +2503,8 @@ define @cttz_zero_undef_nxv4i32( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 -; CHECK-F-NEXT: vand.vv v8, v8, v10 ; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vand.vv v8, v8, v10 ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: li a1, 127 @@ -2516,9 +2516,9 @@ define @cttz_zero_undef_nxv4i32( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 +; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vand.vv v8, v8, v10 ; CHECK-D-NEXT: vfwcvt.f.xu.v v12, v8 -; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vnsrl.wx v8, v12, a0 ; CHECK-D-NEXT: li a0, 1023 ; 
CHECK-D-NEXT: vsub.vx v8, v8, a0 @@ -2538,24 +2538,24 @@ define @cttz_zero_undef_nxv8i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v12, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v12, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v12, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v12, v12, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v12 ; CHECK-ZVE64X-NEXT: vand.vx v12, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v12, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v12, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v12 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -2567,8 +2567,8 @@ define @cttz_zero_undef_nxv8i32( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-F-NEXT: vrsub.vi v12, v8, 0 -; CHECK-F-NEXT: vand.vv v8, v8, v12 ; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vand.vv v8, v8, v12 ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: li a1, 127 @@ -2580,9 +2580,9 @@ define @cttz_zero_undef_nxv8i32( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-D-NEXT: vrsub.vi v12, v8, 0 +; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vand.vv v8, v8, v12 ; CHECK-D-NEXT: vfwcvt.f.xu.v v16, v8 -; CHECK-D-NEXT: li a0, 52 ; CHECK-D-NEXT: vnsrl.wx v8, v16, a0 ; CHECK-D-NEXT: li a0, 1023 ; CHECK-D-NEXT: vsub.vx v8, v8, a0 @@ -2602,24 +2602,24 @@ define @cttz_zero_undef_nxv16i32( %va) { ; CHECK-ZVE64X: # %bb.0: ; CHECK-ZVE64X-NEXT: li a0, 1 ; CHECK-ZVE64X-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; CHECK-ZVE64X-NEXT: vsub.vx v16, v8, a0 -; CHECK-ZVE64X-NEXT: vnot.v v8, v8 -; CHECK-ZVE64X-NEXT: vand.vv v8, v8, v16 -; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 1 +; CHECK-ZVE64X-NEXT: vnot.v v16, v8 +; CHECK-ZVE64X-NEXT: vsub.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 349525 ; CHECK-ZVE64X-NEXT: addi a0, a0, 1365 +; CHECK-ZVE64X-NEXT: vand.vv v8, v16, v8 +; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 1 ; CHECK-ZVE64X-NEXT: vand.vx v16, v16, a0 -; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: lui a0, 209715 ; CHECK-ZVE64X-NEXT: addi a0, a0, 819 +; CHECK-ZVE64X-NEXT: vsub.vv v8, v8, v16 ; CHECK-ZVE64X-NEXT: vand.vx v16, v8, a0 ; CHECK-ZVE64X-NEXT: vsrl.vi v8, v8, 2 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 +; CHECK-ZVE64X-NEXT: lui a0, 61681 +; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v16, v8 ; CHECK-ZVE64X-NEXT: vsrl.vi v16, v8, 4 ; CHECK-ZVE64X-NEXT: vadd.vv v8, v8, v16 -; CHECK-ZVE64X-NEXT: lui a0, 61681 -; CHECK-ZVE64X-NEXT: addi a0, a0, -241 ; CHECK-ZVE64X-NEXT: vand.vx v8, v8, a0 ; CHECK-ZVE64X-NEXT: lui a0, 4112 ; CHECK-ZVE64X-NEXT: addi a0, a0, 257 @@ -2631,8 +2631,8 @@ define @cttz_zero_undef_nxv16i32( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-F-NEXT: vrsub.vi v16, v8, 0 -; CHECK-F-NEXT: vand.vv v8, v8, 
v16 ; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vand.vv v8, v8, v16 ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-F-NEXT: vsrl.vi v8, v8, 23 ; CHECK-F-NEXT: li a1, 127 @@ -2644,8 +2644,8 @@ define @cttz_zero_undef_nxv16i32( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-D-NEXT: vrsub.vi v16, v8, 0 -; CHECK-D-NEXT: vand.vv v8, v8, v16 ; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vand.vv v8, v8, v16 ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-D-NEXT: vsrl.vi v8, v8, 23 ; CHECK-D-NEXT: li a1, 127 @@ -2667,39 +2667,39 @@ define @cttz_zero_undef_nxv1i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: li a0, 1 ; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV32I-NEXT: vsub.vx v9, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vnot.v v9, v8 +; RV32I-NEXT: vsub.vx v8, v8, a0 ; RV32I-NEXT: lui a0, 349525 ; RV32I-NEXT: addi a0, a0, 1365 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 ; RV32I-NEXT: lui a0, 209715 ; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vand.vv v9, v9, v10 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -2708,37 +2708,37 @@ define @cttz_zero_undef_nxv1i64( %va) { ; RV64I-LABEL: cttz_zero_undef_nxv1i64: ; RV64I: # %bb.0: ; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: vsetvli a5, zero, e64, m1, ta, ma ; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: addiw a0, a1, 1365 +; RV64I-NEXT: addiw a1, a2, 819 +; RV64I-NEXT: addiw a2, a3, -241 +; RV64I-NEXT: addiw a3, a4, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: 
slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v9, v9, a0 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vand.vx v9, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -2747,8 +2747,8 @@ define @cttz_zero_undef_nxv1i64( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 -; CHECK-F-NEXT: vand.vv v8, v8, v9 ; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vand.vv v8, v8, v9 ; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8 ; CHECK-F-NEXT: vsrl.vi v9, v9, 23 @@ -2761,10 +2761,10 @@ define @cttz_zero_undef_nxv1i64( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 -; CHECK-D-NEXT: vand.vv v8, v8, v9 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vand.vv v8, v8, v9 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-D-NEXT: vsrl.vx v8, v8, a1 ; CHECK-D-NEXT: li a1, 1023 ; CHECK-D-NEXT: vsub.vx v8, v8, a1 @@ -2785,39 +2785,39 @@ define @cttz_zero_undef_nxv2i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: li a0, 1 ; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV32I-NEXT: vsub.vx v10, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vnot.v v10, v8 +; RV32I-NEXT: vsub.vx v8, v8, a0 ; RV32I-NEXT: lui a0, 349525 ; RV32I-NEXT: addi a0, a0, 1365 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 ; RV32I-NEXT: lui a0, 209715 ; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vand.vv v10, v10, v12 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, 
ta, ma +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -2826,37 +2826,37 @@ define @cttz_zero_undef_nxv2i64( %va) { ; RV64I-LABEL: cttz_zero_undef_nxv2i64: ; RV64I: # %bb.0: ; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: vsetvli a5, zero, e64, m2, ta, ma ; RV64I-NEXT: vsub.vx v10, v8, a0 +; RV64I-NEXT: addiw a0, a1, 1365 +; RV64I-NEXT: addiw a1, a2, 819 +; RV64I-NEXT: addiw a2, a3, -241 +; RV64I-NEXT: addiw a3, a4, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v10, v10, a0 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vand.vx v10, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -2865,8 +2865,8 @@ define @cttz_zero_undef_nxv2i64( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 -; CHECK-F-NEXT: vand.vv v8, v8, v10 ; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vand.vv v8, v8, v10 ; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 ; CHECK-F-NEXT: vsrl.vi v10, v10, 23 @@ -2879,10 +2879,10 @@ define @cttz_zero_undef_nxv2i64( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 -; CHECK-D-NEXT: vand.vv v8, v8, v10 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vand.vv v8, v8, v10 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-D-NEXT: vsrl.vx v8, v8, a1 ; CHECK-D-NEXT: li a1, 1023 ; CHECK-D-NEXT: vsub.vx v8, v8, a1 @@ -2903,39 +2903,39 @@ define @cttz_zero_undef_nxv4i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: li a0, 1 ; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV32I-NEXT: vsub.vx v12, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vnot.v v12, v8 +; RV32I-NEXT: vsub.vx v8, v8, a0 ; RV32I-NEXT: lui a0, 349525 ; RV32I-NEXT: addi a0, a0, 1365 
; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v12, v12, v16 -; RV32I-NEXT: vsub.vv v8, v8, v12 ; RV32I-NEXT: lui a0, 209715 ; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vand.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vand.vv v12, v12, v16 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v16, v8, v12 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vadd.vv v8, v16, v8 -; RV32I-NEXT: vsrl.vi v12, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: vand.vv v12, v8, v16 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: vmul.vv v8, v8, v12 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -2944,37 +2944,37 @@ define @cttz_zero_undef_nxv4i64( %va) { ; RV64I-LABEL: cttz_zero_undef_nxv4i64: ; RV64I: # %bb.0: ; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: vsetvli a5, zero, e64, m4, ta, ma ; RV64I-NEXT: vsub.vx v12, v8, a0 +; RV64I-NEXT: addiw a0, a1, 1365 +; RV64I-NEXT: addiw a1, a2, 819 +; RV64I-NEXT: addiw a2, a3, -241 +; RV64I-NEXT: addiw a3, a4, 257 +; RV64I-NEXT: slli a4, a0, 32 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v12 ; RV64I-NEXT: vsrl.vi v12, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v12, v12, a0 ; RV64I-NEXT: vsub.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vand.vx v12, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v12, v8 ; RV64I-NEXT: vsrl.vi v12, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; 
RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -2983,8 +2983,8 @@ define @cttz_zero_undef_nxv4i64( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-F-NEXT: vrsub.vi v12, v8, 0 -; CHECK-F-NEXT: vand.vv v8, v8, v12 ; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vand.vv v8, v8, v12 ; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 ; CHECK-F-NEXT: vsrl.vi v12, v12, 23 @@ -2997,10 +2997,10 @@ define @cttz_zero_undef_nxv4i64( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-D-NEXT: vrsub.vi v12, v8, 0 -; CHECK-D-NEXT: vand.vv v8, v8, v12 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vand.vv v8, v8, v12 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-D-NEXT: vsrl.vx v8, v8, a1 ; CHECK-D-NEXT: li a1, 1023 ; CHECK-D-NEXT: vsub.vx v8, v8, a1 @@ -3021,40 +3021,40 @@ define @cttz_zero_undef_nxv8i64( %va) { ; RV32I: # %bb.0: ; RV32I-NEXT: li a0, 1 ; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32I-NEXT: vsub.vx v16, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vnot.v v16, v8 +; RV32I-NEXT: vsub.vx v8, v8, a0 ; RV32I-NEXT: lui a0, 349525 ; RV32I-NEXT: addi a0, a0, 1365 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v24, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v16, v16, v24 -; RV32I-NEXT: vsub.vv v8, v8, v16 ; RV32I-NEXT: lui a0, 209715 ; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vand.vv v8, v16, v8 +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vand.vv v24, v16, v24 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vsub.vv v8, v8, v24 ; RV32I-NEXT: vand.vv v24, v8, v16 ; RV32I-NEXT: vsrl.vi v8, v8, 2 ; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vadd.vv v8, v24, v8 -; RV32I-NEXT: vsrl.vi v16, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 61681 -; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v16 ; RV32I-NEXT: lui a0, 4112 ; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vadd.vv v8, v24, v8 +; RV32I-NEXT: vsrl.vi v24, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v24 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 +; RV32I-NEXT: vmv.v.x v24, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vmul.vv v8, v8, v16 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vmul.vv v8, v8, v24 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 ; RV32I-NEXT: ret @@ -3062,37 +3062,37 @@ define @cttz_zero_undef_nxv8i64( %va) { ; RV64I-LABEL: cttz_zero_undef_nxv8i64: ; RV64I: # %bb.0: ; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: vsetvli a5, zero, e64, m8, ta, ma ; RV64I-NEXT: vsub.vx v16, v8, a0 +; RV64I-NEXT: addiw a0, a1, 1365 +; RV64I-NEXT: addiw a1, a2, 819 +; RV64I-NEXT: addiw a2, a3, -241 +; RV64I-NEXT: addiw a3, a4, 257 +; RV64I-NEXT: slli a4, a0, 32 +; 
RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a4, a1, 32 +; RV64I-NEXT: add a1, a1, a4 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v16 ; RV64I-NEXT: vsrl.vi v16, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v16, v16, a0 ; RV64I-NEXT: vsub.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v8, a0 +; RV64I-NEXT: vand.vx v16, v8, a1 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a1 ; RV64I-NEXT: vadd.vv v8, v16, v8 ; RV64I-NEXT: vsrl.vi v16, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vand.vx v8, v8, a2 +; RV64I-NEXT: vmul.vx v8, v8, a3 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -3101,8 +3101,8 @@ define @cttz_zero_undef_nxv8i64( %va) { ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-F-NEXT: vrsub.vi v16, v8, 0 -; CHECK-F-NEXT: vand.vv v8, v8, v16 ; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vand.vv v8, v8, v16 ; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 ; CHECK-F-NEXT: vsrl.vi v16, v16, 23 @@ -3115,10 +3115,10 @@ define @cttz_zero_undef_nxv8i64( %va) { ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-D-NEXT: vrsub.vi v16, v8, 0 -; CHECK-D-NEXT: vand.vv v8, v8, v16 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vand.vv v8, v8, v16 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-D-NEXT: vsrl.vx v8, v8, a1 ; CHECK-D-NEXT: li a1, 1023 ; CHECK-D-NEXT: vsub.vx v8, v8, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll index 619c05dd8ab74..9e6295b664417 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll @@ -16,13 +16,13 @@ define @vp_cttz_nxv1i8( %va, @vp_cttz_nxv1i8_unmasked( %va, i32 zer ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -80,13 +80,13 @@ define @vp_cttz_nxv2i8( %va, @vp_cttz_nxv2i8_unmasked( %va, i32 zer ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 
+; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -144,13 +144,13 @@ define @vp_cttz_nxv4i8( %va, @vp_cttz_nxv4i8_unmasked( %va, i32 zer ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -208,13 +208,13 @@ define @vp_cttz_nxv8i8( %va, @vp_cttz_nxv8i8_unmasked( %va, i32 zer ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -272,13 +272,13 @@ define @vp_cttz_nxv16i8( %va, @vp_cttz_nxv16i8_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vsub.vx v10, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vnot.v v10, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v10, v8 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -336,13 +336,13 @@ define @vp_cttz_nxv32i8( %va, @vp_cttz_nxv32i8_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vsub.vx v12, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: vnot.v v12, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v12, v8 +; CHECK-NEXT: vsrl.vi v12, v8, 1 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -400,13 +400,13 @@ define @vp_cttz_nxv64i8( %va, @vp_cttz_nxv64i8_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vsub.vx v16, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: vnot.v v16, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v16, v8 +; CHECK-NEXT: vsrl.vi v16, v8, 1 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; 
CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -464,23 +464,23 @@ define @vp_cttz_nxv1i16( %va, @vp_cttz_nxv1i16_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -542,23 +542,23 @@ define @vp_cttz_nxv2i16( %va, @vp_cttz_nxv2i16_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -620,23 +620,23 @@ define @vp_cttz_nxv4i16( %va, @vp_cttz_nxv4i16_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -698,23 +698,23 @@ define @vp_cttz_nxv8i16( %va, @vp_cttz_nxv8i16_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vsub.vx v10, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 
-; CHECK-NEXT: vand.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vnot.v v10, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v10, v8 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -776,23 +776,23 @@ define @vp_cttz_nxv16i16( %va, @vp_cttz_nxv16i16_unmasked( %va, i ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vsub.vx v12, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: vnot.v v12, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v12, v8 +; CHECK-NEXT: vsrl.vi v12, v8, 1 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -854,23 +854,23 @@ define @vp_cttz_nxv32i16( %va, @vp_cttz_nxv32i16_unmasked( %va, i ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vsub.vx v16, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: vnot.v v16, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v16, v8 +; CHECK-NEXT: vsrl.vi v16, v8, 1 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -932,23 +932,23 @@ define @vp_cttz_nxv1i32( %va, @vp_cttz_nxv1i32_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; 
CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -1012,23 +1012,23 @@ define @vp_cttz_nxv2i32( %va, @vp_cttz_nxv2i32_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -1092,23 +1092,23 @@ define @vp_cttz_nxv4i32( %va, @vp_cttz_nxv4i32_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vsub.vx v10, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vnot.v v10, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v10, v8 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -1172,23 +1172,23 @@ define @vp_cttz_nxv8i32( %va, @vp_cttz_nxv8i32_unmasked( %va, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vsub.vx v12, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: vnot.v v12, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v12, v8 +; CHECK-NEXT: vsrl.vi v12, v8, 1 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vadd.vv 
v8, v8, v12 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -1252,23 +1252,23 @@ define @vp_cttz_nxv16i32( %va, @vp_cttz_nxv16i32_unmasked( %va, i ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vsub.vx v16, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: vnot.v v16, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v16, v8 +; CHECK-NEXT: vsrl.vi v16, v8, 1 ; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vsrl.vi v16, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -1332,78 +1332,78 @@ define @vp_cttz_nxv1i64( %va, @vp_cttz_nxv1i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vnot.v v9, v8 +; RV32-NEXT: vsub.vx v8, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vand.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsub.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1462,37 +1462,37 @@ define @vp_cttz_nxv1i64_unmasked( %va, i32 ; RV64-LABEL: vp_cttz_nxv1i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; 
RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v9, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1514,78 +1514,78 @@ define @vp_cttz_nxv2i64( %va, @vp_cttz_nxv2i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vnot.v v10, v8 +; RV32-NEXT: vsub.vx v8, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vand.vv v10, v10, v12 +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; 
RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1644,37 +1644,37 @@ define @vp_cttz_nxv2i64_unmasked( %va, i32 ; RV64-LABEL: vp_cttz_nxv2i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v10, v10, a0 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vx v10, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1696,78 +1696,78 @@ define @vp_cttz_nxv4i64( %va, @vp_cttz_nxv4i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vnot.v v12, v8 +; RV32-NEXT: vsub.vx v8, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsub.vv v8, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: 
vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1826,37 +1826,37 @@ define @vp_cttz_nxv4i64_unmasked( %va, i32 ; RV64-LABEL: vp_cttz_nxv4i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsub.vx v12, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vx v12, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1878,78 +1878,78 @@ define @vp_cttz_nxv7i64( %va, @vp_cttz_nxv7i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vnot.v v16, v8 +; RV32-NEXT: vsub.vx v8, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v24, v16, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: 
vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret @@ -2008,37 +2008,37 @@ define @vp_cttz_nxv7i64_unmasked( %va, i32 ; RV64-LABEL: vp_cttz_nxv7i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -2060,78 +2060,78 @@ define @vp_cttz_nxv8i64( %va, @vp_cttz_nxv8i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vnot.v v16, v8 +; RV32-NEXT: vsub.vx v8, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v24, v16, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; 
RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret @@ -2190,37 +2190,37 @@ define @vp_cttz_nxv8i64_unmasked( %va, i32 ; RV64-LABEL: vp_cttz_nxv8i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -2253,37 +2253,44 @@ define @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: sub a2, a0, a1 -; RV32-NEXT: sltu a3, a0, a2 -; RV32-NEXT: addi a3, a3, -1 -; RV32-NEXT: and a3, a3, a2 ; RV32-NEXT: li a2, 1 +; RV32-NEXT: lui a3, 349525 +; RV32-NEXT: lui a4, 209715 +; RV32-NEXT: sub a5, a0, a1 +; RV32-NEXT: addi a3, a3, 1365 +; RV32-NEXT: addi a4, a4, 819 +; RV32-NEXT: vsetvli a6, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v0, a3 +; RV32-NEXT: sltu a3, a0, a5 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: and a3, a3, a5 ; 
RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v24, v16, a2 ; RV32-NEXT: vnot.v v16, v16 ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsrl.vi v24, v16, 1 -; RV32-NEXT: lui a4, 349525 -; RV32-NEXT: addi a4, a4, 1365 -; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v0, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: li a6, 24 +; RV32-NEXT: mul a5, a5, a6 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v0, (a5) # Unknown-size Folded Spill ; RV32-NEXT: vand.vv v24, v24, v0 ; RV32-NEXT: vsub.vv v16, v16, v24 -; RV32-NEXT: lui a4, 209715 -; RV32-NEXT: addi a4, a4, 819 ; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v0, a4 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma @@ -2653,8 +2658,10 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: vsrl.vi v24, v16, 4 ; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: lui a4, 61681 +; RV32-NEXT: lui a5, 4112 ; RV32-NEXT: addi a4, a4, -241 -; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma +; RV32-NEXT: addi a5, a5, 257 +; RV32-NEXT: vsetvli a6, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 3 @@ -2663,10 +2670,8 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: lui a4, 4112 -; RV32-NEXT: addi a4, a4, 257 -; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a4 +; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a5 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma @@ -2722,42 +2727,42 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV64-LABEL: vp_cttz_nxv16i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: sub a2, a0, a1 -; RV64-NEXT: sltu a3, a0, a2 -; RV64-NEXT: addi a3, a3, -1 -; RV64-NEXT: and a3, a3, a2 ; RV64-NEXT: li a2, 1 -; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; RV64-NEXT: lui a3, 349525 +; RV64-NEXT: lui a4, 209715 +; RV64-NEXT: lui a5, 61681 +; RV64-NEXT: lui a6, 4112 +; RV64-NEXT: sub a7, a0, a1 +; RV64-NEXT: addiw a3, a3, 1365 +; RV64-NEXT: addiw a4, a4, 819 +; RV64-NEXT: addiw t0, a5, -241 +; RV64-NEXT: addiw t1, a6, 257 +; RV64-NEXT: slli a6, a3, 32 +; RV64-NEXT: add a6, a3, a6 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a5, a4, a5 +; RV64-NEXT: slli a3, t0, 32 +; RV64-NEXT: add a3, t0, a3 +; RV64-NEXT: slli a4, t1, 32 +; RV64-NEXT: add a4, t1, a4 +; RV64-NEXT: sltu t0, a0, a7 +; RV64-NEXT: addi t0, t0, -1 +; RV64-NEXT: and a7, t0, a7 +; RV64-NEXT: vsetvli zero, a7, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v24, v16, a2 ; RV64-NEXT: vnot.v v16, v16 ; RV64-NEXT: vand.vv v16, v16, v24 ; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: lui a3, 349525 -; RV64-NEXT: addiw a3, a3, 1365 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v24, a3 +; RV64-NEXT: vand.vx v24, v24, a6 ; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: lui a4, 209715 -; RV64-NEXT: addiw a4, a4, 819 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v24, v16, a4 +; RV64-NEXT: vand.vx v24, v16, a5 ; RV64-NEXT: vsrl.vi v16, 
v16, 2 -; RV64-NEXT: vand.vx v16, v16, a4 +; RV64-NEXT: vand.vx v16, v16, a5 ; RV64-NEXT: vadd.vv v16, v24, v16 ; RV64-NEXT: vsrl.vi v24, v16, 4 ; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: lui a5, 61681 -; RV64-NEXT: addiw a5, a5, -241 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vand.vx v16, v16, a5 -; RV64-NEXT: lui a6, 4112 -; RV64-NEXT: addiw a6, a6, 257 -; RV64-NEXT: slli a7, a6, 32 -; RV64-NEXT: add a6, a6, a7 -; RV64-NEXT: vmul.vx v16, v16, a6 +; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vmul.vx v16, v16, a4 ; RV64-NEXT: li a7, 56 ; RV64-NEXT: vsrl.vx v16, v16, a7 ; RV64-NEXT: bltu a0, a1, .LBB47_2 @@ -2769,16 +2774,16 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: vand.vx v24, v24, a3 +; RV64-NEXT: vand.vx v24, v24, a6 ; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: vand.vx v24, v8, a4 +; RV64-NEXT: vand.vx v24, v8, a5 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a4 +; RV64-NEXT: vand.vx v8, v8, a5 ; RV64-NEXT: vadd.vv v8, v24, v8 ; RV64-NEXT: vsrl.vi v24, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: vand.vx v8, v8, a5 -; RV64-NEXT: vmul.vx v8, v8, a6 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: vsrl.vx v8, v8, a7 ; RV64-NEXT: ret ; @@ -2807,6 +2812,7 @@ define @vp_cttz_zero_undef_nxv1i8( %va, @vp_cttz_zero_undef_nxv1i8( %va, @vp_cttz_zero_undef_nxv2i8( %va, @vp_cttz_zero_undef_nxv2i8( %va, @vp_cttz_zero_undef_nxv4i8( %va, @vp_cttz_zero_undef_nxv4i8( %va, @vp_cttz_zero_undef_nxv8i8( %va, @vp_cttz_zero_undef_nxv8i8( %va, @vp_cttz_zero_undef_nxv16i8( %va, @vp_cttz_zero_undef_nxv16i8( %va, @vp_cttz_zero_undef_nxv32i8( %va, @vp_cttz_zero_undef_nxv32i8_unmasked( @vp_cttz_zero_undef_nxv64i8( %va, @vp_cttz_zero_undef_nxv64i8_unmasked( @vp_cttz_zero_undef_nxv1i16( %va, @vp_cttz_zero_undef_nxv2i16( %va, @vp_cttz_zero_undef_nxv4i16( %va, @vp_cttz_zero_undef_nxv8i16( %va, @vp_cttz_zero_undef_nxv16i16( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vrsub.vi v12, v8, 0, v0.t +; CHECK-NEXT: li a0, 127 ; CHECK-NEXT: vand.vv v8, v8, v12, v0.t ; CHECK-NEXT: vfwcvt.f.xu.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vsrl.vi v8, v16, 23, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t -; CHECK-NEXT: li a0, 127 ; CHECK-NEXT: vsub.vx v8, v16, a0, v0.t ; CHECK-NEXT: ret ; @@ -3434,23 +3439,23 @@ define @vp_cttz_zero_undef_nxv32i16( %va, ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vsub.vx v16, v8, a1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v16, v0.t ; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v16, v16, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v16, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v16, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v16, v8, v0.t -; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: vsub.vv v16, v8, v16, v0.t +; CHECK-NEXT: vand.vx v8, v16, a0, v0.t +; CHECK-NEXT: vsrl.vi v16, v16, 2, v0.t +; CHECK-NEXT: vand.vx v16, v16, a0, v0.t ; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: addi a0, a0, -241 +; CHECK-NEXT: vadd.vv v8, v8, 
v16, v0.t +; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -3471,24 +3476,24 @@ define @vp_cttz_zero_undef_nxv32i16_unmasked( @vp_cttz_zero_undef_nxv1i32( %va, @vp_cttz_zero_undef_nxv1i32_unmasked( @vp_cttz_zero_undef_nxv2i32( %va, @vp_cttz_zero_undef_nxv2i32_unmasked( @vp_cttz_zero_undef_nxv4i32( %va, @vp_cttz_zero_undef_nxv4i32_unmasked( @vp_cttz_zero_undef_nxv8i32( %va, @vp_cttz_zero_undef_nxv8i32_unmasked( @vp_cttz_zero_undef_nxv16i32( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0, v0.t -; CHECK-NEXT: vand.vv v8, v8, v16, v0.t ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: li a1, 127 +; CHECK-NEXT: vand.vv v8, v8, v16, v0.t ; CHECK-NEXT: vfcvt.f.xu.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 23, v0.t -; CHECK-NEXT: li a1, 127 ; CHECK-NEXT: vsub.vx v8, v8, a1, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: ret @@ -3725,8 +3730,8 @@ define @vp_cttz_zero_undef_nxv16i32_unmasked( @vp_cttz_zero_undef_nxv1i64( %va, @vp_cttz_zero_undef_nxv1i64_unmasked( @vp_cttz_zero_undef_nxv2i64( %va, @vp_cttz_zero_undef_nxv2i64_unmasked( @vp_cttz_zero_undef_nxv4i64( %va, @vp_cttz_zero_undef_nxv4i64_unmasked( @vp_cttz_zero_undef_nxv7i64( %va, @vp_cttz_zero_undef_nxv7i64_unmasked( @vp_cttz_zero_undef_nxv8i64( %va, @vp_cttz_zero_undef_nxv8i64_unmasked( @vp_cttz_zero_undef_nxv16i64( %va, ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: fsrmi a3, 1 ; CHECK-NEXT: srli a2, a1, 3 -; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; CHECK-NEXT: sub a4, a0, a1 +; CHECK-NEXT: vsetvli a5, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: sltu a2, a0, a4 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a4, a2, a4 +; CHECK-NEXT: li a2, 52 +; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v8, v16, 0, v0.t ; CHECK-NEXT: vand.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsrmi a2, 1 ; CHECK-NEXT: vfcvt.f.xu.v v8, v8, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: li a2, 52 +; CHECK-NEXT: fsrm a3 ; CHECK-NEXT: vsrl.vx v8, v8, a2, v0.t ; CHECK-NEXT: li a3, 1023 ; CHECK-NEXT: vsub.vx v8, v8, a3, v0.t @@ -4034,8 +4039,8 @@ define @vp_cttz_zero_undef_nxv16i64( %va, ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0, v0.t -; CHECK-NEXT: vand.vv v8, v8, v16, v0.t ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vand.vv v8, v8, v16, v0.t ; CHECK-NEXT: vfcvt.f.xu.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsub.vx v8, v8, a3, v0.t @@ -4055,12 +4060,12 @@ define @vp_cttz_zero_undef_nxv16i64( %va, ; CHECK-ZVBB-NEXT: vmv1r.v v24, v0 ; CHECK-ZVBB-NEXT: csrr a1, vlenb ; CHECK-ZVBB-NEXT: srli a2, a1, 3 -; CHECK-ZVBB-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; CHECK-ZVBB-NEXT: sub a3, a0, a1 +; CHECK-ZVBB-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-ZVBB-NEXT: sub a2, a0, a1 -; CHECK-ZVBB-NEXT: sltu a3, a0, a2 -; CHECK-ZVBB-NEXT: addi a3, a3, -1 -; CHECK-ZVBB-NEXT: and a2, a3, a2 +; CHECK-ZVBB-NEXT: sltu a2, a0, a3 +; CHECK-ZVBB-NEXT: addi a2, a2, -1 +; CHECK-ZVBB-NEXT: and a2, a2, a3 ; CHECK-ZVBB-NEXT: vsetvli zero, a2, 
e64, m8, ta, ma ; CHECK-ZVBB-NEXT: vctz.v v16, v16, v0.t ; CHECK-ZVBB-NEXT: bltu a0, a1, .LBB94_2 @@ -4079,17 +4084,17 @@ define @vp_cttz_zero_undef_nxv16i64_unmasked( @vp_zero_undef_cttz_nxv1i9( %va, This Inner Loop Header: Depth=1 ; NO-SINK-NEXT: vl2re32.v v10, (a6) +; NO-SINK-NEXT: sub a7, a7, a3 ; NO-SINK-NEXT: vadd.vv v10, v10, v8 ; NO-SINK-NEXT: vs2r.v v10, (a6) -; NO-SINK-NEXT: sub a7, a7, a2 ; NO-SINK-NEXT: add a6, a6, a5 ; NO-SINK-NEXT: bnez a7, .LBB1_3 ; NO-SINK-NEXT: # %bb.4: # %middle.block ; NO-SINK-NEXT: beqz a4, .LBB1_7 ; NO-SINK-NEXT: .LBB1_5: # %for.body.preheader -; NO-SINK-NEXT: slli a2, a3, 2 -; NO-SINK-NEXT: add a2, a0, a2 +; NO-SINK-NEXT: slli a2, a2, 2 ; NO-SINK-NEXT: lui a3, 1 +; NO-SINK-NEXT: add a2, a0, a2 ; NO-SINK-NEXT: add a0, a0, a3 ; NO-SINK-NEXT: .LBB1_6: # %for.body ; NO-SINK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -130,34 +130,34 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) { ; SINK-LABEL: sink_splat_add_scalable: ; SINK: # %bb.0: # %entry ; SINK-NEXT: csrr a5, vlenb -; SINK-NEXT: srli a2, a5, 1 -; SINK-NEXT: li a3, 1024 -; SINK-NEXT: bgeu a3, a2, .LBB1_2 +; SINK-NEXT: srli a3, a5, 1 +; SINK-NEXT: li a2, 1024 +; SINK-NEXT: bgeu a2, a3, .LBB1_2 ; SINK-NEXT: # %bb.1: -; SINK-NEXT: li a3, 0 +; SINK-NEXT: li a2, 0 ; SINK-NEXT: j .LBB1_5 ; SINK-NEXT: .LBB1_2: # %vector.ph -; SINK-NEXT: addi a3, a2, -1 -; SINK-NEXT: andi a4, a3, 1024 -; SINK-NEXT: xori a3, a4, 1024 +; SINK-NEXT: addi a2, a3, -1 +; SINK-NEXT: andi a4, a2, 1024 +; SINK-NEXT: xori a2, a4, 1024 ; SINK-NEXT: slli a5, a5, 1 ; SINK-NEXT: mv a6, a0 -; SINK-NEXT: mv a7, a3 +; SINK-NEXT: mv a7, a2 ; SINK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; SINK-NEXT: .LBB1_3: # %vector.body ; SINK-NEXT: # =>This Inner Loop Header: Depth=1 ; SINK-NEXT: vl2re32.v v8, (a6) +; SINK-NEXT: sub a7, a7, a3 ; SINK-NEXT: vadd.vx v8, v8, a1 ; SINK-NEXT: vs2r.v v8, (a6) -; SINK-NEXT: sub a7, a7, a2 ; SINK-NEXT: add a6, a6, a5 ; SINK-NEXT: bnez a7, .LBB1_3 ; SINK-NEXT: # %bb.4: # %middle.block ; SINK-NEXT: beqz a4, .LBB1_7 ; SINK-NEXT: .LBB1_5: # %for.body.preheader -; SINK-NEXT: slli a2, a3, 2 -; SINK-NEXT: add a2, a0, a2 +; SINK-NEXT: slli a2, a2, 2 ; SINK-NEXT: lui a3, 1 +; SINK-NEXT: add a2, a0, a2 ; SINK-NEXT: add a0, a0, a3 ; SINK-NEXT: .LBB1_6: # %for.body ; SINK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -172,34 +172,34 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) { ; DEFAULT-LABEL: sink_splat_add_scalable: ; DEFAULT: # %bb.0: # %entry ; DEFAULT-NEXT: csrr a5, vlenb -; DEFAULT-NEXT: srli a2, a5, 1 -; DEFAULT-NEXT: li a3, 1024 -; DEFAULT-NEXT: bgeu a3, a2, .LBB1_2 +; DEFAULT-NEXT: srli a3, a5, 1 +; DEFAULT-NEXT: li a2, 1024 +; DEFAULT-NEXT: bgeu a2, a3, .LBB1_2 ; DEFAULT-NEXT: # %bb.1: -; DEFAULT-NEXT: li a3, 0 +; DEFAULT-NEXT: li a2, 0 ; DEFAULT-NEXT: j .LBB1_5 ; DEFAULT-NEXT: .LBB1_2: # %vector.ph -; DEFAULT-NEXT: addi a3, a2, -1 -; DEFAULT-NEXT: andi a4, a3, 1024 -; DEFAULT-NEXT: xori a3, a4, 1024 +; DEFAULT-NEXT: addi a2, a3, -1 +; DEFAULT-NEXT: andi a4, a2, 1024 +; DEFAULT-NEXT: xori a2, a4, 1024 ; DEFAULT-NEXT: slli a5, a5, 1 ; DEFAULT-NEXT: mv a6, a0 -; DEFAULT-NEXT: mv a7, a3 +; DEFAULT-NEXT: mv a7, a2 ; DEFAULT-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; DEFAULT-NEXT: .LBB1_3: # %vector.body ; DEFAULT-NEXT: # =>This Inner Loop Header: Depth=1 ; DEFAULT-NEXT: vl2re32.v v8, (a6) +; DEFAULT-NEXT: sub a7, a7, a3 ; DEFAULT-NEXT: vadd.vx v8, v8, a1 ; DEFAULT-NEXT: vs2r.v v8, (a6) -; DEFAULT-NEXT: sub a7, a7, a2 ; DEFAULT-NEXT: add a6, a6, a5 ; DEFAULT-NEXT: 
bnez a7, .LBB1_3 ; DEFAULT-NEXT: # %bb.4: # %middle.block ; DEFAULT-NEXT: beqz a4, .LBB1_7 ; DEFAULT-NEXT: .LBB1_5: # %for.body.preheader -; DEFAULT-NEXT: slli a2, a3, 2 -; DEFAULT-NEXT: add a2, a0, a2 +; DEFAULT-NEXT: slli a2, a2, 2 ; DEFAULT-NEXT: lui a3, 1 +; DEFAULT-NEXT: add a2, a0, a2 ; DEFAULT-NEXT: add a0, a0, a3 ; DEFAULT-NEXT: .LBB1_6: # %for.body ; DEFAULT-NEXT: # =>This Inner Loop Header: Depth=1 @@ -407,34 +407,34 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; NO-SINK-LABEL: sink_splat_fadd_scalable: ; NO-SINK: # %bb.0: # %entry ; NO-SINK-NEXT: csrr a1, vlenb -; NO-SINK-NEXT: srli a2, a1, 2 -; NO-SINK-NEXT: li a3, 1024 -; NO-SINK-NEXT: bgeu a3, a2, .LBB4_2 +; NO-SINK-NEXT: srli a3, a1, 2 +; NO-SINK-NEXT: li a2, 1024 +; NO-SINK-NEXT: bgeu a2, a3, .LBB4_2 ; NO-SINK-NEXT: # %bb.1: -; NO-SINK-NEXT: li a3, 0 +; NO-SINK-NEXT: li a2, 0 ; NO-SINK-NEXT: j .LBB4_5 ; NO-SINK-NEXT: .LBB4_2: # %vector.ph -; NO-SINK-NEXT: addi a3, a2, -1 -; NO-SINK-NEXT: andi a4, a3, 1024 -; NO-SINK-NEXT: xori a3, a4, 1024 +; NO-SINK-NEXT: addi a2, a3, -1 +; NO-SINK-NEXT: andi a4, a2, 1024 +; NO-SINK-NEXT: xori a2, a4, 1024 ; NO-SINK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; NO-SINK-NEXT: vfmv.v.f v8, fa0 ; NO-SINK-NEXT: mv a5, a0 -; NO-SINK-NEXT: mv a6, a3 +; NO-SINK-NEXT: mv a6, a2 ; NO-SINK-NEXT: .LBB4_3: # %vector.body ; NO-SINK-NEXT: # =>This Inner Loop Header: Depth=1 ; NO-SINK-NEXT: vl1re32.v v9, (a5) +; NO-SINK-NEXT: sub a6, a6, a3 ; NO-SINK-NEXT: vfadd.vv v9, v9, v8 ; NO-SINK-NEXT: vs1r.v v9, (a5) -; NO-SINK-NEXT: sub a6, a6, a2 ; NO-SINK-NEXT: add a5, a5, a1 ; NO-SINK-NEXT: bnez a6, .LBB4_3 ; NO-SINK-NEXT: # %bb.4: # %middle.block ; NO-SINK-NEXT: beqz a4, .LBB4_7 ; NO-SINK-NEXT: .LBB4_5: # %for.body.preheader -; NO-SINK-NEXT: slli a1, a3, 2 -; NO-SINK-NEXT: add a1, a0, a1 +; NO-SINK-NEXT: slli a1, a2, 2 ; NO-SINK-NEXT: lui a2, 1 +; NO-SINK-NEXT: add a1, a0, a1 ; NO-SINK-NEXT: add a0, a0, a2 ; NO-SINK-NEXT: .LBB4_6: # %for.body ; NO-SINK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -449,33 +449,33 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; SINK-LABEL: sink_splat_fadd_scalable: ; SINK: # %bb.0: # %entry ; SINK-NEXT: csrr a1, vlenb -; SINK-NEXT: srli a2, a1, 2 -; SINK-NEXT: li a3, 1024 -; SINK-NEXT: bgeu a3, a2, .LBB4_2 +; SINK-NEXT: srli a3, a1, 2 +; SINK-NEXT: li a2, 1024 +; SINK-NEXT: bgeu a2, a3, .LBB4_2 ; SINK-NEXT: # %bb.1: -; SINK-NEXT: li a3, 0 +; SINK-NEXT: li a2, 0 ; SINK-NEXT: j .LBB4_5 ; SINK-NEXT: .LBB4_2: # %vector.ph -; SINK-NEXT: addi a3, a2, -1 -; SINK-NEXT: andi a4, a3, 1024 -; SINK-NEXT: xori a3, a4, 1024 +; SINK-NEXT: addi a2, a3, -1 +; SINK-NEXT: andi a4, a2, 1024 +; SINK-NEXT: xori a2, a4, 1024 ; SINK-NEXT: mv a5, a0 -; SINK-NEXT: mv a6, a3 +; SINK-NEXT: mv a6, a2 ; SINK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; SINK-NEXT: .LBB4_3: # %vector.body ; SINK-NEXT: # =>This Inner Loop Header: Depth=1 ; SINK-NEXT: vl1re32.v v8, (a5) +; SINK-NEXT: sub a6, a6, a3 ; SINK-NEXT: vfadd.vf v8, v8, fa0 ; SINK-NEXT: vs1r.v v8, (a5) -; SINK-NEXT: sub a6, a6, a2 ; SINK-NEXT: add a5, a5, a1 ; SINK-NEXT: bnez a6, .LBB4_3 ; SINK-NEXT: # %bb.4: # %middle.block ; SINK-NEXT: beqz a4, .LBB4_7 ; SINK-NEXT: .LBB4_5: # %for.body.preheader -; SINK-NEXT: slli a1, a3, 2 -; SINK-NEXT: add a1, a0, a1 +; SINK-NEXT: slli a1, a2, 2 ; SINK-NEXT: lui a2, 1 +; SINK-NEXT: add a1, a0, a1 ; SINK-NEXT: add a0, a0, a2 ; SINK-NEXT: .LBB4_6: # %for.body ; SINK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -490,33 +490,33 @@ define void @sink_splat_fadd_scalable(ptr 
nocapture %a, float %x) { ; DEFAULT-LABEL: sink_splat_fadd_scalable: ; DEFAULT: # %bb.0: # %entry ; DEFAULT-NEXT: csrr a1, vlenb -; DEFAULT-NEXT: srli a2, a1, 2 -; DEFAULT-NEXT: li a3, 1024 -; DEFAULT-NEXT: bgeu a3, a2, .LBB4_2 +; DEFAULT-NEXT: srli a3, a1, 2 +; DEFAULT-NEXT: li a2, 1024 +; DEFAULT-NEXT: bgeu a2, a3, .LBB4_2 ; DEFAULT-NEXT: # %bb.1: -; DEFAULT-NEXT: li a3, 0 +; DEFAULT-NEXT: li a2, 0 ; DEFAULT-NEXT: j .LBB4_5 ; DEFAULT-NEXT: .LBB4_2: # %vector.ph -; DEFAULT-NEXT: addi a3, a2, -1 -; DEFAULT-NEXT: andi a4, a3, 1024 -; DEFAULT-NEXT: xori a3, a4, 1024 +; DEFAULT-NEXT: addi a2, a3, -1 +; DEFAULT-NEXT: andi a4, a2, 1024 +; DEFAULT-NEXT: xori a2, a4, 1024 ; DEFAULT-NEXT: mv a5, a0 -; DEFAULT-NEXT: mv a6, a3 +; DEFAULT-NEXT: mv a6, a2 ; DEFAULT-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; DEFAULT-NEXT: .LBB4_3: # %vector.body ; DEFAULT-NEXT: # =>This Inner Loop Header: Depth=1 ; DEFAULT-NEXT: vl1re32.v v8, (a5) +; DEFAULT-NEXT: sub a6, a6, a3 ; DEFAULT-NEXT: vfadd.vf v8, v8, fa0 ; DEFAULT-NEXT: vs1r.v v8, (a5) -; DEFAULT-NEXT: sub a6, a6, a2 ; DEFAULT-NEXT: add a5, a5, a1 ; DEFAULT-NEXT: bnez a6, .LBB4_3 ; DEFAULT-NEXT: # %bb.4: # %middle.block ; DEFAULT-NEXT: beqz a4, .LBB4_7 ; DEFAULT-NEXT: .LBB4_5: # %for.body.preheader -; DEFAULT-NEXT: slli a1, a3, 2 -; DEFAULT-NEXT: add a1, a0, a1 +; DEFAULT-NEXT: slli a1, a2, 2 ; DEFAULT-NEXT: lui a2, 1 +; DEFAULT-NEXT: add a1, a0, a1 ; DEFAULT-NEXT: add a0, a0, a2 ; DEFAULT-NEXT: .LBB4_6: # %for.body ; DEFAULT-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll index 92b88054a1d3b..5b82b27a51510 100644 --- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll +++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll @@ -18,24 +18,24 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) { ; RV32-NEXT: lw a2, 12(a2) ; RV32-NEXT: snez t2, a3 ; RV32-NEXT: sltiu t3, a3, 3 -; RV32-NEXT: xori t3, t3, 1 ; RV32-NEXT: sltiu t4, a3, 4 -; RV32-NEXT: xori t4, t4, 1 ; RV32-NEXT: sltiu a3, a3, 2 +; RV32-NEXT: xori t3, t3, 1 +; RV32-NEXT: xori t4, t4, 1 ; RV32-NEXT: xori a3, a3, 1 ; RV32-NEXT: and a3, a3, t0 ; RV32-NEXT: and a2, t4, a2 ; RV32-NEXT: and t0, t3, t1 ; RV32-NEXT: and a7, t2, a7 ; RV32-NEXT: neg a7, a7 -; RV32-NEXT: and a4, a7, a4 -; RV32-NEXT: neg a7, t0 -; RV32-NEXT: and a6, a7, a6 +; RV32-NEXT: neg t0, t0 ; RV32-NEXT: neg a2, a2 +; RV32-NEXT: neg a3, a3 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: and a6, t0, a6 ; RV32-NEXT: and a1, a2, a1 -; RV32-NEXT: neg a2, a3 -; RV32-NEXT: and a2, a2, a5 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: and a3, a3, a5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a4, a4, a6 ; RV32-NEXT: add a1, a4, a1 ; RV32-NEXT: add a0, a1, a0 @@ -54,24 +54,24 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) { ; RV64-NEXT: sext.w a3, a3 ; RV64-NEXT: snez t2, a3 ; RV64-NEXT: sltiu t3, a3, 3 -; RV64-NEXT: xori t3, t3, 1 ; RV64-NEXT: sltiu t4, a3, 4 -; RV64-NEXT: xori t4, t4, 1 ; RV64-NEXT: sltiu a3, a3, 2 +; RV64-NEXT: xori t3, t3, 1 +; RV64-NEXT: xori t4, t4, 1 ; RV64-NEXT: xori a3, a3, 1 ; RV64-NEXT: and a3, a3, t0 ; RV64-NEXT: and a2, t4, a2 ; RV64-NEXT: and t0, t3, t1 ; RV64-NEXT: and a7, t2, a7 ; RV64-NEXT: negw a7, a7 -; RV64-NEXT: and a4, a7, a4 -; RV64-NEXT: negw a7, t0 -; RV64-NEXT: and a6, a7, a6 +; RV64-NEXT: negw t0, t0 ; RV64-NEXT: negw a2, a2 +; RV64-NEXT: negw a3, a3 +; RV64-NEXT: and a4, a7, a4 +; RV64-NEXT: and a6, t0, a6 ; RV64-NEXT: and a1, a2, a1 -; RV64-NEXT: negw a2, a3 -; RV64-NEXT: 
and a2, a2, a5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: and a3, a3, a5 +; RV64-NEXT: add a1, a3, a1 ; RV64-NEXT: add a4, a4, a6 ; RV64-NEXT: add a1, a4, a1 ; RV64-NEXT: addw a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll index fac033e982e10..f1fcaed2762ae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/expandload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll @@ -229,38 +229,40 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8 ; CHECK-RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vmv1r.v v7, v8 ; CHECK-RV32-NEXT: li a2, 128 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v9, v0, 1 +; CHECK-RV32-NEXT: li a3, 32 +; CHECK-RV32-NEXT: vmv.x.s a4, v0 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-RV32-NEXT: vle8.v v8, (a1) +; CHECK-RV32-NEXT: vle8.v v16, (a1) ; CHECK-RV32-NEXT: csrr a1, vlenb ; CHECK-RV32-NEXT: slli a1, a1, 3 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vslidedown.vi v9, v0, 1 -; CHECK-RV32-NEXT: li a1, 32 -; CHECK-RV32-NEXT: vsrl.vx v10, v9, a1 -; CHECK-RV32-NEXT: vmv.x.s a3, v10 -; CHECK-RV32-NEXT: vsrl.vx v10, v0, a1 -; CHECK-RV32-NEXT: vmv.x.s a1, v10 -; CHECK-RV32-NEXT: vmv.x.s a4, v9 -; CHECK-RV32-NEXT: vmv.x.s a5, v0 +; CHECK-RV32-NEXT: vsrl.vx v10, v9, a3 +; CHECK-RV32-NEXT: vsrl.vx v11, v0, a3 +; CHECK-RV32-NEXT: vmv.x.s a1, v9 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-RV32-NEXT: vcpop.m a6, v0 -; CHECK-RV32-NEXT: vsetvli zero, a6, e8, m8, ta, ma +; CHECK-RV32-NEXT: vcpop.m a3, v0 +; CHECK-RV32-NEXT: cpop a4, a4 +; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a5, v10 +; CHECK-RV32-NEXT: vmv.x.s a6, v11 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-RV32-NEXT: vle8.v v8, (a0) -; CHECK-RV32-NEXT: csrr a6, vlenb -; CHECK-RV32-NEXT: slli a6, a6, 4 -; CHECK-RV32-NEXT: add a6, sp, a6 -; CHECK-RV32-NEXT: addi a6, a6, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: csrr a3, vlenb +; CHECK-RV32-NEXT: slli a3, a3, 4 +; CHECK-RV32-NEXT: add a3, sp, a3 +; CHECK-RV32-NEXT: addi a3, a3, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: cpop a1, a1 +; CHECK-RV32-NEXT: cpop a3, a6 ; CHECK-RV32-NEXT: cpop a5, a5 -; CHECK-RV32-NEXT: add a1, a5, a1 -; CHECK-RV32-NEXT: cpop a3, a3 -; CHECK-RV32-NEXT: cpop a4, a4 ; CHECK-RV32-NEXT: add a3, a4, a3 -; CHECK-RV32-NEXT: add a1, a1, a3 +; CHECK-RV32-NEXT: add a1, a1, a5 +; CHECK-RV32-NEXT: add a1, a3, a1 ; CHECK-RV32-NEXT: add a0, a0, a1 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-RV32-NEXT: vcpop.m a1, v7 @@ -269,19 +271,19 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8 ; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v16, v0 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: viota.m v24, v0 ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: li a1, 24 ; 
CHECK-RV32-NEXT: mul a0, a0, a1 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: li a1, 24 ; CHECK-RV32-NEXT: mul a0, a0, a1 @@ -338,23 +340,27 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8 ; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vmv1r.v v7, v8 ; CHECK-RV64-NEXT: li a2, 128 -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-RV64-NEXT: vle8.v v8, (a1) -; CHECK-RV64-NEXT: addi a1, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v9, v0, 1 -; CHECK-RV64-NEXT: vmv.x.s a1, v9 ; CHECK-RV64-NEXT: vmv.x.s a3, v0 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-RV64-NEXT: vle8.v v16, (a1) +; CHECK-RV64-NEXT: csrr a1, vlenb +; CHECK-RV64-NEXT: slli a1, a1, 3 +; CHECK-RV64-NEXT: add a1, sp, a1 +; CHECK-RV64-NEXT: addi a1, a1, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a1, v9 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-RV64-NEXT: vcpop.m a4, v0 ; CHECK-RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma -; CHECK-RV64-NEXT: vle8.v v24, (a0) +; CHECK-RV64-NEXT: vle8.v v8, (a0) ; CHECK-RV64-NEXT: csrr a4, vlenb ; CHECK-RV64-NEXT: slli a4, a4, 4 ; CHECK-RV64-NEXT: add a4, sp, a4 ; CHECK-RV64-NEXT: addi a4, a4, 16 -; CHECK-RV64-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-RV64-NEXT: vcpop.m a4, v7 ; CHECK-RV64-NEXT: cpop a3, a3 @@ -363,25 +369,22 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8 ; CHECK-RV64-NEXT: add a0, a0, a1 ; CHECK-RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma ; CHECK-RV64-NEXT: vle8.v v8, (a0) -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v16, v0 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: viota.m v24, v0 ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: li a1, 24 ; CHECK-RV64-NEXT: mul a0, a0, a1 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: li a1, 24 ; CHECK-RV64-NEXT: mul 
a0, a0, a1 @@ -399,16 +402,15 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8 ; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t -; CHECK-RV64-NEXT: vmv.v.v v16, v8 +; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: li a1, 24 ; CHECK-RV64-NEXT: mul a0, a0, a1 @@ -430,23 +432,23 @@ define <256 x i8> @test_expandload_v256i8_all_ones(ptr %base, <256 x i8> %passth ; CHECK-RV32-LABEL: test_expandload_v256i8_all_ones: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: li a1, 128 +; CHECK-RV32-NEXT: li a2, 32 ; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-RV32-NEXT: vmset.m v8 -; CHECK-RV32-NEXT: li a2, 32 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v9, v8, a2 -; CHECK-RV32-NEXT: vmv.x.s a3, v9 -; CHECK-RV32-NEXT: cpop a3, a3 -; CHECK-RV32-NEXT: vmv.x.s a4, v8 -; CHECK-RV32-NEXT: cpop a4, a4 -; CHECK-RV32-NEXT: add a3, a4, a3 +; CHECK-RV32-NEXT: vmv.x.s a3, v8 ; CHECK-RV32-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-RV32-NEXT: vmv.x.s a4, v9 +; CHECK-RV32-NEXT: cpop a3, a3 ; CHECK-RV32-NEXT: vsrl.vx v9, v8, a2 -; CHECK-RV32-NEXT: vmv.x.s a2, v9 +; CHECK-RV32-NEXT: vmv.x.s a2, v8 +; CHECK-RV32-NEXT: cpop a4, a4 +; CHECK-RV32-NEXT: add a3, a3, a4 +; CHECK-RV32-NEXT: vmv.x.s a4, v9 ; CHECK-RV32-NEXT: cpop a2, a2 -; CHECK-RV32-NEXT: vmv.x.s a4, v8 ; CHECK-RV32-NEXT: cpop a4, a4 -; CHECK-RV32-NEXT: add a2, a4, a2 +; CHECK-RV32-NEXT: add a2, a2, a4 ; CHECK-RV32-NEXT: add a3, a0, a3 ; CHECK-RV32-NEXT: add a2, a3, a2 ; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma @@ -462,8 +464,8 @@ define <256 x i8> @test_expandload_v256i8_all_ones(ptr %base, <256 x i8> %passth ; CHECK-RV64-NEXT: vmset.m v16 ; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma ; CHECK-RV64-NEXT: vmv.x.s a2, v16 -; CHECK-RV64-NEXT: cpop a2, a2 ; CHECK-RV64-NEXT: vslidedown.vi v16, v16, 1 +; CHECK-RV64-NEXT: cpop a2, a2 ; CHECK-RV64-NEXT: vmv.x.s a3, v16 ; CHECK-RV64-NEXT: cpop a3, a3 ; CHECK-RV64-NEXT: add a0, a0, a2 @@ -662,107 +664,76 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x ; CHECK-RV32-NEXT: addi sp, sp, -16 ; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: li a2, 40 -; CHECK-RV32-NEXT: mul a1, a1, a2 +; CHECK-RV32-NEXT: slli a1, a1, 5 ; CHECK-RV32-NEXT: sub sp, sp, a1 -; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV32-NEXT: csrr a1, vlenb ; CHECK-RV32-NEXT: li a2, 24 ; CHECK-RV32-NEXT: mul a1, a1, a2 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 5 -; 
CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: addi a1, a1, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: li a1, 64 -; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-RV32-NEXT: vcpop.m a2, v0 -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-RV32-NEXT: vle16.v v8, (a0) -; CHECK-RV32-NEXT: csrr a2, vlenb -; CHECK-RV32-NEXT: slli a2, a2, 4 -; CHECK-RV32-NEXT: add a2, sp, a2 -; CHECK-RV32-NEXT: addi a2, a2, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 8 +; CHECK-RV32-NEXT: li a2, 32 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a3, v0 ; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-RV32-NEXT: vcpop.m a2, v7 -; CHECK-RV32-NEXT: li a3, 32 +; CHECK-RV32-NEXT: vcpop.m a4, v0 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsrl.vx v25, v0, a3 -; CHECK-RV32-NEXT: vmv.x.s a3, v25 -; CHECK-RV32-NEXT: cpop a3, a3 -; CHECK-RV32-NEXT: vmv.x.s a4, v0 +; CHECK-RV32-NEXT: vsrl.vx v25, v0, a2 +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-RV32-NEXT: vcpop.m a2, v7 +; CHECK-RV32-NEXT: vsetvli zero, a4, e16, m8, ta, ma +; CHECK-RV32-NEXT: vle16.v v16, (a0) +; CHECK-RV32-NEXT: csrr a5, vlenb +; CHECK-RV32-NEXT: slli a5, a5, 4 +; CHECK-RV32-NEXT: add a5, sp, a5 +; CHECK-RV32-NEXT: addi a5, a5, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a4, v25 ; CHECK-RV32-NEXT: cpop a4, a4 -; CHECK-RV32-NEXT: add a3, a4, a3 +; CHECK-RV32-NEXT: cpop a3, a3 +; CHECK-RV32-NEXT: add a3, a3, a4 ; CHECK-RV32-NEXT: slli a3, a3, 1 ; CHECK-RV32-NEXT: add a0, a0, a3 ; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-RV32-NEXT: vle16.v v8, (a0) +; CHECK-RV32-NEXT: vle16.v v16, (a0) ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v8, v0 -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: viota.m v16, v0 ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 5 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 5 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v16, v7 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; 
CHECK-RV32-NEXT: viota.m v8, v7 ; CHECK-RV32-NEXT: vmv1r.v v0, v7 ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV32-NEXT: vmv.v.v v16, v8 ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 5 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 40 -; CHECK-RV32-NEXT: mul a0, a0, a1 ; CHECK-RV32-NEXT: add sp, sp, a0 ; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -774,93 +745,69 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x ; CHECK-RV64-NEXT: addi sp, sp, -16 ; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: li a2, 40 -; CHECK-RV64-NEXT: mul a1, a1, a2 +; CHECK-RV64-NEXT: slli a1, a1, 5 ; CHECK-RV64-NEXT: sub sp, sp, a1 -; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: slli a1, a1, 5 +; CHECK-RV64-NEXT: slli a1, a1, 4 ; CHECK-RV64-NEXT: add a1, sp, a1 ; CHECK-RV64-NEXT: addi a1, a1, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: li a1, 64 -; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-RV64-NEXT: vcpop.m a2, v0 -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-RV64-NEXT: vle16.v v16, (a0) -; CHECK-RV64-NEXT: csrr a2, vlenb -; CHECK-RV64-NEXT: li a3, 24 -; CHECK-RV64-NEXT: mul a2, a2, a3 -; CHECK-RV64-NEXT: add a2, sp, a2 -; CHECK-RV64-NEXT: addi a2, a2, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 8 +; CHECK-RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a2, v0 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-RV64-NEXT: vcpop.m a2, v7 -; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-RV64-NEXT: vmv.x.s a3, v0 -; CHECK-RV64-NEXT: cpop a3, a3 -; CHECK-RV64-NEXT: slli a3, a3, 1 -; CHECK-RV64-NEXT: add a0, a0, a3 -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-RV64-NEXT: vcpop.m a3, v0 +; CHECK-RV64-NEXT: vsetvli zero, a3, e16, m8, ta, ma +; CHECK-RV64-NEXT: vle16.v v24, (a0) +; CHECK-RV64-NEXT: csrr a3, vlenb +; CHECK-RV64-NEXT: li a4, 24 +; CHECK-RV64-NEXT: mul a3, a3, a4 +; CHECK-RV64-NEXT: add a3, sp, a3 +; CHECK-RV64-NEXT: addi a3, a3, 16 +; CHECK-RV64-NEXT: vs8r.v 
v24, (a3) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-RV64-NEXT: vcpop.m a3, v7 +; CHECK-RV64-NEXT: cpop a2, a2 +; CHECK-RV64-NEXT: slli a2, a2, 1 +; CHECK-RV64-NEXT: add a0, a0, a2 +; CHECK-RV64-NEXT: vsetvli zero, a3, e16, m8, ta, ma ; CHECK-RV64-NEXT: vle16.v v16, (a0) ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v16, v0 -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: viota.m v24, v0 ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: li a1, 24 ; CHECK-RV64-NEXT: mul a0, a0, a1 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v16, v7 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: viota.m v8, v7 ; CHECK-RV64-NEXT: vmv1r.v v0, v7 ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 5 +; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 40 -; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: slli a0, a0, 5 ; CHECK-RV64-NEXT: add sp, sp, a0 ; CHECK-RV64-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV64-NEXT: addi sp, sp, 16 @@ -874,17 +821,17 @@ define <128 x i16> @test_expandload_v128i16_all_ones(ptr %base, <128 x i16> %pas ; CHECK-RV32-LABEL: test_expandload_v128i16_all_ones: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: li a1, 64 +; CHECK-RV32-NEXT: li a2, 32 ; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-RV32-NEXT: vle16.v v8, (a0) ; CHECK-RV32-NEXT: vmset.m v16 -; CHECK-RV32-NEXT: li a2, 32 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v17, v16, a2 -; CHECK-RV32-NEXT: vmv.x.s a2, v17 -; CHECK-RV32-NEXT: cpop a2, a2 -; 
CHECK-RV32-NEXT: vmv.x.s a3, v16 +; CHECK-RV32-NEXT: vmv.x.s a2, v16 +; CHECK-RV32-NEXT: vmv.x.s a3, v17 ; CHECK-RV32-NEXT: cpop a3, a3 -; CHECK-RV32-NEXT: add a2, a3, a2 +; CHECK-RV32-NEXT: cpop a2, a2 +; CHECK-RV32-NEXT: add a2, a2, a3 ; CHECK-RV32-NEXT: slli a2, a2, 1 ; CHECK-RV32-NEXT: add a0, a0, a2 ; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma @@ -1069,92 +1016,69 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32> ; CHECK-RV32-NEXT: addi sp, sp, -16 ; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: li a2, 40 -; CHECK-RV32-NEXT: mul a1, a1, a2 +; CHECK-RV32-NEXT: slli a1, a1, 5 ; CHECK-RV32-NEXT: sub sp, sp, a1 -; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 5 +; CHECK-RV32-NEXT: slli a1, a1, 4 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: li a1, 32 -; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-RV32-NEXT: vcpop.m a2, v0 -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-RV32-NEXT: vle32.v v16, (a0) -; CHECK-RV32-NEXT: csrr a2, vlenb -; CHECK-RV32-NEXT: li a3, 24 -; CHECK-RV32-NEXT: mul a2, a2, a3 -; CHECK-RV32-NEXT: add a2, sp, a2 -; CHECK-RV32-NEXT: addi a2, a2, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 4 -; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-RV32-NEXT: vcpop.m a2, v7 -; CHECK-RV32-NEXT: vmv.x.s a3, v0 -; CHECK-RV32-NEXT: cpop a3, a3 -; CHECK-RV32-NEXT: slli a3, a3, 2 -; CHECK-RV32-NEXT: add a0, a0, a3 -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a2, v0 +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-RV32-NEXT: vcpop.m a3, v0 +; CHECK-RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-RV32-NEXT: vle32.v v24, (a0) +; CHECK-RV32-NEXT: csrr a3, vlenb +; CHECK-RV32-NEXT: li a4, 24 +; CHECK-RV32-NEXT: mul a3, a3, a4 +; CHECK-RV32-NEXT: add a3, sp, a3 +; CHECK-RV32-NEXT: addi a3, a3, 16 +; CHECK-RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-RV32-NEXT: vcpop.m a3, v7 +; CHECK-RV32-NEXT: cpop a2, a2 +; CHECK-RV32-NEXT: slli a2, a2, 2 +; CHECK-RV32-NEXT: add a0, a0, a2 +; CHECK-RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-RV32-NEXT: vle32.v v16, (a0) ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v16, v0 -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: viota.m v24, v0 ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: li a1, 24 ; CHECK-RV32-NEXT: mul a0, a0, a1 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; 
CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v16, v7 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: viota.m v8, v7 ; CHECK-RV32-NEXT: vmv1r.v v0, v7 ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 40 -; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: slli a0, a0, 5 ; CHECK-RV32-NEXT: add sp, sp, a0 ; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -1166,92 +1090,69 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32> ; CHECK-RV64-NEXT: addi sp, sp, -16 ; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: li a2, 40 -; CHECK-RV64-NEXT: mul a1, a1, a2 +; CHECK-RV64-NEXT: slli a1, a1, 5 ; CHECK-RV64-NEXT: sub sp, sp, a1 -; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: slli a1, a1, 5 +; CHECK-RV64-NEXT: slli a1, a1, 4 ; CHECK-RV64-NEXT: add a1, sp, a1 ; CHECK-RV64-NEXT: addi a1, a1, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: li a1, 32 -; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-RV64-NEXT: vcpop.m a2, v0 -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-RV64-NEXT: vle32.v v16, (a0) -; CHECK-RV64-NEXT: csrr a2, vlenb -; CHECK-RV64-NEXT: li a3, 24 -; CHECK-RV64-NEXT: mul a2, a2, a3 -; CHECK-RV64-NEXT: add a2, sp, a2 -; CHECK-RV64-NEXT: addi a2, a2, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 4 -; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-RV64-NEXT: vcpop.m a2, v7 -; CHECK-RV64-NEXT: vmv.x.s a3, v0 -; CHECK-RV64-NEXT: cpopw a3, a3 -; 
CHECK-RV64-NEXT: slli a3, a3, 2 -; CHECK-RV64-NEXT: add a0, a0, a3 -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a2, v0 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-RV64-NEXT: vcpop.m a3, v0 +; CHECK-RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-RV64-NEXT: vle32.v v24, (a0) +; CHECK-RV64-NEXT: csrr a3, vlenb +; CHECK-RV64-NEXT: li a4, 24 +; CHECK-RV64-NEXT: mul a3, a3, a4 +; CHECK-RV64-NEXT: add a3, sp, a3 +; CHECK-RV64-NEXT: addi a3, a3, 16 +; CHECK-RV64-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-RV64-NEXT: vcpop.m a3, v7 +; CHECK-RV64-NEXT: cpopw a2, a2 +; CHECK-RV64-NEXT: slli a2, a2, 2 +; CHECK-RV64-NEXT: add a0, a0, a2 +; CHECK-RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-RV64-NEXT: vle32.v v16, (a0) ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v16, v0 -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: viota.m v24, v0 ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: li a1, 24 ; CHECK-RV64-NEXT: mul a0, a0, a1 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v16, v7 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: viota.m v8, v7 ; CHECK-RV64-NEXT: vmv1r.v v0, v7 ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 5 +; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 40 -; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: slli a0, a0, 5 ; CHECK-RV64-NEXT: add sp, sp, a0 ; CHECK-RV64-NEXT: .cfi_def_cfa sp, 16 ; 
CHECK-RV64-NEXT: addi sp, sp, 16 @@ -1420,92 +1321,68 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV32-NEXT: addi sp, sp, -16 ; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: li a2, 40 -; CHECK-RV32-NEXT: mul a1, a1, a2 +; CHECK-RV32-NEXT: slli a1, a1, 5 ; CHECK-RV32-NEXT: sub sp, sp, a1 -; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 5 +; CHECK-RV32-NEXT: slli a1, a1, 4 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-RV32-NEXT: vcpop.m a1, v0 ; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-RV32-NEXT: vle64.v v16, (a0) +; CHECK-RV32-NEXT: vle64.v v24, (a0) ; CHECK-RV32-NEXT: csrr a1, vlenb ; CHECK-RV32-NEXT: li a2, 24 ; CHECK-RV32-NEXT: mul a1, a1, a2 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vmv.x.s a1, v0 +; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 2 ; CHECK-RV32-NEXT: zext.h a1, a1 ; CHECK-RV32-NEXT: cpop a1, a1 ; CHECK-RV32-NEXT: slli a1, a1, 3 ; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 2 ; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-RV32-NEXT: vcpop.m a1, v7 ; CHECK-RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-RV32-NEXT: vle64.v v16, (a0) ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v16, v0 -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: viota.m v24, v0 ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: li a1, 24 ; CHECK-RV32-NEXT: mul a0, a0, a1 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v16, v7 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: viota.m v8, v7 ; CHECK-RV32-NEXT: vmv1r.v v0, v7 ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, 
a0, 16 ; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 40 -; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: slli a0, a0, 5 ; CHECK-RV32-NEXT: add sp, sp, a0 ; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -1517,92 +1394,68 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV64-NEXT: addi sp, sp, -16 ; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: li a2, 40 -; CHECK-RV64-NEXT: mul a1, a1, a2 +; CHECK-RV64-NEXT: slli a1, a1, 5 ; CHECK-RV64-NEXT: sub sp, sp, a1 -; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: slli a1, a1, 5 +; CHECK-RV64-NEXT: slli a1, a1, 4 ; CHECK-RV64-NEXT: add a1, sp, a1 ; CHECK-RV64-NEXT: addi a1, a1, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-RV64-NEXT: vcpop.m a1, v0 ; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-RV64-NEXT: vle64.v v16, (a0) +; CHECK-RV64-NEXT: vle64.v v24, (a0) ; CHECK-RV64-NEXT: csrr a1, vlenb ; CHECK-RV64-NEXT: li a2, 24 ; CHECK-RV64-NEXT: mul a1, a1, a2 ; CHECK-RV64-NEXT: add a1, sp, a1 ; CHECK-RV64-NEXT: addi a1, a1, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vmv.x.s a1, v0 +; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 2 ; CHECK-RV64-NEXT: zext.h a1, a1 ; CHECK-RV64-NEXT: cpopw a1, a1 ; CHECK-RV64-NEXT: slli a1, a1, 3 ; CHECK-RV64-NEXT: add a0, a0, a1 -; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 2 ; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-RV64-NEXT: vcpop.m a1, v7 ; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-RV64-NEXT: vle64.v v16, (a0) ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v16, v0 -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: viota.m v24, v0 ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: li a1, 24 ; CHECK-RV64-NEXT: 
mul a0, a0, a1 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v16, v7 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: viota.m v8, v7 ; CHECK-RV64-NEXT: vmv1r.v v0, v7 ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 5 +; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 40 -; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: slli a0, a0, 5 ; CHECK-RV64-NEXT: add sp, sp, a0 ; CHECK-RV64-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV64-NEXT: addi sp, sp, 16 @@ -1765,26 +1618,26 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: bgez a1, .LBB61_30 ; CHECK-RV32-NEXT: .LBB61_29: # %cond.load109 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 29, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 28 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 28 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: .LBB61_30: # %else110 ; CHECK-RV32-NEXT: slli a2, a3, 2 ; CHECK-RV32-NEXT: li a1, 32 ; CHECK-RV32-NEXT: bgez a2, .LBB61_32 ; CHECK-RV32-NEXT: # %bb.31: # %cond.load113 ; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 30, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a2 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 29 +; CHECK-RV32-NEXT: vmv.s.x v9, a2 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 29 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: .LBB61_32: # %else114 ; CHECK-RV32-NEXT: 
slli a2, a3, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -1792,10 +1645,10 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: bgez a2, .LBB61_34 ; CHECK-RV32-NEXT: # %bb.33: # %cond.load117 ; CHECK-RV32-NEXT: lbu a2, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v17, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v9, a2 ; CHECK-RV32-NEXT: vsetivli zero, 31, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vi v8, v17, 30 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 30 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -1926,13 +1779,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_65: # %cond.load241 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 62 ; CHECK-RV32-NEXT: li a4, 61 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -1943,12 +1796,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: bgez a3, .LBB61_68 ; CHECK-RV32-NEXT: # %bb.67: # %cond.load245 ; CHECK-RV32-NEXT: lbu a3, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v17, a3 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 63 ; CHECK-RV32-NEXT: li a4, 62 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v17, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -2079,13 +1932,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_99: # %cond.load369 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 94 ; CHECK-RV32-NEXT: li a4, 93 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -2096,12 +1949,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: bgez a2, .LBB61_102 ; CHECK-RV32-NEXT: # %bb.101: # %cond.load373 ; CHECK-RV32-NEXT: lbu a2, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 95 ; CHECK-RV32-NEXT: li a4, 94 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -2232,13 +2085,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_133: # %cond.load497 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; 
CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 126 ; CHECK-RV32-NEXT: li a4, 125 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -2249,12 +2102,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: bgez a3, .LBB61_136 ; CHECK-RV32-NEXT: # %bb.135: # %cond.load501 ; CHECK-RV32-NEXT: lbu a3, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v18, a3 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 127 ; CHECK-RV32-NEXT: li a4, 126 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -2385,13 +2238,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_167: # %cond.load625 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 158 ; CHECK-RV32-NEXT: li a4, 157 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -2402,12 +2255,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: bgez a2, .LBB61_170 ; CHECK-RV32-NEXT: # %bb.169: # %cond.load629 ; CHECK-RV32-NEXT: lbu a2, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 159 ; CHECK-RV32-NEXT: li a4, 158 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -2538,16 +2391,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_201: # %cond.load753 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 190 ; CHECK-RV32-NEXT: li a4, 189 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: .LBB61_202: # %else754 ; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2555,12 +2408,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x 
i1> %mask, < ; CHECK-RV32-NEXT: bgez a3, .LBB61_204 ; CHECK-RV32-NEXT: # %bb.203: # %cond.load757 ; CHECK-RV32-NEXT: lbu a3, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v20, a3 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 191 ; CHECK-RV32-NEXT: li a4, 190 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -2691,13 +2544,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_235: # %cond.load881 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 222 ; CHECK-RV32-NEXT: li a4, 221 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -2708,12 +2561,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: bgez a2, .LBB61_238 ; CHECK-RV32-NEXT: # %bb.237: # %cond.load885 ; CHECK-RV32-NEXT: lbu a2, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 223 ; CHECK-RV32-NEXT: li a4, 222 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -2844,16 +2697,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_269: # %cond.load1009 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 254 ; CHECK-RV32-NEXT: li a4, 253 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: .LBB61_270: # %else1010 ; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2861,12 +2714,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: bgez a3, .LBB61_272 ; CHECK-RV32-NEXT: # %bb.271: # %cond.load1013 ; CHECK-RV32-NEXT: lbu a3, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v20, a3 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 255 ; CHECK-RV32-NEXT: li a4, 254 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -4046,326 +3899,326 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 
x i1> %mask, < ; CHECK-RV32-NEXT: j .LBB61_2 ; CHECK-RV32-NEXT: .LBB61_545: # %cond.load1 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vsetivli zero, 2, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 1 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 4 ; CHECK-RV32-NEXT: bnez a1, .LBB61_546 ; CHECK-RV32-NEXT: j .LBB61_3 ; CHECK-RV32-NEXT: .LBB61_546: # %cond.load5 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 2 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 2 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 8 ; CHECK-RV32-NEXT: bnez a1, .LBB61_547 ; CHECK-RV32-NEXT: j .LBB61_4 ; CHECK-RV32-NEXT: .LBB61_547: # %cond.load9 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 4, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 3 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 3 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 16 ; CHECK-RV32-NEXT: bnez a1, .LBB61_548 ; CHECK-RV32-NEXT: j .LBB61_5 ; CHECK-RV32-NEXT: .LBB61_548: # %cond.load13 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 5, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 4 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 32 ; CHECK-RV32-NEXT: bnez a1, .LBB61_549 ; CHECK-RV32-NEXT: j .LBB61_6 ; CHECK-RV32-NEXT: .LBB61_549: # %cond.load17 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 6, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 5 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 5 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 64 ; CHECK-RV32-NEXT: bnez a1, .LBB61_550 ; CHECK-RV32-NEXT: j .LBB61_7 ; CHECK-RV32-NEXT: .LBB61_550: # %cond.load21 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 7, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; 
CHECK-RV32-NEXT: vslideup.vi v8, v16, 6 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 6 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 128 ; CHECK-RV32-NEXT: bnez a1, .LBB61_551 ; CHECK-RV32-NEXT: j .LBB61_8 ; CHECK-RV32-NEXT: .LBB61_551: # %cond.load25 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 8, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 7 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 7 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 256 ; CHECK-RV32-NEXT: bnez a1, .LBB61_552 ; CHECK-RV32-NEXT: j .LBB61_9 ; CHECK-RV32-NEXT: .LBB61_552: # %cond.load29 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 9, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 8 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 8 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 512 ; CHECK-RV32-NEXT: bnez a1, .LBB61_553 ; CHECK-RV32-NEXT: j .LBB61_10 ; CHECK-RV32-NEXT: .LBB61_553: # %cond.load33 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 10, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 9 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 9 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 1024 ; CHECK-RV32-NEXT: bnez a1, .LBB61_554 ; CHECK-RV32-NEXT: j .LBB61_11 ; CHECK-RV32-NEXT: .LBB61_554: # %cond.load37 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 11, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 10 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 10 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 20 ; CHECK-RV32-NEXT: bltz a1, .LBB61_555 ; CHECK-RV32-NEXT: j .LBB61_12 ; CHECK-RV32-NEXT: .LBB61_555: # %cond.load41 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 12, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 11 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 11 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 19 ; 
CHECK-RV32-NEXT: bltz a1, .LBB61_556 ; CHECK-RV32-NEXT: j .LBB61_13 ; CHECK-RV32-NEXT: .LBB61_556: # %cond.load45 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 13, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 12 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 12 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 18 ; CHECK-RV32-NEXT: bltz a1, .LBB61_557 ; CHECK-RV32-NEXT: j .LBB61_14 ; CHECK-RV32-NEXT: .LBB61_557: # %cond.load49 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 14, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 13 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 13 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 17 ; CHECK-RV32-NEXT: bltz a1, .LBB61_558 ; CHECK-RV32-NEXT: j .LBB61_15 ; CHECK-RV32-NEXT: .LBB61_558: # %cond.load53 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 15, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 14 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 14 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 16 ; CHECK-RV32-NEXT: bltz a1, .LBB61_559 ; CHECK-RV32-NEXT: j .LBB61_16 ; CHECK-RV32-NEXT: .LBB61_559: # %cond.load57 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 15 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 15 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 15 ; CHECK-RV32-NEXT: bltz a1, .LBB61_560 ; CHECK-RV32-NEXT: j .LBB61_17 ; CHECK-RV32-NEXT: .LBB61_560: # %cond.load61 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 17, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 16 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 16 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 14 ; CHECK-RV32-NEXT: bltz a1, .LBB61_561 ; CHECK-RV32-NEXT: j .LBB61_18 ; CHECK-RV32-NEXT: .LBB61_561: # %cond.load65 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 18, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: 
vslideup.vi v8, v16, 17 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 17 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 13 ; CHECK-RV32-NEXT: bltz a1, .LBB61_562 ; CHECK-RV32-NEXT: j .LBB61_19 ; CHECK-RV32-NEXT: .LBB61_562: # %cond.load69 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 19, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 18 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 18 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 12 ; CHECK-RV32-NEXT: bltz a1, .LBB61_563 ; CHECK-RV32-NEXT: j .LBB61_20 ; CHECK-RV32-NEXT: .LBB61_563: # %cond.load73 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 20, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 19 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 19 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 11 ; CHECK-RV32-NEXT: bltz a1, .LBB61_564 ; CHECK-RV32-NEXT: j .LBB61_21 ; CHECK-RV32-NEXT: .LBB61_564: # %cond.load77 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 21, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 20 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 20 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 10 ; CHECK-RV32-NEXT: bltz a1, .LBB61_565 ; CHECK-RV32-NEXT: j .LBB61_22 ; CHECK-RV32-NEXT: .LBB61_565: # %cond.load81 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 22, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 21 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 21 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 9 ; CHECK-RV32-NEXT: bltz a1, .LBB61_566 ; CHECK-RV32-NEXT: j .LBB61_23 ; CHECK-RV32-NEXT: .LBB61_566: # %cond.load85 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 23, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 22 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 22 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 8 ; 
CHECK-RV32-NEXT: bltz a1, .LBB61_567 ; CHECK-RV32-NEXT: j .LBB61_24 ; CHECK-RV32-NEXT: .LBB61_567: # %cond.load89 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 24, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 23 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 23 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 7 ; CHECK-RV32-NEXT: bltz a1, .LBB61_568 ; CHECK-RV32-NEXT: j .LBB61_25 ; CHECK-RV32-NEXT: .LBB61_568: # %cond.load93 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 25, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 24 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 24 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 6 ; CHECK-RV32-NEXT: bltz a1, .LBB61_569 ; CHECK-RV32-NEXT: j .LBB61_26 ; CHECK-RV32-NEXT: .LBB61_569: # %cond.load97 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 26, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 25 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 25 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 5 ; CHECK-RV32-NEXT: bltz a1, .LBB61_570 ; CHECK-RV32-NEXT: j .LBB61_27 ; CHECK-RV32-NEXT: .LBB61_570: # %cond.load101 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 27, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 26 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 26 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 4 ; CHECK-RV32-NEXT: bltz a1, .LBB61_571 ; CHECK-RV32-NEXT: j .LBB61_28 ; CHECK-RV32-NEXT: .LBB61_571: # %cond.load105 ; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetivli zero, 28, e8, m1, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a1 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 -; CHECK-RV32-NEXT: vslideup.vi v8, v16, 27 +; CHECK-RV32-NEXT: vmv.s.x v9, a1 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 27 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 3 ; CHECK-RV32-NEXT: bgez a1, .LBB61_1025 ; CHECK-RV32-NEXT: j .LBB61_29 @@ -4373,11 +4226,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: j .LBB61_30 ; CHECK-RV32-NEXT: .LBB61_572: # %cond.load121 ; CHECK-RV32-NEXT: lbu a3, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; 
CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 32 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vi v8, v24, 31 +; CHECK-RV32-NEXT: vslideup.vi v8, v9, 31 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4387,13 +4240,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_573: # %cond.load125 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 33 ; CHECK-RV32-NEXT: li a4, 32 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4403,13 +4256,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_574: # %cond.load129 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 34 ; CHECK-RV32-NEXT: li a4, 33 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4419,13 +4272,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_575: # %cond.load133 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 35 ; CHECK-RV32-NEXT: li a4, 34 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4435,13 +4288,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_576: # %cond.load137 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 36 ; CHECK-RV32-NEXT: li a4, 35 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4451,13 +4304,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_577: # %cond.load141 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, 
a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 37 ; CHECK-RV32-NEXT: li a4, 36 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4467,13 +4320,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_578: # %cond.load145 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 38 ; CHECK-RV32-NEXT: li a4, 37 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4483,13 +4336,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_579: # %cond.load149 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 39 ; CHECK-RV32-NEXT: li a4, 38 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4499,13 +4352,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_580: # %cond.load153 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 40 ; CHECK-RV32-NEXT: li a4, 39 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4515,13 +4368,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_581: # %cond.load157 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 41 ; CHECK-RV32-NEXT: li a4, 40 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4531,13 +4384,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_582: # %cond.load161 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 
-; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 42 ; CHECK-RV32-NEXT: li a4, 41 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4547,13 +4400,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_583: # %cond.load165 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 43 ; CHECK-RV32-NEXT: li a4, 42 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4563,13 +4416,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_584: # %cond.load169 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 44 ; CHECK-RV32-NEXT: li a4, 43 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4579,13 +4432,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_585: # %cond.load173 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 45 ; CHECK-RV32-NEXT: li a4, 44 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4595,13 +4448,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_586: # %cond.load177 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 46 ; CHECK-RV32-NEXT: li a4, 45 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4611,13 +4464,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: 
.LBB61_587: # %cond.load181 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 47 ; CHECK-RV32-NEXT: li a4, 46 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4627,13 +4480,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_588: # %cond.load185 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 48 ; CHECK-RV32-NEXT: li a4, 47 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4643,13 +4496,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_589: # %cond.load189 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 49 ; CHECK-RV32-NEXT: li a4, 48 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4659,13 +4512,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_590: # %cond.load193 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 50 ; CHECK-RV32-NEXT: li a4, 49 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4675,13 +4528,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_591: # %cond.load197 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 51 ; CHECK-RV32-NEXT: li a4, 50 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4691,13 +4544,13 @@ define <512 x 
i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_592: # %cond.load201 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 52 ; CHECK-RV32-NEXT: li a4, 51 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4707,13 +4560,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_593: # %cond.load205 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 53 ; CHECK-RV32-NEXT: li a4, 52 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4723,13 +4576,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_594: # %cond.load209 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 54 ; CHECK-RV32-NEXT: li a4, 53 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4739,13 +4592,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_595: # %cond.load213 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 55 ; CHECK-RV32-NEXT: li a4, 54 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4755,13 +4608,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_596: # %cond.load217 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 56 ; CHECK-RV32-NEXT: li a4, 55 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: 
vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4771,13 +4624,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_597: # %cond.load221 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 57 ; CHECK-RV32-NEXT: li a4, 56 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4787,13 +4640,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_598: # %cond.load225 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 58 ; CHECK-RV32-NEXT: li a4, 57 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4803,13 +4656,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_599: # %cond.load229 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 59 ; CHECK-RV32-NEXT: li a4, 58 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4819,13 +4672,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_600: # %cond.load233 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 60 ; CHECK-RV32-NEXT: li a4, 59 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4835,13 +4688,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_601: # %cond.load237 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v9, a3 ; CHECK-RV32-NEXT: li a3, 61 ; CHECK-RV32-NEXT: li a4, 60 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; 
CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -4852,12 +4705,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: j .LBB61_66 ; CHECK-RV32-NEXT: .LBB61_602: # %cond.load249 ; CHECK-RV32-NEXT: lbu a2, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v17, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v9, a2 ; CHECK-RV32-NEXT: li a2, 64 ; CHECK-RV32-NEXT: li a4, 63 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m1, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v17, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -4867,13 +4720,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_603: # %cond.load253 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 65 ; CHECK-RV32-NEXT: li a4, 64 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -4883,13 +4736,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_604: # %cond.load257 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 66 ; CHECK-RV32-NEXT: li a4, 65 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -4899,13 +4752,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_605: # %cond.load261 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 67 ; CHECK-RV32-NEXT: li a4, 66 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -4915,13 +4768,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_606: # %cond.load265 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 68 ; CHECK-RV32-NEXT: li a4, 67 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; 
CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -4931,13 +4784,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_607: # %cond.load269 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 69 ; CHECK-RV32-NEXT: li a4, 68 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -4947,13 +4800,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_608: # %cond.load273 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 70 ; CHECK-RV32-NEXT: li a4, 69 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -4963,13 +4816,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_609: # %cond.load277 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 71 ; CHECK-RV32-NEXT: li a4, 70 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -4979,13 +4832,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_610: # %cond.load281 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 72 ; CHECK-RV32-NEXT: li a4, 71 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -4995,13 +4848,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_611: # %cond.load285 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 73 ; CHECK-RV32-NEXT: li a4, 72 ; 
CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5011,13 +4864,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_612: # %cond.load289 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 74 ; CHECK-RV32-NEXT: li a4, 73 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5027,13 +4880,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_613: # %cond.load293 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 75 ; CHECK-RV32-NEXT: li a4, 74 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5043,13 +4896,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_614: # %cond.load297 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 76 ; CHECK-RV32-NEXT: li a4, 75 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5059,13 +4912,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_615: # %cond.load301 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 77 ; CHECK-RV32-NEXT: li a4, 76 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5075,13 +4928,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_616: # %cond.load305 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma 
+; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 78 ; CHECK-RV32-NEXT: li a4, 77 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5091,13 +4944,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_617: # %cond.load309 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 79 ; CHECK-RV32-NEXT: li a4, 78 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5107,13 +4960,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_618: # %cond.load313 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 80 ; CHECK-RV32-NEXT: li a4, 79 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5123,13 +4976,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_619: # %cond.load317 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 81 ; CHECK-RV32-NEXT: li a4, 80 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5139,13 +4992,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_620: # %cond.load321 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 82 ; CHECK-RV32-NEXT: li a4, 81 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5155,13 +5008,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_621: # %cond.load325 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x 
v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 83 ; CHECK-RV32-NEXT: li a4, 82 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5171,13 +5024,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_622: # %cond.load329 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 84 ; CHECK-RV32-NEXT: li a4, 83 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5187,13 +5040,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_623: # %cond.load333 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 85 ; CHECK-RV32-NEXT: li a4, 84 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5203,13 +5056,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_624: # %cond.load337 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 86 ; CHECK-RV32-NEXT: li a4, 85 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5219,13 +5072,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_625: # %cond.load341 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 87 ; CHECK-RV32-NEXT: li a4, 86 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5235,13 +5088,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_626: # %cond.load345 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; 
CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 88 ; CHECK-RV32-NEXT: li a4, 87 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5251,13 +5104,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_627: # %cond.load349 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 89 ; CHECK-RV32-NEXT: li a4, 88 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5267,13 +5120,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_628: # %cond.load353 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 90 ; CHECK-RV32-NEXT: li a4, 89 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5283,13 +5136,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_629: # %cond.load357 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 91 ; CHECK-RV32-NEXT: li a4, 90 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5299,13 +5152,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_630: # %cond.load361 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 92 ; CHECK-RV32-NEXT: li a4, 91 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5315,13 +5168,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x 
i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_631: # %cond.load365 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 93 ; CHECK-RV32-NEXT: li a4, 92 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5332,12 +5185,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: j .LBB61_100 ; CHECK-RV32-NEXT: .LBB61_632: # %cond.load377 ; CHECK-RV32-NEXT: lbu a3, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 96 ; CHECK-RV32-NEXT: li a4, 95 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5347,13 +5200,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_633: # %cond.load381 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 97 ; CHECK-RV32-NEXT: li a4, 96 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5363,13 +5216,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_634: # %cond.load385 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 98 ; CHECK-RV32-NEXT: li a4, 97 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5379,13 +5232,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_635: # %cond.load389 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 99 ; CHECK-RV32-NEXT: li a4, 98 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5395,13 +5248,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> 
%mask, < ; CHECK-RV32-NEXT: .LBB61_636: # %cond.load393 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 100 ; CHECK-RV32-NEXT: li a4, 99 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5411,13 +5264,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_637: # %cond.load397 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 101 ; CHECK-RV32-NEXT: li a4, 100 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5427,13 +5280,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_638: # %cond.load401 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 102 ; CHECK-RV32-NEXT: li a4, 101 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5443,13 +5296,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_639: # %cond.load405 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 103 ; CHECK-RV32-NEXT: li a4, 102 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5459,13 +5312,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_640: # %cond.load409 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 104 ; CHECK-RV32-NEXT: li a4, 103 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v 
v8, v16 @@ -5475,13 +5328,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_641: # %cond.load413 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 105 ; CHECK-RV32-NEXT: li a4, 104 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5491,13 +5344,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_642: # %cond.load417 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 106 ; CHECK-RV32-NEXT: li a4, 105 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5507,13 +5360,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_643: # %cond.load421 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 107 ; CHECK-RV32-NEXT: li a4, 106 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5523,13 +5376,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_644: # %cond.load425 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 108 ; CHECK-RV32-NEXT: li a4, 107 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5539,13 +5392,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_645: # %cond.load429 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 109 ; CHECK-RV32-NEXT: li a4, 108 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx 
v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5555,13 +5408,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_646: # %cond.load433 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 110 ; CHECK-RV32-NEXT: li a4, 109 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5571,13 +5424,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_647: # %cond.load437 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 111 ; CHECK-RV32-NEXT: li a4, 110 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5587,13 +5440,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_648: # %cond.load441 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 112 ; CHECK-RV32-NEXT: li a4, 111 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5603,13 +5456,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_649: # %cond.load445 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 113 ; CHECK-RV32-NEXT: li a4, 112 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5619,13 +5472,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_650: # %cond.load449 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 114 ; CHECK-RV32-NEXT: li a4, 113 ; CHECK-RV32-NEXT: 
vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5635,13 +5488,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_651: # %cond.load453 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 115 ; CHECK-RV32-NEXT: li a4, 114 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5651,13 +5504,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_652: # %cond.load457 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 116 ; CHECK-RV32-NEXT: li a4, 115 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5667,13 +5520,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_653: # %cond.load461 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 117 ; CHECK-RV32-NEXT: li a4, 116 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5683,13 +5536,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_654: # %cond.load465 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 118 ; CHECK-RV32-NEXT: li a4, 117 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5699,13 +5552,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_655: # %cond.load469 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; 
CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 119 ; CHECK-RV32-NEXT: li a4, 118 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5715,13 +5568,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_656: # %cond.load473 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 120 ; CHECK-RV32-NEXT: li a4, 119 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5731,13 +5584,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_657: # %cond.load477 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 121 ; CHECK-RV32-NEXT: li a4, 120 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5747,13 +5600,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_658: # %cond.load481 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 122 ; CHECK-RV32-NEXT: li a4, 121 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5763,13 +5616,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_659: # %cond.load485 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 123 ; CHECK-RV32-NEXT: li a4, 122 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5779,13 +5632,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_660: # %cond.load489 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: 
vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 124 ; CHECK-RV32-NEXT: li a4, 123 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5795,13 +5648,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_661: # %cond.load493 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v24, a3 ; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v10, a3 ; CHECK-RV32-NEXT: li a3, 125 ; CHECK-RV32-NEXT: li a4, 124 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 @@ -5812,12 +5665,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: j .LBB61_134 ; CHECK-RV32-NEXT: .LBB61_662: # %cond.load505 ; CHECK-RV32-NEXT: lbu a2, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v18, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v10, a2 ; CHECK-RV32-NEXT: li a2, 128 ; CHECK-RV32-NEXT: li a4, 127 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5827,13 +5680,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_663: # %cond.load509 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 129 ; CHECK-RV32-NEXT: li a4, 128 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5843,13 +5696,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_664: # %cond.load513 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 130 ; CHECK-RV32-NEXT: li a4, 129 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5859,13 +5712,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_665: # %cond.load517 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; 
CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 131 ; CHECK-RV32-NEXT: li a4, 130 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5875,13 +5728,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_666: # %cond.load521 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 132 ; CHECK-RV32-NEXT: li a4, 131 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5891,13 +5744,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_667: # %cond.load525 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 133 ; CHECK-RV32-NEXT: li a4, 132 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5907,13 +5760,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_668: # %cond.load529 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 134 ; CHECK-RV32-NEXT: li a4, 133 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5923,13 +5776,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_669: # %cond.load533 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 135 ; CHECK-RV32-NEXT: li a4, 134 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5939,13 +5792,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_670: # %cond.load537 ; 
CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 136 ; CHECK-RV32-NEXT: li a4, 135 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5955,13 +5808,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_671: # %cond.load541 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 137 ; CHECK-RV32-NEXT: li a4, 136 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5971,13 +5824,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_672: # %cond.load545 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 138 ; CHECK-RV32-NEXT: li a4, 137 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -5987,13 +5840,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_673: # %cond.load549 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 139 ; CHECK-RV32-NEXT: li a4, 138 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6003,13 +5856,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_674: # %cond.load553 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 140 ; CHECK-RV32-NEXT: li a4, 139 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6019,13 +5872,13 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_675: # %cond.load557 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 141 ; CHECK-RV32-NEXT: li a4, 140 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6035,13 +5888,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_676: # %cond.load561 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 142 ; CHECK-RV32-NEXT: li a4, 141 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6051,13 +5904,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_677: # %cond.load565 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 143 ; CHECK-RV32-NEXT: li a4, 142 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6067,13 +5920,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_678: # %cond.load569 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 144 ; CHECK-RV32-NEXT: li a4, 143 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6083,13 +5936,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_679: # %cond.load573 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 145 ; CHECK-RV32-NEXT: li a4, 144 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; 
CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6099,13 +5952,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_680: # %cond.load577 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 146 ; CHECK-RV32-NEXT: li a4, 145 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6115,13 +5968,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_681: # %cond.load581 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 147 ; CHECK-RV32-NEXT: li a4, 146 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6131,13 +5984,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_682: # %cond.load585 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 148 ; CHECK-RV32-NEXT: li a4, 147 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6147,13 +6000,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_683: # %cond.load589 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 149 ; CHECK-RV32-NEXT: li a4, 148 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6163,13 +6016,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_684: # %cond.load593 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 150 ; CHECK-RV32-NEXT: li a4, 149 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; 
CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6179,13 +6032,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_685: # %cond.load597 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 151 ; CHECK-RV32-NEXT: li a4, 150 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6195,13 +6048,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_686: # %cond.load601 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 152 ; CHECK-RV32-NEXT: li a4, 151 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6211,13 +6064,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_687: # %cond.load605 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 153 ; CHECK-RV32-NEXT: li a4, 152 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6227,13 +6080,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_688: # %cond.load609 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 154 ; CHECK-RV32-NEXT: li a4, 153 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6243,13 +6096,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_689: # %cond.load613 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; 
CHECK-RV32-NEXT: li a2, 155 ; CHECK-RV32-NEXT: li a4, 154 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6259,13 +6112,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_690: # %cond.load617 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 156 ; CHECK-RV32-NEXT: li a4, 155 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6275,13 +6128,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_691: # %cond.load621 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 157 ; CHECK-RV32-NEXT: li a4, 156 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6292,479 +6145,479 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: j .LBB61_168 ; CHECK-RV32-NEXT: .LBB61_692: # %cond.load633 ; CHECK-RV32-NEXT: lbu a3, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 160 ; CHECK-RV32-NEXT: li a4, 159 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1 ; CHECK-RV32-NEXT: bnez a3, .LBB61_693 ; CHECK-RV32-NEXT: j .LBB61_172 ; CHECK-RV32-NEXT: .LBB61_693: # %cond.load637 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 161 ; CHECK-RV32-NEXT: li a4, 160 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 2 ; CHECK-RV32-NEXT: bnez a3, .LBB61_694 ; CHECK-RV32-NEXT: j .LBB61_173 ; CHECK-RV32-NEXT: .LBB61_694: # %cond.load641 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; 
CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 162 ; CHECK-RV32-NEXT: li a4, 161 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 4 ; CHECK-RV32-NEXT: bnez a3, .LBB61_695 ; CHECK-RV32-NEXT: j .LBB61_174 ; CHECK-RV32-NEXT: .LBB61_695: # %cond.load645 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 163 ; CHECK-RV32-NEXT: li a4, 162 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 8 ; CHECK-RV32-NEXT: bnez a3, .LBB61_696 ; CHECK-RV32-NEXT: j .LBB61_175 ; CHECK-RV32-NEXT: .LBB61_696: # %cond.load649 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 164 ; CHECK-RV32-NEXT: li a4, 163 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 16 ; CHECK-RV32-NEXT: bnez a3, .LBB61_697 ; CHECK-RV32-NEXT: j .LBB61_176 ; CHECK-RV32-NEXT: .LBB61_697: # %cond.load653 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 165 ; CHECK-RV32-NEXT: li a4, 164 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 32 ; CHECK-RV32-NEXT: bnez a3, .LBB61_698 ; CHECK-RV32-NEXT: j .LBB61_177 ; CHECK-RV32-NEXT: .LBB61_698: # %cond.load657 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 166 ; CHECK-RV32-NEXT: li a4, 165 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; 
CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 64 ; CHECK-RV32-NEXT: bnez a3, .LBB61_699 ; CHECK-RV32-NEXT: j .LBB61_178 ; CHECK-RV32-NEXT: .LBB61_699: # %cond.load661 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 167 ; CHECK-RV32-NEXT: li a4, 166 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 128 ; CHECK-RV32-NEXT: bnez a3, .LBB61_700 ; CHECK-RV32-NEXT: j .LBB61_179 ; CHECK-RV32-NEXT: .LBB61_700: # %cond.load665 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 168 ; CHECK-RV32-NEXT: li a4, 167 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 256 ; CHECK-RV32-NEXT: bnez a3, .LBB61_701 ; CHECK-RV32-NEXT: j .LBB61_180 ; CHECK-RV32-NEXT: .LBB61_701: # %cond.load669 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 169 ; CHECK-RV32-NEXT: li a4, 168 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 512 ; CHECK-RV32-NEXT: bnez a3, .LBB61_702 ; CHECK-RV32-NEXT: j .LBB61_181 ; CHECK-RV32-NEXT: .LBB61_702: # %cond.load673 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 170 ; CHECK-RV32-NEXT: li a4, 169 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1024 ; CHECK-RV32-NEXT: bnez a3, .LBB61_703 ; CHECK-RV32-NEXT: j .LBB61_182 ; CHECK-RV32-NEXT: .LBB61_703: # %cond.load677 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, 
v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 171 ; CHECK-RV32-NEXT: li a4, 170 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 20 ; CHECK-RV32-NEXT: bltz a3, .LBB61_704 ; CHECK-RV32-NEXT: j .LBB61_183 ; CHECK-RV32-NEXT: .LBB61_704: # %cond.load681 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 172 ; CHECK-RV32-NEXT: li a4, 171 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 19 ; CHECK-RV32-NEXT: bltz a3, .LBB61_705 ; CHECK-RV32-NEXT: j .LBB61_184 ; CHECK-RV32-NEXT: .LBB61_705: # %cond.load685 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 173 ; CHECK-RV32-NEXT: li a4, 172 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 18 ; CHECK-RV32-NEXT: bltz a3, .LBB61_706 ; CHECK-RV32-NEXT: j .LBB61_185 ; CHECK-RV32-NEXT: .LBB61_706: # %cond.load689 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 174 ; CHECK-RV32-NEXT: li a4, 173 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 17 ; CHECK-RV32-NEXT: bltz a3, .LBB61_707 ; CHECK-RV32-NEXT: j .LBB61_186 ; CHECK-RV32-NEXT: .LBB61_707: # %cond.load693 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 175 ; CHECK-RV32-NEXT: li a4, 174 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 
1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 16 ; CHECK-RV32-NEXT: bltz a3, .LBB61_708 ; CHECK-RV32-NEXT: j .LBB61_187 ; CHECK-RV32-NEXT: .LBB61_708: # %cond.load697 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 176 ; CHECK-RV32-NEXT: li a4, 175 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 15 ; CHECK-RV32-NEXT: bltz a3, .LBB61_709 ; CHECK-RV32-NEXT: j .LBB61_188 ; CHECK-RV32-NEXT: .LBB61_709: # %cond.load701 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 177 ; CHECK-RV32-NEXT: li a4, 176 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 14 ; CHECK-RV32-NEXT: bltz a3, .LBB61_710 ; CHECK-RV32-NEXT: j .LBB61_189 ; CHECK-RV32-NEXT: .LBB61_710: # %cond.load705 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 178 ; CHECK-RV32-NEXT: li a4, 177 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 13 ; CHECK-RV32-NEXT: bltz a3, .LBB61_711 ; CHECK-RV32-NEXT: j .LBB61_190 ; CHECK-RV32-NEXT: .LBB61_711: # %cond.load709 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 179 ; CHECK-RV32-NEXT: li a4, 178 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 12 ; CHECK-RV32-NEXT: bltz a3, .LBB61_712 ; CHECK-RV32-NEXT: j .LBB61_191 ; CHECK-RV32-NEXT: .LBB61_712: # %cond.load713 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, 
v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 180 ; CHECK-RV32-NEXT: li a4, 179 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 11 ; CHECK-RV32-NEXT: bltz a3, .LBB61_713 ; CHECK-RV32-NEXT: j .LBB61_192 ; CHECK-RV32-NEXT: .LBB61_713: # %cond.load717 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 181 ; CHECK-RV32-NEXT: li a4, 180 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 10 ; CHECK-RV32-NEXT: bltz a3, .LBB61_714 ; CHECK-RV32-NEXT: j .LBB61_193 ; CHECK-RV32-NEXT: .LBB61_714: # %cond.load721 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 182 ; CHECK-RV32-NEXT: li a4, 181 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 9 ; CHECK-RV32-NEXT: bltz a3, .LBB61_715 ; CHECK-RV32-NEXT: j .LBB61_194 ; CHECK-RV32-NEXT: .LBB61_715: # %cond.load725 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 183 ; CHECK-RV32-NEXT: li a4, 182 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 8 ; CHECK-RV32-NEXT: bltz a3, .LBB61_716 ; CHECK-RV32-NEXT: j .LBB61_195 ; CHECK-RV32-NEXT: .LBB61_716: # %cond.load729 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 184 ; CHECK-RV32-NEXT: li a4, 183 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 
-; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 7 ; CHECK-RV32-NEXT: bltz a3, .LBB61_717 ; CHECK-RV32-NEXT: j .LBB61_196 ; CHECK-RV32-NEXT: .LBB61_717: # %cond.load733 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 185 ; CHECK-RV32-NEXT: li a4, 184 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 6 ; CHECK-RV32-NEXT: bltz a3, .LBB61_718 ; CHECK-RV32-NEXT: j .LBB61_197 ; CHECK-RV32-NEXT: .LBB61_718: # %cond.load737 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 186 ; CHECK-RV32-NEXT: li a4, 185 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 5 ; CHECK-RV32-NEXT: bltz a3, .LBB61_719 ; CHECK-RV32-NEXT: j .LBB61_198 ; CHECK-RV32-NEXT: .LBB61_719: # %cond.load741 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 187 ; CHECK-RV32-NEXT: li a4, 186 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 4 ; CHECK-RV32-NEXT: bltz a3, .LBB61_720 ; CHECK-RV32-NEXT: j .LBB61_199 ; CHECK-RV32-NEXT: .LBB61_720: # %cond.load745 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 188 ; CHECK-RV32-NEXT: li a4, 187 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 3 ; CHECK-RV32-NEXT: bltz a3, .LBB61_721 ; CHECK-RV32-NEXT: j .LBB61_200 ; CHECK-RV32-NEXT: .LBB61_721: # %cond.load749 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; 
CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 189 ; CHECK-RV32-NEXT: li a4, 188 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 2 ; CHECK-RV32-NEXT: bgez a3, .LBB61_1030 ; CHECK-RV32-NEXT: j .LBB61_201 @@ -6772,12 +6625,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: j .LBB61_202 ; CHECK-RV32-NEXT: .LBB61_722: # %cond.load761 ; CHECK-RV32-NEXT: lbu a2, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 192 ; CHECK-RV32-NEXT: li a4, 191 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6787,13 +6640,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_723: # %cond.load765 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 193 ; CHECK-RV32-NEXT: li a4, 192 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6803,13 +6656,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_724: # %cond.load769 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 194 ; CHECK-RV32-NEXT: li a4, 193 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6819,13 +6672,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_725: # %cond.load773 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 195 ; CHECK-RV32-NEXT: li a4, 194 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6835,13 +6688,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, 
<512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_726: # %cond.load777 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 196 ; CHECK-RV32-NEXT: li a4, 195 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6851,13 +6704,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_727: # %cond.load781 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 197 ; CHECK-RV32-NEXT: li a4, 196 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6867,13 +6720,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_728: # %cond.load785 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 198 ; CHECK-RV32-NEXT: li a4, 197 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6883,13 +6736,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_729: # %cond.load789 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 199 ; CHECK-RV32-NEXT: li a4, 198 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6899,13 +6752,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_730: # %cond.load793 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 200 ; CHECK-RV32-NEXT: li a4, 199 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; 
CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6915,13 +6768,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_731: # %cond.load797 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 201 ; CHECK-RV32-NEXT: li a4, 200 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6931,13 +6784,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_732: # %cond.load801 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 202 ; CHECK-RV32-NEXT: li a4, 201 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6947,13 +6800,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_733: # %cond.load805 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 203 ; CHECK-RV32-NEXT: li a4, 202 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6963,13 +6816,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_734: # %cond.load809 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 204 ; CHECK-RV32-NEXT: li a4, 203 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6979,13 +6832,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_735: # %cond.load813 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 205 ; CHECK-RV32-NEXT: li a4, 204 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; 
CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -6995,13 +6848,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_736: # %cond.load817 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 206 ; CHECK-RV32-NEXT: li a4, 205 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7011,13 +6864,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_737: # %cond.load821 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 207 ; CHECK-RV32-NEXT: li a4, 206 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7027,13 +6880,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_738: # %cond.load825 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 208 ; CHECK-RV32-NEXT: li a4, 207 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7043,13 +6896,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_739: # %cond.load829 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 209 ; CHECK-RV32-NEXT: li a4, 208 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7059,13 +6912,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_740: # %cond.load833 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 210 ; CHECK-RV32-NEXT: li 
a4, 209 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7075,13 +6928,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_741: # %cond.load837 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 211 ; CHECK-RV32-NEXT: li a4, 210 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7091,13 +6944,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_742: # %cond.load841 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 212 ; CHECK-RV32-NEXT: li a4, 211 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7107,13 +6960,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_743: # %cond.load845 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 213 ; CHECK-RV32-NEXT: li a4, 212 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7123,13 +6976,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_744: # %cond.load849 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 214 ; CHECK-RV32-NEXT: li a4, 213 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7139,13 +6992,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_745: # %cond.load853 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, 
a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 215 ; CHECK-RV32-NEXT: li a4, 214 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7155,13 +7008,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_746: # %cond.load857 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 216 ; CHECK-RV32-NEXT: li a4, 215 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7171,13 +7024,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_747: # %cond.load861 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 217 ; CHECK-RV32-NEXT: li a4, 216 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7187,13 +7040,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_748: # %cond.load865 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 218 ; CHECK-RV32-NEXT: li a4, 217 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7203,13 +7056,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_749: # %cond.load869 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 219 ; CHECK-RV32-NEXT: li a4, 218 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7219,13 +7072,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_750: # %cond.load873 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma 
-; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 220 ; CHECK-RV32-NEXT: li a4, 219 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7235,13 +7088,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_751: # %cond.load877 ; CHECK-RV32-NEXT: lbu a2, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 -; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 221 ; CHECK-RV32-NEXT: li a4, 220 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -7252,479 +7105,479 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: j .LBB61_236 ; CHECK-RV32-NEXT: .LBB61_752: # %cond.load889 ; CHECK-RV32-NEXT: lbu a3, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 224 ; CHECK-RV32-NEXT: li a4, 223 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1 ; CHECK-RV32-NEXT: bnez a3, .LBB61_753 ; CHECK-RV32-NEXT: j .LBB61_240 ; CHECK-RV32-NEXT: .LBB61_753: # %cond.load893 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 225 ; CHECK-RV32-NEXT: li a4, 224 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 2 ; CHECK-RV32-NEXT: bnez a3, .LBB61_754 ; CHECK-RV32-NEXT: j .LBB61_241 ; CHECK-RV32-NEXT: .LBB61_754: # %cond.load897 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 226 ; CHECK-RV32-NEXT: li a4, 225 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 4 ; 
CHECK-RV32-NEXT: bnez a3, .LBB61_755 ; CHECK-RV32-NEXT: j .LBB61_242 ; CHECK-RV32-NEXT: .LBB61_755: # %cond.load901 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 227 ; CHECK-RV32-NEXT: li a4, 226 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 8 ; CHECK-RV32-NEXT: bnez a3, .LBB61_756 ; CHECK-RV32-NEXT: j .LBB61_243 ; CHECK-RV32-NEXT: .LBB61_756: # %cond.load905 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 228 ; CHECK-RV32-NEXT: li a4, 227 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 16 ; CHECK-RV32-NEXT: bnez a3, .LBB61_757 ; CHECK-RV32-NEXT: j .LBB61_244 ; CHECK-RV32-NEXT: .LBB61_757: # %cond.load909 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 229 ; CHECK-RV32-NEXT: li a4, 228 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 32 ; CHECK-RV32-NEXT: bnez a3, .LBB61_758 ; CHECK-RV32-NEXT: j .LBB61_245 ; CHECK-RV32-NEXT: .LBB61_758: # %cond.load913 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 230 ; CHECK-RV32-NEXT: li a4, 229 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 64 ; CHECK-RV32-NEXT: bnez a3, .LBB61_759 ; CHECK-RV32-NEXT: j .LBB61_246 ; CHECK-RV32-NEXT: .LBB61_759: # %cond.load917 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: 
li a3, 231 ; CHECK-RV32-NEXT: li a4, 230 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 128 ; CHECK-RV32-NEXT: bnez a3, .LBB61_760 ; CHECK-RV32-NEXT: j .LBB61_247 ; CHECK-RV32-NEXT: .LBB61_760: # %cond.load921 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 232 ; CHECK-RV32-NEXT: li a4, 231 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 256 ; CHECK-RV32-NEXT: bnez a3, .LBB61_761 ; CHECK-RV32-NEXT: j .LBB61_248 ; CHECK-RV32-NEXT: .LBB61_761: # %cond.load925 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 233 ; CHECK-RV32-NEXT: li a4, 232 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 512 ; CHECK-RV32-NEXT: bnez a3, .LBB61_762 ; CHECK-RV32-NEXT: j .LBB61_249 ; CHECK-RV32-NEXT: .LBB61_762: # %cond.load929 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 234 ; CHECK-RV32-NEXT: li a4, 233 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1024 ; CHECK-RV32-NEXT: bnez a3, .LBB61_763 ; CHECK-RV32-NEXT: j .LBB61_250 ; CHECK-RV32-NEXT: .LBB61_763: # %cond.load933 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 235 ; CHECK-RV32-NEXT: li a4, 234 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, 
a2, 20 ; CHECK-RV32-NEXT: bltz a3, .LBB61_764 ; CHECK-RV32-NEXT: j .LBB61_251 ; CHECK-RV32-NEXT: .LBB61_764: # %cond.load937 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 236 ; CHECK-RV32-NEXT: li a4, 235 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 19 ; CHECK-RV32-NEXT: bltz a3, .LBB61_765 ; CHECK-RV32-NEXT: j .LBB61_252 ; CHECK-RV32-NEXT: .LBB61_765: # %cond.load941 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 237 ; CHECK-RV32-NEXT: li a4, 236 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 18 ; CHECK-RV32-NEXT: bltz a3, .LBB61_766 ; CHECK-RV32-NEXT: j .LBB61_253 ; CHECK-RV32-NEXT: .LBB61_766: # %cond.load945 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 238 ; CHECK-RV32-NEXT: li a4, 237 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 17 ; CHECK-RV32-NEXT: bltz a3, .LBB61_767 ; CHECK-RV32-NEXT: j .LBB61_254 ; CHECK-RV32-NEXT: .LBB61_767: # %cond.load949 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 239 ; CHECK-RV32-NEXT: li a4, 238 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 16 ; CHECK-RV32-NEXT: bltz a3, .LBB61_768 ; CHECK-RV32-NEXT: j .LBB61_255 ; CHECK-RV32-NEXT: .LBB61_768: # %cond.load953 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; 
CHECK-RV32-NEXT: li a3, 240 ; CHECK-RV32-NEXT: li a4, 239 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 15 ; CHECK-RV32-NEXT: bltz a3, .LBB61_769 ; CHECK-RV32-NEXT: j .LBB61_256 ; CHECK-RV32-NEXT: .LBB61_769: # %cond.load957 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 241 ; CHECK-RV32-NEXT: li a4, 240 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 14 ; CHECK-RV32-NEXT: bltz a3, .LBB61_770 ; CHECK-RV32-NEXT: j .LBB61_257 ; CHECK-RV32-NEXT: .LBB61_770: # %cond.load961 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 242 ; CHECK-RV32-NEXT: li a4, 241 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 13 ; CHECK-RV32-NEXT: bltz a3, .LBB61_771 ; CHECK-RV32-NEXT: j .LBB61_258 ; CHECK-RV32-NEXT: .LBB61_771: # %cond.load965 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 243 ; CHECK-RV32-NEXT: li a4, 242 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 12 ; CHECK-RV32-NEXT: bltz a3, .LBB61_772 ; CHECK-RV32-NEXT: j .LBB61_259 ; CHECK-RV32-NEXT: .LBB61_772: # %cond.load969 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 244 ; CHECK-RV32-NEXT: li a4, 243 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: 
slli a3, a2, 11 ; CHECK-RV32-NEXT: bltz a3, .LBB61_773 ; CHECK-RV32-NEXT: j .LBB61_260 ; CHECK-RV32-NEXT: .LBB61_773: # %cond.load973 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 245 ; CHECK-RV32-NEXT: li a4, 244 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 10 ; CHECK-RV32-NEXT: bltz a3, .LBB61_774 ; CHECK-RV32-NEXT: j .LBB61_261 ; CHECK-RV32-NEXT: .LBB61_774: # %cond.load977 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 246 ; CHECK-RV32-NEXT: li a4, 245 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 9 ; CHECK-RV32-NEXT: bltz a3, .LBB61_775 ; CHECK-RV32-NEXT: j .LBB61_262 ; CHECK-RV32-NEXT: .LBB61_775: # %cond.load981 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 247 ; CHECK-RV32-NEXT: li a4, 246 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 8 ; CHECK-RV32-NEXT: bltz a3, .LBB61_776 ; CHECK-RV32-NEXT: j .LBB61_263 ; CHECK-RV32-NEXT: .LBB61_776: # %cond.load985 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 248 ; CHECK-RV32-NEXT: li a4, 247 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 7 ; CHECK-RV32-NEXT: bltz a3, .LBB61_777 ; CHECK-RV32-NEXT: j .LBB61_264 ; CHECK-RV32-NEXT: .LBB61_777: # %cond.load989 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; 
CHECK-RV32-NEXT: li a3, 249 ; CHECK-RV32-NEXT: li a4, 248 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 6 ; CHECK-RV32-NEXT: bltz a3, .LBB61_778 ; CHECK-RV32-NEXT: j .LBB61_265 ; CHECK-RV32-NEXT: .LBB61_778: # %cond.load993 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 250 ; CHECK-RV32-NEXT: li a4, 249 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 5 ; CHECK-RV32-NEXT: bltz a3, .LBB61_779 ; CHECK-RV32-NEXT: j .LBB61_266 ; CHECK-RV32-NEXT: .LBB61_779: # %cond.load997 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 251 ; CHECK-RV32-NEXT: li a4, 250 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 4 ; CHECK-RV32-NEXT: bltz a3, .LBB61_780 ; CHECK-RV32-NEXT: j .LBB61_267 ; CHECK-RV32-NEXT: .LBB61_780: # %cond.load1001 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 252 ; CHECK-RV32-NEXT: li a4, 251 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 3 ; CHECK-RV32-NEXT: bltz a3, .LBB61_781 ; CHECK-RV32-NEXT: j .LBB61_268 ; CHECK-RV32-NEXT: .LBB61_781: # %cond.load1005 ; CHECK-RV32-NEXT: lbu a3, 0(a0) ; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-RV32-NEXT: vmv.s.x v16, a3 -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a3 ; CHECK-RV32-NEXT: li a3, 253 ; CHECK-RV32-NEXT: li a4, 252 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv4r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: 
slli a3, a2, 2 ; CHECK-RV32-NEXT: bgez a3, .LBB61_1032 ; CHECK-RV32-NEXT: j .LBB61_269 @@ -7732,12 +7585,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: j .LBB61_270 ; CHECK-RV32-NEXT: .LBB61_782: # %cond.load1017 ; CHECK-RV32-NEXT: lbu a2, 0(a0) -; CHECK-RV32-NEXT: vmv.s.x v20, a2 ; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv.s.x v12, a2 ; CHECK-RV32-NEXT: li a2, 256 ; CHECK-RV32-NEXT: li a4, 255 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 @@ -11138,13 +10991,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_62: # %cond.load241 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 62 ; CHECK-RV64-NEXT: li a3, 61 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -11155,12 +11008,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: bgez a1, .LBB61_65 ; CHECK-RV64-NEXT: # %bb.64: # %cond.load245 ; CHECK-RV64-NEXT: lbu a1, 0(a0) -; CHECK-RV64-NEXT: vmv.s.x v17, a1 ; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 63 ; CHECK-RV64-NEXT: li a3, 62 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v17, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v24, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v24 @@ -11419,13 +11272,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_128: # %cond.load497 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 126 ; CHECK-RV64-NEXT: li a3, 125 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -11436,12 +11289,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: bgez a2, .LBB61_131 ; CHECK-RV64-NEXT: # %bb.130: # %cond.load501 ; CHECK-RV64-NEXT: lbu a2, 0(a0) -; CHECK-RV64-NEXT: vmv.s.x v18, a2 ; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 127 ; CHECK-RV64-NEXT: li a3, 126 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v18, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v24, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v24 @@ -11700,16 +11553,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; 
CHECK-RV64-NEXT: .LBB61_194: # %cond.load753 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 190 ; CHECK-RV64-NEXT: li a3, 189 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: .LBB61_195: # %else754 ; CHECK-RV64-NEXT: slli a1, a2, 1 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -11717,12 +11570,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: bgez a1, .LBB61_197 ; CHECK-RV64-NEXT: # %bb.196: # %cond.load757 ; CHECK-RV64-NEXT: lbu a1, 0(a0) -; CHECK-RV64-NEXT: vmv.s.x v20, a1 ; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 191 ; CHECK-RV64-NEXT: li a3, 190 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v20, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v24, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v24 @@ -11981,16 +11834,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_260: # %cond.load1009 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 254 ; CHECK-RV64-NEXT: li a3, 253 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: .LBB61_261: # %else1010 ; CHECK-RV64-NEXT: slli a2, a1, 1 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -11998,12 +11851,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: bgez a2, .LBB61_263 ; CHECK-RV64-NEXT: # %bb.262: # %cond.load1013 ; CHECK-RV64-NEXT: lbu a2, 0(a0) -; CHECK-RV64-NEXT: vmv.s.x v20, a2 ; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 255 ; CHECK-RV64-NEXT: li a3, 254 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v20, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v24, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v24 @@ -13107,374 +12960,374 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: j .LBB61_2 ; CHECK-RV64-NEXT: .LBB61_528: # %cond.load1 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vsetivli zero, 2, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 1 +; CHECK-RV64-NEXT: vslideup.vi v8, 
v9, 1 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 4 ; CHECK-RV64-NEXT: bnez a1, .LBB61_529 ; CHECK-RV64-NEXT: j .LBB61_3 ; CHECK-RV64-NEXT: .LBB61_529: # %cond.load5 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 3, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 2 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 2 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 8 ; CHECK-RV64-NEXT: bnez a1, .LBB61_530 ; CHECK-RV64-NEXT: j .LBB61_4 ; CHECK-RV64-NEXT: .LBB61_530: # %cond.load9 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 4, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 3 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 16 ; CHECK-RV64-NEXT: bnez a1, .LBB61_531 ; CHECK-RV64-NEXT: j .LBB61_5 ; CHECK-RV64-NEXT: .LBB61_531: # %cond.load13 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 5, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 4 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 4 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 32 ; CHECK-RV64-NEXT: bnez a1, .LBB61_532 ; CHECK-RV64-NEXT: j .LBB61_6 ; CHECK-RV64-NEXT: .LBB61_532: # %cond.load17 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 6, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 5 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 5 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 64 ; CHECK-RV64-NEXT: bnez a1, .LBB61_533 ; CHECK-RV64-NEXT: j .LBB61_7 ; CHECK-RV64-NEXT: .LBB61_533: # %cond.load21 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 7, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 6 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 6 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 128 ; CHECK-RV64-NEXT: bnez a1, .LBB61_534 ; CHECK-RV64-NEXT: j .LBB61_8 ; CHECK-RV64-NEXT: .LBB61_534: # %cond.load25 ; CHECK-RV64-NEXT: 
lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 8, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 7 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 7 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 256 ; CHECK-RV64-NEXT: bnez a1, .LBB61_535 ; CHECK-RV64-NEXT: j .LBB61_9 ; CHECK-RV64-NEXT: .LBB61_535: # %cond.load29 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 9, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 8 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 8 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 512 ; CHECK-RV64-NEXT: bnez a1, .LBB61_536 ; CHECK-RV64-NEXT: j .LBB61_10 ; CHECK-RV64-NEXT: .LBB61_536: # %cond.load33 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 10, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 9 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 9 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 1024 ; CHECK-RV64-NEXT: bnez a1, .LBB61_537 ; CHECK-RV64-NEXT: j .LBB61_11 ; CHECK-RV64-NEXT: .LBB61_537: # %cond.load37 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 11, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 10 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 10 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 52 ; CHECK-RV64-NEXT: bltz a1, .LBB61_538 ; CHECK-RV64-NEXT: j .LBB61_12 ; CHECK-RV64-NEXT: .LBB61_538: # %cond.load41 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 12, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 11 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 11 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 51 ; CHECK-RV64-NEXT: bltz a1, .LBB61_539 ; CHECK-RV64-NEXT: j .LBB61_13 ; CHECK-RV64-NEXT: .LBB61_539: # %cond.load45 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 13, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 12 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 12 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; 
CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 50 ; CHECK-RV64-NEXT: bltz a1, .LBB61_540 ; CHECK-RV64-NEXT: j .LBB61_14 ; CHECK-RV64-NEXT: .LBB61_540: # %cond.load49 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 14, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 13 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 13 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 49 ; CHECK-RV64-NEXT: bltz a1, .LBB61_541 ; CHECK-RV64-NEXT: j .LBB61_15 ; CHECK-RV64-NEXT: .LBB61_541: # %cond.load53 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 15, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 14 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 14 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 48 ; CHECK-RV64-NEXT: bltz a1, .LBB61_542 ; CHECK-RV64-NEXT: j .LBB61_16 ; CHECK-RV64-NEXT: .LBB61_542: # %cond.load57 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 15 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 15 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 47 ; CHECK-RV64-NEXT: bltz a1, .LBB61_543 ; CHECK-RV64-NEXT: j .LBB61_17 ; CHECK-RV64-NEXT: .LBB61_543: # %cond.load61 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 17, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 16 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 16 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 46 ; CHECK-RV64-NEXT: bltz a1, .LBB61_544 ; CHECK-RV64-NEXT: j .LBB61_18 ; CHECK-RV64-NEXT: .LBB61_544: # %cond.load65 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 18, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 17 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 17 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 45 ; CHECK-RV64-NEXT: bltz a1, .LBB61_545 ; CHECK-RV64-NEXT: j .LBB61_19 ; CHECK-RV64-NEXT: .LBB61_545: # %cond.load69 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; 
CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 19, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 18 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 18 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 44 ; CHECK-RV64-NEXT: bltz a1, .LBB61_546 ; CHECK-RV64-NEXT: j .LBB61_20 ; CHECK-RV64-NEXT: .LBB61_546: # %cond.load73 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 20, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 19 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 19 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 43 ; CHECK-RV64-NEXT: bltz a1, .LBB61_547 ; CHECK-RV64-NEXT: j .LBB61_21 ; CHECK-RV64-NEXT: .LBB61_547: # %cond.load77 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 21, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 20 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 20 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 42 ; CHECK-RV64-NEXT: bltz a1, .LBB61_548 ; CHECK-RV64-NEXT: j .LBB61_22 ; CHECK-RV64-NEXT: .LBB61_548: # %cond.load81 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 22, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 21 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 21 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 41 ; CHECK-RV64-NEXT: bltz a1, .LBB61_549 ; CHECK-RV64-NEXT: j .LBB61_23 ; CHECK-RV64-NEXT: .LBB61_549: # %cond.load85 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 23, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 22 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 22 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 40 ; CHECK-RV64-NEXT: bltz a1, .LBB61_550 ; CHECK-RV64-NEXT: j .LBB61_24 ; CHECK-RV64-NEXT: .LBB61_550: # %cond.load89 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 24, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 23 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 23 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; 
CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 39 ; CHECK-RV64-NEXT: bltz a1, .LBB61_551 ; CHECK-RV64-NEXT: j .LBB61_25 ; CHECK-RV64-NEXT: .LBB61_551: # %cond.load93 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 25, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 24 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 24 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 38 ; CHECK-RV64-NEXT: bltz a1, .LBB61_552 ; CHECK-RV64-NEXT: j .LBB61_26 ; CHECK-RV64-NEXT: .LBB61_552: # %cond.load97 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 26, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 25 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 25 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 37 ; CHECK-RV64-NEXT: bltz a1, .LBB61_553 ; CHECK-RV64-NEXT: j .LBB61_27 ; CHECK-RV64-NEXT: .LBB61_553: # %cond.load101 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 27, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 26 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 26 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 36 ; CHECK-RV64-NEXT: bltz a1, .LBB61_554 ; CHECK-RV64-NEXT: j .LBB61_28 ; CHECK-RV64-NEXT: .LBB61_554: # %cond.load105 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 28, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 27 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 27 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 35 ; CHECK-RV64-NEXT: bltz a1, .LBB61_555 ; CHECK-RV64-NEXT: j .LBB61_29 ; CHECK-RV64-NEXT: .LBB61_555: # %cond.load109 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 29, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 28 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 28 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 34 ; CHECK-RV64-NEXT: bltz a1, .LBB61_556 ; CHECK-RV64-NEXT: j .LBB61_30 ; CHECK-RV64-NEXT: .LBB61_556: # %cond.load113 ; CHECK-RV64-NEXT: lbu a1, 0(a0) 
+; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 30, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 29 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 29 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 33 ; CHECK-RV64-NEXT: bltz a1, .LBB61_557 ; CHECK-RV64-NEXT: j .LBB61_31 ; CHECK-RV64-NEXT: .LBB61_557: # %cond.load117 ; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetivli zero, 31, e8, m1, tu, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 -; CHECK-RV64-NEXT: vslideup.vi v8, v16, 30 +; CHECK-RV64-NEXT: vmv.s.x v9, a1 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 30 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 32 ; CHECK-RV64-NEXT: bltz a1, .LBB61_558 ; CHECK-RV64-NEXT: j .LBB61_32 ; CHECK-RV64-NEXT: .LBB61_558: # %cond.load121 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 32 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vi v8, v24, 31 +; CHECK-RV64-NEXT: vslideup.vi v8, v9, 31 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13484,13 +13337,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_559: # %cond.load125 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 33 ; CHECK-RV64-NEXT: li a3, 32 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13500,13 +13353,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_560: # %cond.load129 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 34 ; CHECK-RV64-NEXT: li a3, 33 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13516,13 +13369,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_561: # %cond.load133 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; 
CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 35 ; CHECK-RV64-NEXT: li a3, 34 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13532,13 +13385,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_562: # %cond.load137 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 36 ; CHECK-RV64-NEXT: li a3, 35 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13548,13 +13401,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_563: # %cond.load141 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 37 ; CHECK-RV64-NEXT: li a3, 36 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13564,13 +13417,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_564: # %cond.load145 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 38 ; CHECK-RV64-NEXT: li a3, 37 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13580,13 +13433,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_565: # %cond.load149 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 39 ; CHECK-RV64-NEXT: li a3, 38 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13596,13 +13449,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_566: # %cond.load153 ; CHECK-RV64-NEXT: lbu 
a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 40 ; CHECK-RV64-NEXT: li a3, 39 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13612,13 +13465,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_567: # %cond.load157 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 41 ; CHECK-RV64-NEXT: li a3, 40 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13628,13 +13481,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_568: # %cond.load161 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 42 ; CHECK-RV64-NEXT: li a3, 41 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13644,13 +13497,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_569: # %cond.load165 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 43 ; CHECK-RV64-NEXT: li a3, 42 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13660,13 +13513,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_570: # %cond.load169 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 44 ; CHECK-RV64-NEXT: li a3, 43 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13676,13 +13529,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr 
%base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_571: # %cond.load173 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 45 ; CHECK-RV64-NEXT: li a3, 44 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13692,13 +13545,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_572: # %cond.load177 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 46 ; CHECK-RV64-NEXT: li a3, 45 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13708,13 +13561,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_573: # %cond.load181 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 47 ; CHECK-RV64-NEXT: li a3, 46 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13724,13 +13577,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_574: # %cond.load185 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 48 ; CHECK-RV64-NEXT: li a3, 47 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13740,13 +13593,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_575: # %cond.load189 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 49 ; CHECK-RV64-NEXT: li a3, 48 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: 
vmv8r.v v8, v16 @@ -13756,13 +13609,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_576: # %cond.load193 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 50 ; CHECK-RV64-NEXT: li a3, 49 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13772,13 +13625,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_577: # %cond.load197 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 51 ; CHECK-RV64-NEXT: li a3, 50 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13788,13 +13641,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_578: # %cond.load201 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 52 ; CHECK-RV64-NEXT: li a3, 51 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13804,13 +13657,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_579: # %cond.load205 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 53 ; CHECK-RV64-NEXT: li a3, 52 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13820,13 +13673,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_580: # %cond.load209 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 54 ; CHECK-RV64-NEXT: li a3, 53 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, 
v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13836,13 +13689,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_581: # %cond.load213 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 55 ; CHECK-RV64-NEXT: li a3, 54 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13852,13 +13705,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_582: # %cond.load217 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 56 ; CHECK-RV64-NEXT: li a3, 55 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13868,13 +13721,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_583: # %cond.load221 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 57 ; CHECK-RV64-NEXT: li a3, 56 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13884,13 +13737,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_584: # %cond.load225 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 58 ; CHECK-RV64-NEXT: li a3, 57 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13900,13 +13753,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_585: # %cond.load229 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 59 ; CHECK-RV64-NEXT: li a3, 58 ; CHECK-RV64-NEXT: vsetvli zero, a1, 
e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13916,13 +13769,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_586: # %cond.load233 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 60 ; CHECK-RV64-NEXT: li a3, 59 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13932,13 +13785,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_587: # %cond.load237 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: li a1, 61 ; CHECK-RV64-NEXT: li a3, 60 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13949,12 +13802,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: j .LBB61_63 ; CHECK-RV64-NEXT: .LBB61_588: # %cond.load249 ; CHECK-RV64-NEXT: lbu a2, 0(a0) -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vmv.s.x v9, a2 ; CHECK-RV64-NEXT: li a2, 64 ; CHECK-RV64-NEXT: li a3, 63 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m1, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13964,13 +13817,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_589: # %cond.load253 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 65 ; CHECK-RV64-NEXT: li a3, 64 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13980,13 +13833,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_590: # %cond.load257 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 66 ; CHECK-RV64-NEXT: li a3, 65 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, 
m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -13996,13 +13849,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_591: # %cond.load261 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 67 ; CHECK-RV64-NEXT: li a3, 66 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14012,13 +13865,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_592: # %cond.load265 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 68 ; CHECK-RV64-NEXT: li a3, 67 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14028,13 +13881,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_593: # %cond.load269 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 69 ; CHECK-RV64-NEXT: li a3, 68 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14044,13 +13897,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_594: # %cond.load273 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 70 ; CHECK-RV64-NEXT: li a3, 69 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14060,13 +13913,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_595: # %cond.load277 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x 
v10, a2 ; CHECK-RV64-NEXT: li a2, 71 ; CHECK-RV64-NEXT: li a3, 70 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14076,13 +13929,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_596: # %cond.load281 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 72 ; CHECK-RV64-NEXT: li a3, 71 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14092,13 +13945,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_597: # %cond.load285 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 73 ; CHECK-RV64-NEXT: li a3, 72 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14108,13 +13961,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_598: # %cond.load289 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 74 ; CHECK-RV64-NEXT: li a3, 73 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14124,13 +13977,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_599: # %cond.load293 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 75 ; CHECK-RV64-NEXT: li a3, 74 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14140,13 +13993,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_600: # %cond.load297 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; 
CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 76 ; CHECK-RV64-NEXT: li a3, 75 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14156,13 +14009,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_601: # %cond.load301 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 77 ; CHECK-RV64-NEXT: li a3, 76 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14172,13 +14025,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_602: # %cond.load305 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 78 ; CHECK-RV64-NEXT: li a3, 77 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14188,13 +14041,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_603: # %cond.load309 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 79 ; CHECK-RV64-NEXT: li a3, 78 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14204,13 +14057,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_604: # %cond.load313 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 80 ; CHECK-RV64-NEXT: li a3, 79 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14220,13 +14073,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_605: # %cond.load317 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; 
CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 81 ; CHECK-RV64-NEXT: li a3, 80 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14236,13 +14089,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_606: # %cond.load321 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 82 ; CHECK-RV64-NEXT: li a3, 81 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14252,13 +14105,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_607: # %cond.load325 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 83 ; CHECK-RV64-NEXT: li a3, 82 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14268,13 +14121,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_608: # %cond.load329 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 84 ; CHECK-RV64-NEXT: li a3, 83 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14284,13 +14137,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_609: # %cond.load333 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 85 ; CHECK-RV64-NEXT: li a3, 84 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14300,13 +14153,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr 
%base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_610: # %cond.load337 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 86 ; CHECK-RV64-NEXT: li a3, 85 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14316,13 +14169,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_611: # %cond.load341 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 87 ; CHECK-RV64-NEXT: li a3, 86 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14332,13 +14185,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_612: # %cond.load345 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 88 ; CHECK-RV64-NEXT: li a3, 87 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14348,13 +14201,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_613: # %cond.load349 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 89 ; CHECK-RV64-NEXT: li a3, 88 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14364,13 +14217,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_614: # %cond.load353 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 90 ; CHECK-RV64-NEXT: li a3, 89 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; 
CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14380,13 +14233,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_615: # %cond.load357 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 91 ; CHECK-RV64-NEXT: li a3, 90 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14396,13 +14249,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_616: # %cond.load361 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 92 ; CHECK-RV64-NEXT: li a3, 91 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14412,13 +14265,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_617: # %cond.load365 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 93 ; CHECK-RV64-NEXT: li a3, 92 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14428,13 +14281,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_618: # %cond.load369 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 94 ; CHECK-RV64-NEXT: li a3, 93 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14444,13 +14297,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_619: # %cond.load373 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 95 ; CHECK-RV64-NEXT: li a3, 94 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; 
CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14460,13 +14313,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_620: # %cond.load377 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 96 ; CHECK-RV64-NEXT: li a3, 95 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14476,13 +14329,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_621: # %cond.load381 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 97 ; CHECK-RV64-NEXT: li a3, 96 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14492,13 +14345,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_622: # %cond.load385 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 98 ; CHECK-RV64-NEXT: li a3, 97 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14508,13 +14361,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_623: # %cond.load389 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 99 ; CHECK-RV64-NEXT: li a3, 98 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14524,13 +14377,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_624: # %cond.load393 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 100 ; CHECK-RV64-NEXT: li 
a3, 99 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14540,13 +14393,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_625: # %cond.load397 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 101 ; CHECK-RV64-NEXT: li a3, 100 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14556,13 +14409,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_626: # %cond.load401 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 102 ; CHECK-RV64-NEXT: li a3, 101 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14572,13 +14425,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_627: # %cond.load405 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 103 ; CHECK-RV64-NEXT: li a3, 102 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14588,13 +14441,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_628: # %cond.load409 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 104 ; CHECK-RV64-NEXT: li a3, 103 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14604,13 +14457,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_629: # %cond.load413 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: 
vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 105 ; CHECK-RV64-NEXT: li a3, 104 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14620,13 +14473,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_630: # %cond.load417 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 106 ; CHECK-RV64-NEXT: li a3, 105 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14636,13 +14489,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_631: # %cond.load421 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 107 ; CHECK-RV64-NEXT: li a3, 106 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14652,13 +14505,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_632: # %cond.load425 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 108 ; CHECK-RV64-NEXT: li a3, 107 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14668,13 +14521,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_633: # %cond.load429 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 109 ; CHECK-RV64-NEXT: li a3, 108 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14684,13 +14537,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_634: # %cond.load433 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli 
zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 110 ; CHECK-RV64-NEXT: li a3, 109 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14700,13 +14553,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_635: # %cond.load437 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 111 ; CHECK-RV64-NEXT: li a3, 110 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14716,13 +14569,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_636: # %cond.load441 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 112 ; CHECK-RV64-NEXT: li a3, 111 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14732,13 +14585,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_637: # %cond.load445 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 113 ; CHECK-RV64-NEXT: li a3, 112 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14748,13 +14601,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_638: # %cond.load449 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 114 ; CHECK-RV64-NEXT: li a3, 113 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14764,13 +14617,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: 
.LBB61_639: # %cond.load453 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 115 ; CHECK-RV64-NEXT: li a3, 114 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14780,13 +14633,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_640: # %cond.load457 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 116 ; CHECK-RV64-NEXT: li a3, 115 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14796,13 +14649,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_641: # %cond.load461 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 117 ; CHECK-RV64-NEXT: li a3, 116 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14812,13 +14665,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_642: # %cond.load465 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 118 ; CHECK-RV64-NEXT: li a3, 117 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14828,13 +14681,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_643: # %cond.load469 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 119 ; CHECK-RV64-NEXT: li a3, 118 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ 
-14844,13 +14697,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_644: # %cond.load473 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 120 ; CHECK-RV64-NEXT: li a3, 119 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14860,13 +14713,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_645: # %cond.load477 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 121 ; CHECK-RV64-NEXT: li a3, 120 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14876,13 +14729,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_646: # %cond.load481 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 122 ; CHECK-RV64-NEXT: li a3, 121 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14892,13 +14745,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_647: # %cond.load485 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 123 ; CHECK-RV64-NEXT: li a3, 122 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14908,13 +14761,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_648: # %cond.load489 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 124 ; CHECK-RV64-NEXT: li a3, 123 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, 
v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14924,13 +14777,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_649: # %cond.load493 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 -; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v24, a2 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v10, a2 ; CHECK-RV64-NEXT: li a2, 125 ; CHECK-RV64-NEXT: li a3, 124 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14941,12 +14794,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: j .LBB61_129 ; CHECK-RV64-NEXT: .LBB61_650: # %cond.load505 ; CHECK-RV64-NEXT: lbu a1, 0(a0) -; CHECK-RV64-NEXT: vmv.s.x v24, a1 ; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vmv.s.x v10, a1 ; CHECK-RV64-NEXT: li a1, 128 ; CHECK-RV64-NEXT: li a3, 127 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 @@ -14956,976 +14809,976 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: .LBB61_651: # %cond.load509 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 129 ; CHECK-RV64-NEXT: li a3, 128 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 2 ; CHECK-RV64-NEXT: bnez a1, .LBB61_652 ; CHECK-RV64-NEXT: j .LBB61_134 ; CHECK-RV64-NEXT: .LBB61_652: # %cond.load513 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 130 ; CHECK-RV64-NEXT: li a3, 129 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 4 ; CHECK-RV64-NEXT: bnez a1, .LBB61_653 ; CHECK-RV64-NEXT: j .LBB61_135 ; CHECK-RV64-NEXT: .LBB61_653: # %cond.load517 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 131 ; CHECK-RV64-NEXT: li a3, 130 ; CHECK-RV64-NEXT: 
vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 8 ; CHECK-RV64-NEXT: bnez a1, .LBB61_654 ; CHECK-RV64-NEXT: j .LBB61_136 ; CHECK-RV64-NEXT: .LBB61_654: # %cond.load521 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 132 ; CHECK-RV64-NEXT: li a3, 131 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 16 ; CHECK-RV64-NEXT: bnez a1, .LBB61_655 ; CHECK-RV64-NEXT: j .LBB61_137 ; CHECK-RV64-NEXT: .LBB61_655: # %cond.load525 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 133 ; CHECK-RV64-NEXT: li a3, 132 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 32 ; CHECK-RV64-NEXT: bnez a1, .LBB61_656 ; CHECK-RV64-NEXT: j .LBB61_138 ; CHECK-RV64-NEXT: .LBB61_656: # %cond.load529 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 134 ; CHECK-RV64-NEXT: li a3, 133 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 64 ; CHECK-RV64-NEXT: bnez a1, .LBB61_657 ; CHECK-RV64-NEXT: j .LBB61_139 ; CHECK-RV64-NEXT: .LBB61_657: # %cond.load533 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 135 ; CHECK-RV64-NEXT: li a3, 134 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 128 ; CHECK-RV64-NEXT: bnez a1, .LBB61_658 ; CHECK-RV64-NEXT: j 
.LBB61_140 ; CHECK-RV64-NEXT: .LBB61_658: # %cond.load537 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 136 ; CHECK-RV64-NEXT: li a3, 135 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 256 ; CHECK-RV64-NEXT: bnez a1, .LBB61_659 ; CHECK-RV64-NEXT: j .LBB61_141 ; CHECK-RV64-NEXT: .LBB61_659: # %cond.load541 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 137 ; CHECK-RV64-NEXT: li a3, 136 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 512 ; CHECK-RV64-NEXT: bnez a1, .LBB61_660 ; CHECK-RV64-NEXT: j .LBB61_142 ; CHECK-RV64-NEXT: .LBB61_660: # %cond.load545 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 138 ; CHECK-RV64-NEXT: li a3, 137 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 1024 ; CHECK-RV64-NEXT: bnez a1, .LBB61_661 ; CHECK-RV64-NEXT: j .LBB61_143 ; CHECK-RV64-NEXT: .LBB61_661: # %cond.load549 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 139 ; CHECK-RV64-NEXT: li a3, 138 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 52 ; CHECK-RV64-NEXT: bltz a1, .LBB61_662 ; CHECK-RV64-NEXT: j .LBB61_144 ; CHECK-RV64-NEXT: .LBB61_662: # %cond.load553 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 140 ; CHECK-RV64-NEXT: li a3, 139 ; 
CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 51 ; CHECK-RV64-NEXT: bltz a1, .LBB61_663 ; CHECK-RV64-NEXT: j .LBB61_145 ; CHECK-RV64-NEXT: .LBB61_663: # %cond.load557 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 141 ; CHECK-RV64-NEXT: li a3, 140 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 50 ; CHECK-RV64-NEXT: bltz a1, .LBB61_664 ; CHECK-RV64-NEXT: j .LBB61_146 ; CHECK-RV64-NEXT: .LBB61_664: # %cond.load561 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 142 ; CHECK-RV64-NEXT: li a3, 141 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 49 ; CHECK-RV64-NEXT: bltz a1, .LBB61_665 ; CHECK-RV64-NEXT: j .LBB61_147 ; CHECK-RV64-NEXT: .LBB61_665: # %cond.load565 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 143 ; CHECK-RV64-NEXT: li a3, 142 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 48 ; CHECK-RV64-NEXT: bltz a1, .LBB61_666 ; CHECK-RV64-NEXT: j .LBB61_148 ; CHECK-RV64-NEXT: .LBB61_666: # %cond.load569 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 144 ; CHECK-RV64-NEXT: li a3, 143 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 47 ; CHECK-RV64-NEXT: bltz a1, .LBB61_667 ; 
CHECK-RV64-NEXT: j .LBB61_149 ; CHECK-RV64-NEXT: .LBB61_667: # %cond.load573 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 145 ; CHECK-RV64-NEXT: li a3, 144 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 46 ; CHECK-RV64-NEXT: bltz a1, .LBB61_668 ; CHECK-RV64-NEXT: j .LBB61_150 ; CHECK-RV64-NEXT: .LBB61_668: # %cond.load577 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 146 ; CHECK-RV64-NEXT: li a3, 145 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 45 ; CHECK-RV64-NEXT: bltz a1, .LBB61_669 ; CHECK-RV64-NEXT: j .LBB61_151 ; CHECK-RV64-NEXT: .LBB61_669: # %cond.load581 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 147 ; CHECK-RV64-NEXT: li a3, 146 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 44 ; CHECK-RV64-NEXT: bltz a1, .LBB61_670 ; CHECK-RV64-NEXT: j .LBB61_152 ; CHECK-RV64-NEXT: .LBB61_670: # %cond.load585 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 148 ; CHECK-RV64-NEXT: li a3, 147 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 43 ; CHECK-RV64-NEXT: bltz a1, .LBB61_671 ; CHECK-RV64-NEXT: j .LBB61_153 ; CHECK-RV64-NEXT: .LBB61_671: # %cond.load589 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 149 ; CHECK-RV64-NEXT: li a3, 
148 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 42 ; CHECK-RV64-NEXT: bltz a1, .LBB61_672 ; CHECK-RV64-NEXT: j .LBB61_154 ; CHECK-RV64-NEXT: .LBB61_672: # %cond.load593 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 150 ; CHECK-RV64-NEXT: li a3, 149 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 41 ; CHECK-RV64-NEXT: bltz a1, .LBB61_673 ; CHECK-RV64-NEXT: j .LBB61_155 ; CHECK-RV64-NEXT: .LBB61_673: # %cond.load597 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 151 ; CHECK-RV64-NEXT: li a3, 150 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 40 ; CHECK-RV64-NEXT: bltz a1, .LBB61_674 ; CHECK-RV64-NEXT: j .LBB61_156 ; CHECK-RV64-NEXT: .LBB61_674: # %cond.load601 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 152 ; CHECK-RV64-NEXT: li a3, 151 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 39 ; CHECK-RV64-NEXT: bltz a1, .LBB61_675 ; CHECK-RV64-NEXT: j .LBB61_157 ; CHECK-RV64-NEXT: .LBB61_675: # %cond.load605 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 153 ; CHECK-RV64-NEXT: li a3, 152 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 38 ; CHECK-RV64-NEXT: bltz a1, 
.LBB61_676 ; CHECK-RV64-NEXT: j .LBB61_158 ; CHECK-RV64-NEXT: .LBB61_676: # %cond.load609 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 154 ; CHECK-RV64-NEXT: li a3, 153 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 37 ; CHECK-RV64-NEXT: bltz a1, .LBB61_677 ; CHECK-RV64-NEXT: j .LBB61_159 ; CHECK-RV64-NEXT: .LBB61_677: # %cond.load613 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 155 ; CHECK-RV64-NEXT: li a3, 154 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 36 ; CHECK-RV64-NEXT: bltz a1, .LBB61_678 ; CHECK-RV64-NEXT: j .LBB61_160 ; CHECK-RV64-NEXT: .LBB61_678: # %cond.load617 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 156 ; CHECK-RV64-NEXT: li a3, 155 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 35 ; CHECK-RV64-NEXT: bltz a1, .LBB61_679 ; CHECK-RV64-NEXT: j .LBB61_161 ; CHECK-RV64-NEXT: .LBB61_679: # %cond.load621 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 157 ; CHECK-RV64-NEXT: li a3, 156 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 34 ; CHECK-RV64-NEXT: bltz a1, .LBB61_680 ; CHECK-RV64-NEXT: j .LBB61_162 ; CHECK-RV64-NEXT: .LBB61_680: # %cond.load625 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 158 ; 
CHECK-RV64-NEXT: li a3, 157 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 33 ; CHECK-RV64-NEXT: bltz a1, .LBB61_681 ; CHECK-RV64-NEXT: j .LBB61_163 ; CHECK-RV64-NEXT: .LBB61_681: # %cond.load629 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 159 ; CHECK-RV64-NEXT: li a3, 158 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 32 ; CHECK-RV64-NEXT: bltz a1, .LBB61_682 ; CHECK-RV64-NEXT: j .LBB61_164 ; CHECK-RV64-NEXT: .LBB61_682: # %cond.load633 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 160 ; CHECK-RV64-NEXT: li a3, 159 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 31 ; CHECK-RV64-NEXT: bltz a1, .LBB61_683 ; CHECK-RV64-NEXT: j .LBB61_165 ; CHECK-RV64-NEXT: .LBB61_683: # %cond.load637 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 161 ; CHECK-RV64-NEXT: li a3, 160 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 30 ; CHECK-RV64-NEXT: bltz a1, .LBB61_684 ; CHECK-RV64-NEXT: j .LBB61_166 ; CHECK-RV64-NEXT: .LBB61_684: # %cond.load641 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 162 ; CHECK-RV64-NEXT: li a3, 161 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 29 ; 
CHECK-RV64-NEXT: bltz a1, .LBB61_685 ; CHECK-RV64-NEXT: j .LBB61_167 ; CHECK-RV64-NEXT: .LBB61_685: # %cond.load645 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 163 ; CHECK-RV64-NEXT: li a3, 162 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 28 ; CHECK-RV64-NEXT: bltz a1, .LBB61_686 ; CHECK-RV64-NEXT: j .LBB61_168 ; CHECK-RV64-NEXT: .LBB61_686: # %cond.load649 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 164 ; CHECK-RV64-NEXT: li a3, 163 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 27 ; CHECK-RV64-NEXT: bltz a1, .LBB61_687 ; CHECK-RV64-NEXT: j .LBB61_169 ; CHECK-RV64-NEXT: .LBB61_687: # %cond.load653 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 165 ; CHECK-RV64-NEXT: li a3, 164 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 26 ; CHECK-RV64-NEXT: bltz a1, .LBB61_688 ; CHECK-RV64-NEXT: j .LBB61_170 ; CHECK-RV64-NEXT: .LBB61_688: # %cond.load657 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 166 ; CHECK-RV64-NEXT: li a3, 165 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 25 ; CHECK-RV64-NEXT: bltz a1, .LBB61_689 ; CHECK-RV64-NEXT: j .LBB61_171 ; CHECK-RV64-NEXT: .LBB61_689: # %cond.load661 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: 
li a1, 167 ; CHECK-RV64-NEXT: li a3, 166 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 24 ; CHECK-RV64-NEXT: bltz a1, .LBB61_690 ; CHECK-RV64-NEXT: j .LBB61_172 ; CHECK-RV64-NEXT: .LBB61_690: # %cond.load665 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 168 ; CHECK-RV64-NEXT: li a3, 167 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 23 ; CHECK-RV64-NEXT: bltz a1, .LBB61_691 ; CHECK-RV64-NEXT: j .LBB61_173 ; CHECK-RV64-NEXT: .LBB61_691: # %cond.load669 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 169 ; CHECK-RV64-NEXT: li a3, 168 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 22 ; CHECK-RV64-NEXT: bltz a1, .LBB61_692 ; CHECK-RV64-NEXT: j .LBB61_174 ; CHECK-RV64-NEXT: .LBB61_692: # %cond.load673 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 170 ; CHECK-RV64-NEXT: li a3, 169 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 21 ; CHECK-RV64-NEXT: bltz a1, .LBB61_693 ; CHECK-RV64-NEXT: j .LBB61_175 ; CHECK-RV64-NEXT: .LBB61_693: # %cond.load677 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 171 ; CHECK-RV64-NEXT: li a3, 170 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 20 
; CHECK-RV64-NEXT: bltz a1, .LBB61_694 ; CHECK-RV64-NEXT: j .LBB61_176 ; CHECK-RV64-NEXT: .LBB61_694: # %cond.load681 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 172 ; CHECK-RV64-NEXT: li a3, 171 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 19 ; CHECK-RV64-NEXT: bltz a1, .LBB61_695 ; CHECK-RV64-NEXT: j .LBB61_177 ; CHECK-RV64-NEXT: .LBB61_695: # %cond.load685 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 173 ; CHECK-RV64-NEXT: li a3, 172 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 18 ; CHECK-RV64-NEXT: bltz a1, .LBB61_696 ; CHECK-RV64-NEXT: j .LBB61_178 ; CHECK-RV64-NEXT: .LBB61_696: # %cond.load689 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 174 ; CHECK-RV64-NEXT: li a3, 173 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 17 ; CHECK-RV64-NEXT: bltz a1, .LBB61_697 ; CHECK-RV64-NEXT: j .LBB61_179 ; CHECK-RV64-NEXT: .LBB61_697: # %cond.load693 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 175 ; CHECK-RV64-NEXT: li a3, 174 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 16 ; CHECK-RV64-NEXT: bltz a1, .LBB61_698 ; CHECK-RV64-NEXT: j .LBB61_180 ; CHECK-RV64-NEXT: .LBB61_698: # %cond.load697 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; 
CHECK-RV64-NEXT: li a1, 176 ; CHECK-RV64-NEXT: li a3, 175 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 15 ; CHECK-RV64-NEXT: bltz a1, .LBB61_699 ; CHECK-RV64-NEXT: j .LBB61_181 ; CHECK-RV64-NEXT: .LBB61_699: # %cond.load701 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 177 ; CHECK-RV64-NEXT: li a3, 176 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 14 ; CHECK-RV64-NEXT: bltz a1, .LBB61_700 ; CHECK-RV64-NEXT: j .LBB61_182 ; CHECK-RV64-NEXT: .LBB61_700: # %cond.load705 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 178 ; CHECK-RV64-NEXT: li a3, 177 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 13 ; CHECK-RV64-NEXT: bltz a1, .LBB61_701 ; CHECK-RV64-NEXT: j .LBB61_183 ; CHECK-RV64-NEXT: .LBB61_701: # %cond.load709 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 179 ; CHECK-RV64-NEXT: li a3, 178 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 12 ; CHECK-RV64-NEXT: bltz a1, .LBB61_702 ; CHECK-RV64-NEXT: j .LBB61_184 ; CHECK-RV64-NEXT: .LBB61_702: # %cond.load713 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 180 ; CHECK-RV64-NEXT: li a3, 179 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: 
slli a1, a2, 11 ; CHECK-RV64-NEXT: bltz a1, .LBB61_703 ; CHECK-RV64-NEXT: j .LBB61_185 ; CHECK-RV64-NEXT: .LBB61_703: # %cond.load717 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 181 ; CHECK-RV64-NEXT: li a3, 180 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 10 ; CHECK-RV64-NEXT: bltz a1, .LBB61_704 ; CHECK-RV64-NEXT: j .LBB61_186 ; CHECK-RV64-NEXT: .LBB61_704: # %cond.load721 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 182 ; CHECK-RV64-NEXT: li a3, 181 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 9 ; CHECK-RV64-NEXT: bltz a1, .LBB61_705 ; CHECK-RV64-NEXT: j .LBB61_187 ; CHECK-RV64-NEXT: .LBB61_705: # %cond.load725 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 183 ; CHECK-RV64-NEXT: li a3, 182 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 8 ; CHECK-RV64-NEXT: bltz a1, .LBB61_706 ; CHECK-RV64-NEXT: j .LBB61_188 ; CHECK-RV64-NEXT: .LBB61_706: # %cond.load729 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 184 ; CHECK-RV64-NEXT: li a3, 183 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 7 ; CHECK-RV64-NEXT: bltz a1, .LBB61_707 ; CHECK-RV64-NEXT: j .LBB61_189 ; CHECK-RV64-NEXT: .LBB61_707: # %cond.load733 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; 
CHECK-RV64-NEXT: li a1, 185 ; CHECK-RV64-NEXT: li a3, 184 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 6 ; CHECK-RV64-NEXT: bltz a1, .LBB61_708 ; CHECK-RV64-NEXT: j .LBB61_190 ; CHECK-RV64-NEXT: .LBB61_708: # %cond.load737 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 186 ; CHECK-RV64-NEXT: li a3, 185 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 5 ; CHECK-RV64-NEXT: bltz a1, .LBB61_709 ; CHECK-RV64-NEXT: j .LBB61_191 ; CHECK-RV64-NEXT: .LBB61_709: # %cond.load741 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 187 ; CHECK-RV64-NEXT: li a3, 186 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 4 ; CHECK-RV64-NEXT: bltz a1, .LBB61_710 ; CHECK-RV64-NEXT: j .LBB61_192 ; CHECK-RV64-NEXT: .LBB61_710: # %cond.load745 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 188 ; CHECK-RV64-NEXT: li a3, 187 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 3 ; CHECK-RV64-NEXT: bltz a1, .LBB61_711 ; CHECK-RV64-NEXT: j .LBB61_193 ; CHECK-RV64-NEXT: .LBB61_711: # %cond.load749 ; CHECK-RV64-NEXT: lbu a1, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 189 ; CHECK-RV64-NEXT: li a3, 188 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: 
slli a1, a2, 2 ; CHECK-RV64-NEXT: bgez a1, .LBB61_1027 ; CHECK-RV64-NEXT: j .LBB61_194 @@ -15933,991 +15786,991 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: j .LBB61_195 ; CHECK-RV64-NEXT: .LBB61_712: # %cond.load761 ; CHECK-RV64-NEXT: lbu a2, 0(a0) -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 192 ; CHECK-RV64-NEXT: li a3, 191 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 1 ; CHECK-RV64-NEXT: bnez a2, .LBB61_713 ; CHECK-RV64-NEXT: j .LBB61_199 ; CHECK-RV64-NEXT: .LBB61_713: # %cond.load765 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 193 ; CHECK-RV64-NEXT: li a3, 192 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 2 ; CHECK-RV64-NEXT: bnez a2, .LBB61_714 ; CHECK-RV64-NEXT: j .LBB61_200 ; CHECK-RV64-NEXT: .LBB61_714: # %cond.load769 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 194 ; CHECK-RV64-NEXT: li a3, 193 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 4 ; CHECK-RV64-NEXT: bnez a2, .LBB61_715 ; CHECK-RV64-NEXT: j .LBB61_201 ; CHECK-RV64-NEXT: .LBB61_715: # %cond.load773 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 195 ; CHECK-RV64-NEXT: li a3, 194 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 8 ; CHECK-RV64-NEXT: bnez a2, .LBB61_716 ; CHECK-RV64-NEXT: j .LBB61_202 ; CHECK-RV64-NEXT: .LBB61_716: # %cond.load777 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; 
CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 196 ; CHECK-RV64-NEXT: li a3, 195 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 16 ; CHECK-RV64-NEXT: bnez a2, .LBB61_717 ; CHECK-RV64-NEXT: j .LBB61_203 ; CHECK-RV64-NEXT: .LBB61_717: # %cond.load781 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 197 ; CHECK-RV64-NEXT: li a3, 196 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 32 ; CHECK-RV64-NEXT: bnez a2, .LBB61_718 ; CHECK-RV64-NEXT: j .LBB61_204 ; CHECK-RV64-NEXT: .LBB61_718: # %cond.load785 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 198 ; CHECK-RV64-NEXT: li a3, 197 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 64 ; CHECK-RV64-NEXT: bnez a2, .LBB61_719 ; CHECK-RV64-NEXT: j .LBB61_205 ; CHECK-RV64-NEXT: .LBB61_719: # %cond.load789 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 199 ; CHECK-RV64-NEXT: li a3, 198 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 128 ; CHECK-RV64-NEXT: bnez a2, .LBB61_720 ; CHECK-RV64-NEXT: j .LBB61_206 ; CHECK-RV64-NEXT: .LBB61_720: # %cond.load793 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 200 ; CHECK-RV64-NEXT: li a3, 199 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; 
CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 256 ; CHECK-RV64-NEXT: bnez a2, .LBB61_721 ; CHECK-RV64-NEXT: j .LBB61_207 ; CHECK-RV64-NEXT: .LBB61_721: # %cond.load797 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 201 ; CHECK-RV64-NEXT: li a3, 200 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 512 ; CHECK-RV64-NEXT: bnez a2, .LBB61_722 ; CHECK-RV64-NEXT: j .LBB61_208 ; CHECK-RV64-NEXT: .LBB61_722: # %cond.load801 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 202 ; CHECK-RV64-NEXT: li a3, 201 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 1024 ; CHECK-RV64-NEXT: bnez a2, .LBB61_723 ; CHECK-RV64-NEXT: j .LBB61_209 ; CHECK-RV64-NEXT: .LBB61_723: # %cond.load805 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 203 ; CHECK-RV64-NEXT: li a3, 202 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 52 ; CHECK-RV64-NEXT: bltz a2, .LBB61_724 ; CHECK-RV64-NEXT: j .LBB61_210 ; CHECK-RV64-NEXT: .LBB61_724: # %cond.load809 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 204 ; CHECK-RV64-NEXT: li a3, 203 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 51 ; CHECK-RV64-NEXT: bltz a2, .LBB61_725 ; CHECK-RV64-NEXT: j .LBB61_211 ; CHECK-RV64-NEXT: .LBB61_725: # %cond.load813 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; 
CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 205 ; CHECK-RV64-NEXT: li a3, 204 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 50 ; CHECK-RV64-NEXT: bltz a2, .LBB61_726 ; CHECK-RV64-NEXT: j .LBB61_212 ; CHECK-RV64-NEXT: .LBB61_726: # %cond.load817 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 206 ; CHECK-RV64-NEXT: li a3, 205 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 49 ; CHECK-RV64-NEXT: bltz a2, .LBB61_727 ; CHECK-RV64-NEXT: j .LBB61_213 ; CHECK-RV64-NEXT: .LBB61_727: # %cond.load821 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 207 ; CHECK-RV64-NEXT: li a3, 206 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 48 ; CHECK-RV64-NEXT: bltz a2, .LBB61_728 ; CHECK-RV64-NEXT: j .LBB61_214 ; CHECK-RV64-NEXT: .LBB61_728: # %cond.load825 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 208 ; CHECK-RV64-NEXT: li a3, 207 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 47 ; CHECK-RV64-NEXT: bltz a2, .LBB61_729 ; CHECK-RV64-NEXT: j .LBB61_215 ; CHECK-RV64-NEXT: .LBB61_729: # %cond.load829 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 209 ; CHECK-RV64-NEXT: li a3, 208 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: 
vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 46 ; CHECK-RV64-NEXT: bltz a2, .LBB61_730 ; CHECK-RV64-NEXT: j .LBB61_216 ; CHECK-RV64-NEXT: .LBB61_730: # %cond.load833 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 210 ; CHECK-RV64-NEXT: li a3, 209 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 45 ; CHECK-RV64-NEXT: bltz a2, .LBB61_731 ; CHECK-RV64-NEXT: j .LBB61_217 ; CHECK-RV64-NEXT: .LBB61_731: # %cond.load837 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 211 ; CHECK-RV64-NEXT: li a3, 210 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 44 ; CHECK-RV64-NEXT: bltz a2, .LBB61_732 ; CHECK-RV64-NEXT: j .LBB61_218 ; CHECK-RV64-NEXT: .LBB61_732: # %cond.load841 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 212 ; CHECK-RV64-NEXT: li a3, 211 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 43 ; CHECK-RV64-NEXT: bltz a2, .LBB61_733 ; CHECK-RV64-NEXT: j .LBB61_219 ; CHECK-RV64-NEXT: .LBB61_733: # %cond.load845 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 213 ; CHECK-RV64-NEXT: li a3, 212 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 42 ; CHECK-RV64-NEXT: bltz a2, .LBB61_734 ; CHECK-RV64-NEXT: j .LBB61_220 ; CHECK-RV64-NEXT: .LBB61_734: # %cond.load849 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; 
CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 214 ; CHECK-RV64-NEXT: li a3, 213 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 41 ; CHECK-RV64-NEXT: bltz a2, .LBB61_735 ; CHECK-RV64-NEXT: j .LBB61_221 ; CHECK-RV64-NEXT: .LBB61_735: # %cond.load853 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 215 ; CHECK-RV64-NEXT: li a3, 214 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 40 ; CHECK-RV64-NEXT: bltz a2, .LBB61_736 ; CHECK-RV64-NEXT: j .LBB61_222 ; CHECK-RV64-NEXT: .LBB61_736: # %cond.load857 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 216 ; CHECK-RV64-NEXT: li a3, 215 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 39 ; CHECK-RV64-NEXT: bltz a2, .LBB61_737 ; CHECK-RV64-NEXT: j .LBB61_223 ; CHECK-RV64-NEXT: .LBB61_737: # %cond.load861 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 217 ; CHECK-RV64-NEXT: li a3, 216 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 38 ; CHECK-RV64-NEXT: bltz a2, .LBB61_738 ; CHECK-RV64-NEXT: j .LBB61_224 ; CHECK-RV64-NEXT: .LBB61_738: # %cond.load865 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 218 ; CHECK-RV64-NEXT: li a3, 217 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: 
vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 37 ; CHECK-RV64-NEXT: bltz a2, .LBB61_739 ; CHECK-RV64-NEXT: j .LBB61_225 ; CHECK-RV64-NEXT: .LBB61_739: # %cond.load869 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 219 ; CHECK-RV64-NEXT: li a3, 218 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 36 ; CHECK-RV64-NEXT: bltz a2, .LBB61_740 ; CHECK-RV64-NEXT: j .LBB61_226 ; CHECK-RV64-NEXT: .LBB61_740: # %cond.load873 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 220 ; CHECK-RV64-NEXT: li a3, 219 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 35 ; CHECK-RV64-NEXT: bltz a2, .LBB61_741 ; CHECK-RV64-NEXT: j .LBB61_227 ; CHECK-RV64-NEXT: .LBB61_741: # %cond.load877 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 221 ; CHECK-RV64-NEXT: li a3, 220 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 34 ; CHECK-RV64-NEXT: bltz a2, .LBB61_742 ; CHECK-RV64-NEXT: j .LBB61_228 ; CHECK-RV64-NEXT: .LBB61_742: # %cond.load881 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 222 ; CHECK-RV64-NEXT: li a3, 221 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 33 ; CHECK-RV64-NEXT: bltz a2, .LBB61_743 ; CHECK-RV64-NEXT: j .LBB61_229 ; CHECK-RV64-NEXT: .LBB61_743: # %cond.load885 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; 
CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 223 ; CHECK-RV64-NEXT: li a3, 222 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 32 ; CHECK-RV64-NEXT: bltz a2, .LBB61_744 ; CHECK-RV64-NEXT: j .LBB61_230 ; CHECK-RV64-NEXT: .LBB61_744: # %cond.load889 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 224 ; CHECK-RV64-NEXT: li a3, 223 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 31 ; CHECK-RV64-NEXT: bltz a2, .LBB61_745 ; CHECK-RV64-NEXT: j .LBB61_231 ; CHECK-RV64-NEXT: .LBB61_745: # %cond.load893 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 225 ; CHECK-RV64-NEXT: li a3, 224 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 30 ; CHECK-RV64-NEXT: bltz a2, .LBB61_746 ; CHECK-RV64-NEXT: j .LBB61_232 ; CHECK-RV64-NEXT: .LBB61_746: # %cond.load897 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 226 ; CHECK-RV64-NEXT: li a3, 225 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 29 ; CHECK-RV64-NEXT: bltz a2, .LBB61_747 ; CHECK-RV64-NEXT: j .LBB61_233 ; CHECK-RV64-NEXT: .LBB61_747: # %cond.load901 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 227 ; CHECK-RV64-NEXT: li a3, 226 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: 
vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 28 ; CHECK-RV64-NEXT: bltz a2, .LBB61_748 ; CHECK-RV64-NEXT: j .LBB61_234 ; CHECK-RV64-NEXT: .LBB61_748: # %cond.load905 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 228 ; CHECK-RV64-NEXT: li a3, 227 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 27 ; CHECK-RV64-NEXT: bltz a2, .LBB61_749 ; CHECK-RV64-NEXT: j .LBB61_235 ; CHECK-RV64-NEXT: .LBB61_749: # %cond.load909 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 229 ; CHECK-RV64-NEXT: li a3, 228 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 26 ; CHECK-RV64-NEXT: bltz a2, .LBB61_750 ; CHECK-RV64-NEXT: j .LBB61_236 ; CHECK-RV64-NEXT: .LBB61_750: # %cond.load913 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 230 ; CHECK-RV64-NEXT: li a3, 229 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 25 ; CHECK-RV64-NEXT: bltz a2, .LBB61_751 ; CHECK-RV64-NEXT: j .LBB61_237 ; CHECK-RV64-NEXT: .LBB61_751: # %cond.load917 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 231 ; CHECK-RV64-NEXT: li a3, 230 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 24 ; CHECK-RV64-NEXT: bltz a2, .LBB61_752 ; CHECK-RV64-NEXT: j .LBB61_238 ; CHECK-RV64-NEXT: .LBB61_752: # %cond.load921 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; 
CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 232 ; CHECK-RV64-NEXT: li a3, 231 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 23 ; CHECK-RV64-NEXT: bltz a2, .LBB61_753 ; CHECK-RV64-NEXT: j .LBB61_239 ; CHECK-RV64-NEXT: .LBB61_753: # %cond.load925 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 233 ; CHECK-RV64-NEXT: li a3, 232 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 22 ; CHECK-RV64-NEXT: bltz a2, .LBB61_754 ; CHECK-RV64-NEXT: j .LBB61_240 ; CHECK-RV64-NEXT: .LBB61_754: # %cond.load929 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 234 ; CHECK-RV64-NEXT: li a3, 233 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 21 ; CHECK-RV64-NEXT: bltz a2, .LBB61_755 ; CHECK-RV64-NEXT: j .LBB61_241 ; CHECK-RV64-NEXT: .LBB61_755: # %cond.load933 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 235 ; CHECK-RV64-NEXT: li a3, 234 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 20 ; CHECK-RV64-NEXT: bltz a2, .LBB61_756 ; CHECK-RV64-NEXT: j .LBB61_242 ; CHECK-RV64-NEXT: .LBB61_756: # %cond.load937 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 236 ; CHECK-RV64-NEXT: li a3, 235 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: 
vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 19 ; CHECK-RV64-NEXT: bltz a2, .LBB61_757 ; CHECK-RV64-NEXT: j .LBB61_243 ; CHECK-RV64-NEXT: .LBB61_757: # %cond.load941 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 237 ; CHECK-RV64-NEXT: li a3, 236 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 18 ; CHECK-RV64-NEXT: bltz a2, .LBB61_758 ; CHECK-RV64-NEXT: j .LBB61_244 ; CHECK-RV64-NEXT: .LBB61_758: # %cond.load945 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 238 ; CHECK-RV64-NEXT: li a3, 237 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 17 ; CHECK-RV64-NEXT: bltz a2, .LBB61_759 ; CHECK-RV64-NEXT: j .LBB61_245 ; CHECK-RV64-NEXT: .LBB61_759: # %cond.load949 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 239 ; CHECK-RV64-NEXT: li a3, 238 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 16 ; CHECK-RV64-NEXT: bltz a2, .LBB61_760 ; CHECK-RV64-NEXT: j .LBB61_246 ; CHECK-RV64-NEXT: .LBB61_760: # %cond.load953 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 240 ; CHECK-RV64-NEXT: li a3, 239 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 15 ; CHECK-RV64-NEXT: bltz a2, .LBB61_761 ; CHECK-RV64-NEXT: j .LBB61_247 ; CHECK-RV64-NEXT: .LBB61_761: # %cond.load957 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; 
CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 241 ; CHECK-RV64-NEXT: li a3, 240 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 14 ; CHECK-RV64-NEXT: bltz a2, .LBB61_762 ; CHECK-RV64-NEXT: j .LBB61_248 ; CHECK-RV64-NEXT: .LBB61_762: # %cond.load961 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 242 ; CHECK-RV64-NEXT: li a3, 241 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 13 ; CHECK-RV64-NEXT: bltz a2, .LBB61_763 ; CHECK-RV64-NEXT: j .LBB61_249 ; CHECK-RV64-NEXT: .LBB61_763: # %cond.load965 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 243 ; CHECK-RV64-NEXT: li a3, 242 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 12 ; CHECK-RV64-NEXT: bltz a2, .LBB61_764 ; CHECK-RV64-NEXT: j .LBB61_250 ; CHECK-RV64-NEXT: .LBB61_764: # %cond.load969 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 244 ; CHECK-RV64-NEXT: li a3, 243 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 11 ; CHECK-RV64-NEXT: bltz a2, .LBB61_765 ; CHECK-RV64-NEXT: j .LBB61_251 ; CHECK-RV64-NEXT: .LBB61_765: # %cond.load973 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 245 ; CHECK-RV64-NEXT: li a3, 244 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: 
vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 10 ; CHECK-RV64-NEXT: bltz a2, .LBB61_766 ; CHECK-RV64-NEXT: j .LBB61_252 ; CHECK-RV64-NEXT: .LBB61_766: # %cond.load977 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 246 ; CHECK-RV64-NEXT: li a3, 245 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 9 ; CHECK-RV64-NEXT: bltz a2, .LBB61_767 ; CHECK-RV64-NEXT: j .LBB61_253 ; CHECK-RV64-NEXT: .LBB61_767: # %cond.load981 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 247 ; CHECK-RV64-NEXT: li a3, 246 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 8 ; CHECK-RV64-NEXT: bltz a2, .LBB61_768 ; CHECK-RV64-NEXT: j .LBB61_254 ; CHECK-RV64-NEXT: .LBB61_768: # %cond.load985 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 248 ; CHECK-RV64-NEXT: li a3, 247 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 7 ; CHECK-RV64-NEXT: bltz a2, .LBB61_769 ; CHECK-RV64-NEXT: j .LBB61_255 ; CHECK-RV64-NEXT: .LBB61_769: # %cond.load989 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 249 ; CHECK-RV64-NEXT: li a3, 248 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 6 ; CHECK-RV64-NEXT: bltz a2, .LBB61_770 ; CHECK-RV64-NEXT: j .LBB61_256 ; CHECK-RV64-NEXT: .LBB61_770: # %cond.load993 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; 
CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 250 ; CHECK-RV64-NEXT: li a3, 249 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 5 ; CHECK-RV64-NEXT: bltz a2, .LBB61_771 ; CHECK-RV64-NEXT: j .LBB61_257 ; CHECK-RV64-NEXT: .LBB61_771: # %cond.load997 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 251 ; CHECK-RV64-NEXT: li a3, 250 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 4 ; CHECK-RV64-NEXT: bltz a2, .LBB61_772 ; CHECK-RV64-NEXT: j .LBB61_258 ; CHECK-RV64-NEXT: .LBB61_772: # %cond.load1001 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 252 ; CHECK-RV64-NEXT: li a3, 251 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 3 ; CHECK-RV64-NEXT: bltz a2, .LBB61_773 ; CHECK-RV64-NEXT: j .LBB61_259 ; CHECK-RV64-NEXT: .LBB61_773: # %cond.load1005 ; CHECK-RV64-NEXT: lbu a2, 0(a0) ; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-RV64-NEXT: vmv.s.x v16, a2 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a2 ; CHECK-RV64-NEXT: li a2, 253 ; CHECK-RV64-NEXT: li a3, 252 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 2 ; CHECK-RV64-NEXT: bgez a2, .LBB61_1028 ; CHECK-RV64-NEXT: j .LBB61_260 @@ -16925,15 +16778,15 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: j .LBB61_261 ; CHECK-RV64-NEXT: .LBB61_774: # %cond.load1017 ; CHECK-RV64-NEXT: lbu a1, 0(a0) -; CHECK-RV64-NEXT: vmv.s.x v16, a1 -; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vmv.s.x v12, a1 ; CHECK-RV64-NEXT: li a1, 256 ; CHECK-RV64-NEXT: li a3, 255 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 ; CHECK-RV64-NEXT: addi a0, a0, 
1 -; CHECK-RV64-NEXT: vmv4r.v v24, v8 -; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 1 ; CHECK-RV64-NEXT: bnez a1, .LBB61_775 ; CHECK-RV64-NEXT: j .LBB61_265 diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll index 8e9751502460e..869478a1efa78 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll @@ -396,8 +396,8 @@ define @extract_nxv64i1_nxv2i1_2( %mask) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a0 @@ -421,8 +421,8 @@ define @extract_nxv4i1_nxv32i1_4( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll index 65f22370d729a..d60ce408278da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll @@ -1216,8 +1216,8 @@ define float @extractelt_fadd_nxv4f32_splat( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: lui a0, 263168 +; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: fmv.w.x fa4, a0 ; CHECK-NEXT: fadd.s fa0, fa5, fa4 ; CHECK-NEXT: ret @@ -1231,8 +1231,8 @@ define float @extractelt_fsub_nxv4f32_splat( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: lui a0, 263168 +; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: fmv.w.x fa4, a0 ; CHECK-NEXT: fsub.s fa0, fa4, fa5 ; CHECK-NEXT: ret @@ -1246,8 +1246,8 @@ define float @extractelt_fmul_nxv4f32_splat( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 3 -; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: lui a0, 263168 +; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: fmv.w.x fa4, a0 ; CHECK-NEXT: fmul.s fa0, fa5, fa4 ; CHECK-NEXT: ret @@ -1296,12 +1296,12 @@ define double @extractelt_nxv16f64_neg1( %v) { ; RV32-NEXT: sub sp, sp, a0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: addi a0, sp, 64 -; RV32-NEXT: vs8r.v v8, (a0) ; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: vs8r.v v8, (a0) ; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a2, a0, a2 ; RV32-NEXT: vs8r.v v16, (a2) -; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: fld fa0, -8(a0) ; RV32-NEXT: addi sp, s0, -80 @@ -1329,13 +1329,13 @@ define double @extractelt_nxv16f64_neg1( %v) { ; RV64-NEXT: sub sp, sp, a0 ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: addi a0, sp, 64 -; RV64-NEXT: vs8r.v v8, (a0) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a1, a2, 3 -; RV64-NEXT: add a3, a0, a1 ; RV64-NEXT: li a1, -1 +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: slli a3, a2, 3 ; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: slli a2, a2, 1 +; RV64-NEXT: add a3, a0, a3 ; RV64-NEXT: addi a2, a2, -1 ; RV64-NEXT: vs8r.v v16, 
(a3) ; RV64-NEXT: bltu a2, a1, .LBB70_2 @@ -1393,9 +1393,9 @@ define double @extractelt_nxv16f64_idx( %v, i32 zeroext %i ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: addi a2, sp, 64 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a0, a2, a0 ; RV32-NEXT: vs8r.v v8, (a2) -; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: vs8r.v v16, (a1) ; RV32-NEXT: fld fa0, 0(a0) @@ -1432,9 +1432,9 @@ define double @extractelt_nxv16f64_idx( %v, i32 zeroext %i ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: addi a2, sp, 64 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a0, a2, a0 ; RV64-NEXT: vs8r.v v8, (a2) -; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: vs8r.v v16, (a1) ; RV64-NEXT: fld fa0, 0(a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll index 14719e190a693..796f8dde58f47 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll @@ -139,22 +139,22 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind { ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: sub sp, sp, a3 ; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: addi a3, sp, 64 +; RV32-NEXT: vl8r.v v8, (a0) ; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a3, a0, a2 -; RV32-NEXT: vl8r.v v16, (a3) +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: vl8r.v v24, (a0) -; RV32-NEXT: addi a0, sp, 64 -; RV32-NEXT: add a1, a0, a1 -; RV32-NEXT: vsetvli a3, zero, e8, m8, ta, ma -; RV32-NEXT: vmseq.vi v8, v16, 0 -; RV32-NEXT: vmseq.vi v0, v24, 0 +; RV32-NEXT: vsetvli a0, zero, e8, m8, ta, ma +; RV32-NEXT: vmseq.vi v0, v8, 0 ; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: vmseq.vi v8, v24, 0 ; RV32-NEXT: vmerge.vim v24, v16, 1, v0 -; RV32-NEXT: vs8r.v v24, (a0) -; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: vs8r.v v24, (a3) ; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: vmerge.vim v8, v16, 1, v0 -; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vs8r.v v8, (a2) ; RV32-NEXT: lbu a0, 0(a1) ; RV32-NEXT: addi sp, s0, -80 ; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload @@ -179,22 +179,22 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind { ; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: sub sp, sp, a3 ; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: addi a3, sp, 64 +; RV64-NEXT: vl8r.v v8, (a0) ; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a3, a0, a2 -; RV64-NEXT: vl8r.v v16, (a3) +; RV64-NEXT: add a0, a0, a2 ; RV64-NEXT: vl8r.v v24, (a0) -; RV64-NEXT: addi a0, sp, 64 -; RV64-NEXT: add a1, a0, a1 -; RV64-NEXT: vsetvli a3, zero, e8, m8, ta, ma -; RV64-NEXT: vmseq.vi v8, v16, 0 -; RV64-NEXT: vmseq.vi v0, v24, 0 +; RV64-NEXT: vsetvli a0, zero, e8, m8, ta, ma +; RV64-NEXT: vmseq.vi v0, v8, 0 ; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: add a1, a3, a1 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: vmseq.vi v8, v24, 0 ; RV64-NEXT: vmerge.vim v24, v16, 1, v0 -; RV64-NEXT: vs8r.v v24, (a0) -; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: vs8r.v v24, (a3) ; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: vmerge.vim v8, v16, 1, v0 -; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vs8r.v v8, (a2) ; RV64-NEXT: lbu a0, 0(a1) ; RV64-NEXT: addi sp, s0, -80 ; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll index e6263ec9f0004..1474c73dacfc8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll @@ -613,8 +613,8 @@ define 
i64 @extractelt_nxv1i64_idx( %v, i32 %idx) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: vsrl.vx v8, v8, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret @@ -654,8 +654,8 @@ define i64 @extractelt_nxv2i64_idx( %v, i32 %idx) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: vsrl.vx v8, v8, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret @@ -695,8 +695,8 @@ define i64 @extractelt_nxv4i64_idx( %v, i32 %idx) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: vsrl.vx v8, v8, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret @@ -736,8 +736,8 @@ define i64 @extractelt_nxv8i64_idx( %v, i32 %idx) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: vsrl.vx v8, v8, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret @@ -876,12 +876,12 @@ define i32 @extractelt_nxv32i32_neg1( %v) { ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: andi sp, sp, -64 ; CHECK-NEXT: addi a0, sp, 64 -; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: slli a2, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: vs8r.v v16, (a2) -; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: lw a0, -4(a0) ; CHECK-NEXT: addi sp, s0, -80 @@ -932,9 +932,9 @@ define i32 @extractelt_nxv32i32_idx( %v, i32 %idx) { ; CHECK-NEXT: andi sp, sp, -64 ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a2, sp, 64 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a0, a2, a0 ; CHECK-NEXT: vs8r.v v8, (a2) -; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: vs8r.v v16, (a1) ; CHECK-NEXT: lw a0, 0(a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll index d5c2b9e484206..a9e129ef11a2c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll @@ -862,13 +862,13 @@ define i64 @extractelt_nxv16i64_neg1( %v) { ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: andi sp, sp, -64 ; CHECK-NEXT: addi a0, sp, 64 -; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a1, a2, 3 -; CHECK-NEXT: add a3, a0, a1 ; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: slli a3, a2, 3 ; CHECK-NEXT: srli a1, a1, 32 ; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: vs8r.v v16, (a3) ; CHECK-NEXT: bltu a2, a1, .LBB74_2 @@ -926,9 +926,9 @@ define i64 @extractelt_nxv16i64_idx( %v, i32 zeroext %idx) { ; CHECK-NEXT: andi sp, sp, -64 ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: addi a2, sp, 64 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a0, a2, a0 ; CHECK-NEXT: vs8r.v v8, (a2) -; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: vs8r.v v16, (a1) ; CHECK-NEXT: ld a0, 0(a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll index 4e549a5aa7c3a..1626b362fed15 
100644 --- a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll @@ -147,10 +147,10 @@ define @ceil_nxv1f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -170,10 +170,10 @@ define @ceil_nxv2f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -193,10 +193,10 @@ define @ceil_nxv4f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -216,10 +216,10 @@ define @ceil_nxv8f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -239,10 +239,10 @@ define @ceil_nxv16f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll index 35936574e8fe2..4aca2d694dfbb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll @@ -17,9 +17,9 @@ define @ceil_nxv1bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 3 @@ -40,9 +40,9 @@ define @ceil_nxv2bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 3 @@ -63,9 +63,9 @@ define @ceil_nxv4bf16( %x) { 
; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 3 @@ -86,9 +86,9 @@ define @ceil_nxv8bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 3 @@ -109,9 +109,9 @@ define @ceil_nxv16bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 3 @@ -132,9 +132,9 @@ define @ceil_nxv32bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 3 @@ -144,19 +144,21 @@ define @ceil_nxv32bf16( %x) { ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: vfabs.v v8, v24 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %a = call @llvm.ceil.nxv32bf16( %x) ret %a @@ -182,9 +184,9 @@ define @ceil_nxv1f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 @@ -221,9 +223,9 @@ define @ceil_nxv2f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 @@ -260,9 +262,9 @@ define @ceil_nxv4f16( %x) { ; ZVFHMIN: # %bb.0: 
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 @@ -299,9 +301,9 @@ define @ceil_nxv8f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 @@ -338,9 +340,9 @@ define @ceil_nxv16f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 @@ -377,9 +379,9 @@ define @ceil_nxv32f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 @@ -389,19 +391,21 @@ define @ceil_nxv32f16( %x) { ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: vfabs.v v8, v24 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: fsrmi a0, 3 -; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t ; ZVFHMIN-NEXT: fsrm a0 -; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: ret %a = call @llvm.ceil.nxv32f16( %x) ret %a diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll index f6b47743d1154..d93f15ec44053 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll @@ -147,10 +147,10 @@ define @floor_nxv1f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -170,10 +170,10 @@ 
define @floor_nxv2f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -193,10 +193,10 @@ define @floor_nxv4f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -216,10 +216,10 @@ define @floor_nxv8f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -239,10 +239,10 @@ define @floor_nxv16f32( %x) strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll index d26b74c7c139e..010d7786c8891 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll @@ -17,9 +17,9 @@ define @floor_nxv1bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 @@ -41,9 +41,9 @@ define @floor_nxv2bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 @@ -65,9 +65,9 @@ define @floor_nxv4bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 @@ -89,9 +89,9 @@ define @floor_nxv8bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: 
lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 @@ -113,9 +113,9 @@ define @floor_nxv16bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 @@ -137,9 +137,9 @@ define @floor_nxv32bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 2 @@ -149,19 +149,21 @@ define @floor_nxv32bf16( %x) { ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: vfabs.v v8, v24 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %a = call @llvm.floor.nxv32bf16( %x) ret %a @@ -188,9 +190,9 @@ define @floor_nxv1f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 @@ -227,9 +229,9 @@ define @floor_nxv2f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 @@ -266,9 +268,9 @@ define @floor_nxv4f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 @@ -305,9 +307,9 @@ define @floor_nxv8f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 
-; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 @@ -344,9 +346,9 @@ define @floor_nxv16f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 @@ -383,9 +385,9 @@ define @floor_nxv32f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 @@ -395,19 +397,21 @@ define @floor_nxv32f16( %x) { ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: vfabs.v v8, v24 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: fsrmi a0, 2 -; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t ; ZVFHMIN-NEXT: fsrm a0 -; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: ret %a = call @llvm.floor.nxv32f16( %x) ret %a diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll index a4e7bb2f31048..ce83e2d8a6220 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll @@ -19,30 +19,30 @@ define <512 x i8> @single_source(<512 x i8> %a) { ; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: li a0, 512 ; CHECK-NEXT: addi a1, sp, 512 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv.x.s a2, v16 +; CHECK-NEXT: vslidedown.vi v24, v16, 5 +; CHECK-NEXT: li a3, 432 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: vmv.v.x v8, a2 ; CHECK-NEXT: lbu a0, 770(sp) -; CHECK-NEXT: vmv.x.s a1, v16 -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vslide1down.vx v8, v8, a0 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 5 -; CHECK-NEXT: li a0, 432 ; CHECK-NEXT: li a1, 431 -; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma +; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: lbu a0, 1012(sp) +; CHECK-NEXT: vsetvli zero, a3, e8, m8, tu, ma ; CHECK-NEXT: vslideup.vx v8, v24, a1 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v16, 4 -; CHECK-NEXT: li a0, 466 -; CHECK-NEXT: li a1, 465 -; CHECK-NEXT: lbu a2, 1012(sp) -; 
CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v16, a1 -; CHECK-NEXT: vmv.s.x v16, a2 -; CHECK-NEXT: li a0, 501 -; CHECK-NEXT: li a1, 500 -; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: vslidedown.vi v24, v16, 4 +; CHECK-NEXT: li a1, 466 +; CHECK-NEXT: vmv.s.x v16, a0 +; CHECK-NEXT: li a0, 465 +; CHECK-NEXT: li a2, 501 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-NEXT: vslideup.vx v8, v24, a0 +; CHECK-NEXT: li a0, 500 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-NEXT: vslideup.vx v8, v16, a0 ; CHECK-NEXT: addi sp, s0, -1536 ; CHECK-NEXT: .cfi_def_cfa sp, 1536 ; CHECK-NEXT: ld ra, 1528(sp) # 8-byte Folded Reload @@ -61,28 +61,28 @@ define <512 x i8> @range_restriction(<512 x i8> %a) { ; CHECK-LABEL: range_restriction: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 512 +; CHECK-NEXT: li a1, 254 +; CHECK-NEXT: li a2, 432 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v16, 0 -; CHECK-NEXT: li a1, 254 ; CHECK-NEXT: vslide1down.vx v24, v16, a1 +; CHECK-NEXT: li a1, 431 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v16, 5 -; CHECK-NEXT: li a1, 432 -; CHECK-NEXT: li a2, 431 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v24, v16, a2 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v16, 4 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-NEXT: vslideup.vx v24, v16, a1 ; CHECK-NEXT: li a1, 466 ; CHECK-NEXT: li a2, 465 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v16, 4 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma ; CHECK-NEXT: vslideup.vx v24, v16, a2 ; CHECK-NEXT: li a1, 44 +; CHECK-NEXT: li a2, 501 ; CHECK-NEXT: vmv.s.x v16, a1 -; CHECK-NEXT: li a1, 501 -; CHECK-NEXT: li a2, 500 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v24, v16, a2 +; CHECK-NEXT: li a1, 500 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-NEXT: vslideup.vx v24, v16, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vrgather.vv v16, v8, v24 ; CHECK-NEXT: vmv.v.v v8, v16 @@ -107,53 +107,53 @@ define <512 x i8> @two_source(<512 x i8> %a, <512 x i8> %b) { ; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: li a0, 512 ; CHECK-NEXT: addi a1, sp, 512 -; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v24, 5 -; CHECK-NEXT: vmv.x.s a1, v24 +; CHECK-NEXT: vmv.x.s a2, v24 +; CHECK-NEXT: li a3, 432 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: li a1, 432 +; CHECK-NEXT: vmv.v.x v8, a2 ; CHECK-NEXT: li a2, 431 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-NEXT: vsetvli zero, a3, e8, m8, tu, ma ; CHECK-NEXT: vslideup.vx v8, v0, a2 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v24, 4 -; CHECK-NEXT: li a1, 466 -; CHECK-NEXT: li a2, 465 -; CHECK-NEXT: lbu a3, 985(sp) -; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v24, a2 -; CHECK-NEXT: lbu a1, 1012(sp) -; CHECK-NEXT: vmv.s.x v24, a3 -; CHECK-NEXT: li a2, 478 -; CHECK-NEXT: li a3, 477 +; CHECK-NEXT: vslidedown.vi v0, v24, 4 +; CHECK-NEXT: li a2, 466 +; CHECK-NEXT: li a3, 465 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vse8.v v24, (a1) +; CHECK-NEXT: lbu a1, 985(sp) ; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v24, a3 +; CHECK-NEXT: vslideup.vx v8, 
v0, a3 +; CHECK-NEXT: li a2, 478 +; CHECK-NEXT: lbu a3, 1012(sp) ; CHECK-NEXT: vmv.s.x v24, a1 +; CHECK-NEXT: li a1, 477 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-NEXT: vslideup.vx v8, v24, a1 ; CHECK-NEXT: li a1, 501 +; CHECK-NEXT: vmv.s.x v24, a3 ; CHECK-NEXT: li a2, 500 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma ; CHECK-NEXT: vslideup.vx v8, v24, a2 ; CHECK-NEXT: lui a1, 2761 -; CHECK-NEXT: slli a1, a1, 25 -; CHECK-NEXT: addi a1, a1, 501 -; CHECK-NEXT: slli a1, a1, 13 -; CHECK-NEXT: addi a1, a1, 512 ; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: lui a2, 4 +; CHECK-NEXT: vmv.s.x v25, a2 ; CHECK-NEXT: lui a2, 1047552 ; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: slli a2, a2, 23 ; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: slli a2, a2, 18 ; CHECK-NEXT: vslide1down.vx v0, v24, a2 -; CHECK-NEXT: lui a2, 4 -; CHECK-NEXT: vmv.s.x v24, a2 ; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: slli a1, a1, 25 +; CHECK-NEXT: addi a1, a1, 501 +; CHECK-NEXT: slli a1, a1, 13 +; CHECK-NEXT: addi a1, a1, 512 ; CHECK-NEXT: vsetivli zero, 7, e64, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v24, 6 +; CHECK-NEXT: vslideup.vi v0, v25, 6 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vmv.v.x v24, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll index 2808ca3fd2621..bfc43db2e369e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll @@ -6,11 +6,11 @@ define <1 x i1> @v1i1(i1 %x, i1 %y) { ; CHECK-LABEL: v1i1: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vmv.s.x v9, a1 +; CHECK-NEXT: vmsne.vi v8, v8, 0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 ; CHECK-NEXT: vmxor.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -26,11 +26,11 @@ define <2 x i1> @v2i1(i1 %x, i1 %y) { ; CHECK-LABEL: v2i1: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmsne.vi v8, v8, 0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 ; CHECK-NEXT: vmxor.mm v0, v8, v9 ; CHECK-NEXT: vmv.v.i v8, 0 @@ -50,11 +50,11 @@ define <4 x i1> @v4i1(i1 %x, i1 %y) { ; CHECK-LABEL: v4i1: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmsne.vi v8, v8, 0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 ; CHECK-NEXT: vmxor.mm v0, v8, v9 ; CHECK-NEXT: vmv.v.i v8, 0 @@ -74,11 +74,11 @@ define <8 x i1> @v8i1(i1 %x, i1 %y) { ; CHECK-LABEL: v8i1: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmsne.vi v8, v8, 0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 ; CHECK-NEXT: vmxor.mm v0, v8, v9 ; CHECK-NEXT: vmv.v.i v8, 0 @@ -98,11 +98,11 @@ define <16 x i1> @v16i1(i1 %x, i1 %y) { ; CHECK-LABEL: v16i1: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 1 +; 
CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmsne.vi v8, v8, 0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 ; CHECK-NEXT: vmxor.mm v0, v8, v9 ; CHECK-NEXT: vmv.v.i v8, 0 @@ -123,10 +123,10 @@ define <32 x i1> @v32i1(i1 %x, i1 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 1 ; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vmsne.vi v10, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vmsne.vi v11, v8, 0 ; CHECK-NEXT: vmxor.mm v0, v10, v11 @@ -148,10 +148,10 @@ define <64 x i1> @v64i1(i1 %x, i1 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 1 ; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vmsne.vi v12, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vmsne.vi v13, v8, 0 ; CHECK-NEXT: vmxor.mm v0, v12, v13 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll index c42fabd78aabf..3eb5d36b4896a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll @@ -11,18 +11,18 @@ define <2 x i8> @vp_bitreverse_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vand.vi v9, v8, 15, v0.t +; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -37,20 +37,20 @@ define <2 x i8> @vp_bitreverse_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vand.vi v9, v8, 15 -; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -65,18 +65,18 @@ define <4 x i8> @vp_bitreverse_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vand.vi v9, v8, 15, v0.t +; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: li 
a0, 51 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -91,20 +91,20 @@ define <4 x i8> @vp_bitreverse_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vand.vi v9, v8, 15 -; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -119,18 +119,18 @@ define <8 x i8> @vp_bitreverse_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vand.vi v9, v8, 15, v0.t +; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -145,20 +145,20 @@ define <8 x i8> @vp_bitreverse_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vand.vi v9, v8, 15 -; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -173,18 +173,18 @@ define <16 x i8> @vp_bitreverse_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vand.vi v9, v8, 15, v0.t +; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: li a0, 51 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; 
CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -199,20 +199,20 @@ define <16 x i8> @vp_bitreverse_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vand.vi v9, v8, 15 -; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -227,25 +227,25 @@ define <2 x i16> @vp_bitreverse_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -261,26 +261,26 @@ define <2 x i16> @vp_bitreverse_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -295,25 +295,25 @@ define <4 x i16> @vp_bitreverse_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %e ; CHECK: # %bb.0: ; 
CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -329,26 +329,26 @@ define <4 x i16> @vp_bitreverse_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -363,25 +363,25 @@ define <8 x i16> @vp_bitreverse_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -397,26 +397,26 @@ define <8 x i16> @vp_bitreverse_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) ; CHECK-NEXT: vsetvli zero, 
a0, e16, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -431,25 +431,25 @@ define <16 x i16> @vp_bitreverse_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t +; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 2, v0.t -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -465,26 +465,26 @@ define <16 x i16> @vp_bitreverse_v16i16_unmasked(<16 x i16> %va, i32 zeroext %ev ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 3 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: lui a0, 3 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 5 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret @@ -505,27 +505,27 @@ define <2 x i32> 
@vp_bitreverse_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %e ; CHECK-NEXT: vsrl.vi v10, v8, 24, v0.t ; CHECK-NEXT: vor.vv v9, v9, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vsll.vi v10, v10, 8, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -541,34 +541,34 @@ define <2 x i32> @vp_bitreverse_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 -; CHECK-NEXT: vand.vx v10, v8, a0 -; CHECK-NEXT: vsll.vi v10, v10, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vsll.vi v10, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -589,27 +589,27 @@ define <4 x i32> @vp_bitreverse_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %e ; CHECK-NEXT: vsrl.vi v10, v8, 24, v0.t ; CHECK-NEXT: vor.vv v9, v9, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vsll.vi v10, v10, 8, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: 
lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -625,34 +625,34 @@ define <4 x i32> @vp_bitreverse_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 -; CHECK-NEXT: vand.vx v10, v8, a0 -; CHECK-NEXT: vsll.vi v10, v10, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vsll.vi v10, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret @@ -673,27 +673,27 @@ define <8 x i32> @vp_bitreverse_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %e ; CHECK-NEXT: vsrl.vi v12, v8, 24, v0.t ; CHECK-NEXT: vor.vv v10, v10, v12, v0.t ; CHECK-NEXT: vand.vx v12, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vsll.vi v12, v12, 8, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 2, v0.t -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t ; 
CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -709,34 +709,34 @@ define <8 x i32> @vp_bitreverse_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: vor.vv v10, v10, v12 -; CHECK-NEXT: vand.vx v12, v8, a0 -; CHECK-NEXT: vsll.vi v12, v12, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 +; CHECK-NEXT: vsll.vi v12, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret @@ -757,27 +757,27 @@ define <16 x i32> @vp_bitreverse_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroex ; CHECK-NEXT: vsrl.vi v16, v8, 24, v0.t ; CHECK-NEXT: vor.vv v12, v12, v16, v0.t ; CHECK-NEXT: vand.vx v16, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vsll.vi v16, v16, 8, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t ; CHECK-NEXT: vor.vv v8, v8, v16, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v12, v12, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v12, v8, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 2, v0.t -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vand.vx v12, v12, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v12, v8, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t @@ -793,34 +793,34 @@ define <16 x i32> @vp_bitreverse_v16i32_unmasked(<16 x i32> %va, i32 zeroext %ev ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: vor.vv v12, v12, v16 -; CHECK-NEXT: vand.vx v16, v8, a0 -; CHECK-NEXT: vsll.vi v16, v16, 8 -; CHECK-NEXT: vsll.vi v8, 
v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 4 +; CHECK-NEXT: vsll.vi v16, v8, 24 +; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v16, v8 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: lui a0, 209715 +; CHECK-NEXT: addi a0, a0, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: ret @@ -835,68 +835,67 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a1, 1044480 -; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a4, 1044480 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: lui a5, 16 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: sw a4, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsll.vx v9, v8, a1, v0.t -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v10, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v10, v10, a3, v0.t -; RV32-NEXT: vor.vv v9, v9, v10, v0.t -; RV32-NEXT: addi a4, sp, 8 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vlse64.v v10, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v9, (a6), zero +; RV32-NEXT: lui a4, 61681 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vx v11, v8, a4, v0.t +; RV32-NEXT: vsll.vx v10, v8, a3, v0.t +; RV32-NEXT: addi a5, a5, -256 +; RV32-NEXT: vand.vx v11, v8, a5, v0.t +; RV32-NEXT: vsll.vx v11, v11, a2, v0.t +; RV32-NEXT: vor.vv v10, v10, v11, v0.t +; RV32-NEXT: vand.vx v11, v8, a1, v0.t ; RV32-NEXT: vsll.vi v11, v11, 24, v0.t -; RV32-NEXT: vand.vv v12, v8, v10, v0.t +; RV32-NEXT: vand.vv v12, v8, v9, v0.t ; RV32-NEXT: vsll.vi v12, v12, 8, v0.t ; RV32-NEXT: vor.vv v11, v11, v12, v0.t -; RV32-NEXT: vor.vv v9, v9, v11, v0.t -; RV32-NEXT: vsrl.vx v11, v8, a1, v0.t -; RV32-NEXT: vsrl.vx v12, v8, a3, v0.t -; RV32-NEXT: vand.vx v12, v12, a2, v0.t +; RV32-NEXT: vor.vv v10, v10, v11, v0.t +; RV32-NEXT: vsrl.vx v11, v8, a3, v0.t +; RV32-NEXT: lui a3, 209715 +; RV32-NEXT: vsrl.vx v12, v8, a2, v0.t +; RV32-NEXT: lui a2, 349525 +; RV32-NEXT: addi a4, a4, -241 +; RV32-NEXT: addi a3, a3, 819 +; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: vand.vx v12, v12, a5, v0.t ; RV32-NEXT: vor.vv v11, v12, v11, v0.t ; RV32-NEXT: vsrl.vi v12, v8, 24, v0.t -; RV32-NEXT: vand.vx v12, v12, a4, v0.t +; RV32-NEXT: vand.vx v12, v12, a1, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v8, v8, v9, v0.t +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v9, a4 +; 
RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v12, a3 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v8, v11, v0.t -; RV32-NEXT: vor.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: vmv.v.x v11, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: vand.vv v10, v10, v9, v0.t +; RV32-NEXT: vand.vv v8, v8, v9, v0.t ; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v9, v8, v0.t +; RV32-NEXT: vor.vv v8, v10, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v9, v9, v12, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: vsll.vi v8, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v9, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v9, v9, v11, v0.t +; RV32-NEXT: vand.vv v8, v8, v11, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v9, v8, v0.t ; RV32-NEXT: addi sp, sp, 16 @@ -906,59 +905,59 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e ; RV64-LABEL: vp_bitreverse_v2i64: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a3, 255 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: lui a5, 61681 +; RV64-NEXT: lui a6, 209715 +; RV64-NEXT: lui a7, 349525 +; RV64-NEXT: addiw a5, a5, -241 +; RV64-NEXT: addiw a6, a6, 819 +; RV64-NEXT: addiw a7, a7, 1365 +; RV64-NEXT: slli t0, a5, 32 +; RV64-NEXT: add t0, a5, t0 +; RV64-NEXT: slli a5, a6, 32 +; RV64-NEXT: add a6, a6, a5 +; RV64-NEXT: slli a5, a7, 32 +; RV64-NEXT: add a5, a7, a5 +; RV64-NEXT: li a7, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vand.vx v9, v8, a1, v0.t +; RV64-NEXT: slli a3, a3, 24 +; RV64-NEXT: addiw a0, a4, -256 ; RV64-NEXT: vsll.vi v9, v9, 24, v0.t -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vand.vx v10, v8, a3, v0.t ; RV64-NEXT: vsll.vi v10, v10, 8, v0.t ; RV64-NEXT: vor.vv v9, v9, v10, v0.t -; RV64-NEXT: li a2, 56 ; RV64-NEXT: vsll.vx v10, v8, a2, v0.t -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v11, v8, a3, v0.t -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v11, v11, a4, v0.t +; RV64-NEXT: vand.vx v11, v8, a0, v0.t +; RV64-NEXT: vsll.vx v11, v11, a7, v0.t ; RV64-NEXT: vor.vv v10, v10, v11, v0.t ; RV64-NEXT: vor.vv v9, v10, v9, v0.t ; RV64-NEXT: vsrl.vx v10, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v11, v8, a4, v0.t -; RV64-NEXT: vand.vx v11, v11, a3, v0.t +; RV64-NEXT: vsrl.vx v11, v8, a7, v0.t +; RV64-NEXT: vand.vx v11, v11, a0, v0.t ; RV64-NEXT: vor.vv v10, v11, v10, v0.t ; RV64-NEXT: vsrl.vi v11, v8, 24, v0.t ; RV64-NEXT: 
vand.vx v11, v11, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a3, v0.t ; RV64-NEXT: vor.vv v8, v8, v11, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vor.vv v8, v9, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v9, v9, t0, v0.t +; RV64-NEXT: vand.vx v8, v8, t0, v0.t ; RV64-NEXT: vsll.vi v8, v8, 4, v0.t ; RV64-NEXT: vor.vv v8, v9, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v9, v9, a6, v0.t +; RV64-NEXT: vand.vx v8, v8, a6, v0.t ; RV64-NEXT: vsll.vi v8, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v9, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v9, v9, a5, v0.t +; RV64-NEXT: vand.vx v8, v8, a5, v0.t ; RV64-NEXT: vsll.vi v8, v8, 1, v0.t ; RV64-NEXT: vor.vv v8, v9, v8, v0.t ; RV64-NEXT: ret @@ -972,67 +971,67 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsll.vx v9, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v10, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v10, v10, a3 -; RV32-NEXT: vor.vv v9, v9, v10 -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vsll.vx v10, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v11, v8, a2 +; RV32-NEXT: vsrl.vx v12, v8, a4 +; RV32-NEXT: vand.vx v13, v8, a1 +; RV32-NEXT: vand.vx v12, v12, a1 +; RV32-NEXT: vor.vv v11, v12, v11 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vlse64.v v10, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v12, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vx v11, v8, a4 -; RV32-NEXT: vsll.vi v11, v11, 24 -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vor.vv v11, v11, v12 -; RV32-NEXT: vor.vv v9, v9, v11 -; RV32-NEXT: vsrl.vx v11, v8, a1 -; RV32-NEXT: vsrl.vx v12, v8, a3 -; RV32-NEXT: vand.vx v12, v12, a2 -; RV32-NEXT: vor.vv v11, v12, v11 -; RV32-NEXT: vsrl.vi v12, v8, 24 -; RV32-NEXT: vand.vx v12, v12, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v11 -; RV32-NEXT: vor.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vsll.vx v13, v13, a4 +; RV32-NEXT: vor.vv v10, v10, v13 +; RV32-NEXT: vsrl.vi v13, v8, 8 +; RV32-NEXT: vand.vx v9, v9, a5 +; RV32-NEXT: vand.vv v13, v13, v12 +; RV32-NEXT: vor.vv v9, v13, v9 ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 349525 +; 
RV32-NEXT: vand.vv v12, v8, v12 +; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: addi a3, a3, 1365 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v12, v12, 8 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vor.vv v9, v9, v11 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v11, a2 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a3 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v9, v9, v12 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vand.vv v9, v9, v11 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1042,59 +1041,59 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) ; RV64-LABEL: vp_bitreverse_v2i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vand.vx v9, v8, a1 -; RV64-NEXT: vsll.vi v9, v9, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsll.vi v10, v10, 8 -; RV64-NEXT: vor.vv v9, v9, v10 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v10, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v11, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v11, v11, a4 -; RV64-NEXT: vor.vv v10, v10, v11 +; RV64-NEXT: vsrl.vi v9, v8, 24 +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v11, v8, a3 +; RV64-NEXT: vsrl.vx v12, v8, a5 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vor.vv v11, v12, v11 +; RV64-NEXT: vand.vx v12, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vsll.vi v12, v12, 24 +; RV64-NEXT: vand.vx v10, v10, a2 ; RV64-NEXT: vor.vv v9, v10, v9 -; RV64-NEXT: vsrl.vx v10, v8, a2 -; RV64-NEXT: vsrl.vx v11, v8, a4 -; RV64-NEXT: vand.vx v11, v11, a3 -; RV64-NEXT: vor.vv v10, v11, v10 -; RV64-NEXT: vsrl.vi v11, v8, 24 -; RV64-NEXT: vand.vx v11, v11, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v10, v8, a2 +; RV64-NEXT: vsll.vi v10, v10, 8 +; RV64-NEXT: vor.vv v10, v12, v10 +; RV64-NEXT: vsll.vx v12, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v11 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vor.vv 
v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: vor.vv v9, v9, v11 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v9, v9, a1 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v9, v9, a2 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: ret @@ -1109,68 +1108,67 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a1, 1044480 -; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a4, 1044480 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: lui a5, 16 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: sw a4, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsll.vx v10, v8, a1, v0.t -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v12, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v12, v12, a3, v0.t -; RV32-NEXT: vor.vv v10, v10, v12, v0.t -; RV32-NEXT: addi a4, sp, 8 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vlse64.v v12, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v10, (a6), zero +; RV32-NEXT: lui a4, 61681 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vx v14, v8, a4, v0.t +; RV32-NEXT: vsll.vx v12, v8, a3, v0.t +; RV32-NEXT: addi a5, a5, -256 +; RV32-NEXT: vand.vx v14, v8, a5, v0.t +; RV32-NEXT: vsll.vx v14, v14, a2, v0.t +; RV32-NEXT: vor.vv v12, v12, v14, v0.t +; RV32-NEXT: vand.vx v14, v8, a1, v0.t ; RV32-NEXT: vsll.vi v14, v14, 24, v0.t -; RV32-NEXT: vand.vv v16, v8, v12, v0.t +; RV32-NEXT: vand.vv v16, v8, v10, v0.t ; RV32-NEXT: vsll.vi v16, v16, 8, v0.t ; RV32-NEXT: vor.vv v14, v14, v16, v0.t -; RV32-NEXT: vor.vv v10, v10, v14, v0.t -; RV32-NEXT: vsrl.vx v14, v8, a1, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t -; RV32-NEXT: vand.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v12, v12, v14, v0.t +; RV32-NEXT: vsrl.vx v14, v8, a3, v0.t +; RV32-NEXT: lui a3, 209715 +; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV32-NEXT: lui a2, 349525 +; RV32-NEXT: addi a4, a4, -241 +; RV32-NEXT: addi a3, a3, 819 +; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: vand.vx v16, v16, a5, v0.t ; RV32-NEXT: vor.vv v14, v16, v14, v0.t ; 
RV32-NEXT: vsrl.vi v16, v8, 24, v0.t -; RV32-NEXT: vand.vx v16, v16, a4, v0.t +; RV32-NEXT: vand.vx v16, v16, a1, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a4 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vor.vv v8, v8, v14, v0.t -; RV32-NEXT: vor.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v14, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v12, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t +; RV32-NEXT: vand.vv v12, v12, v10, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: vor.vv v8, v12, v8, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v10, v10, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: vsll.vi v8, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v10, v8, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v10, v10, v14, v0.t +; RV32-NEXT: vand.vv v8, v8, v14, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v10, v8, v0.t ; RV32-NEXT: addi sp, sp, 16 @@ -1180,59 +1178,59 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e ; RV64-LABEL: vp_bitreverse_v4i64: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a3, 255 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: lui a5, 61681 +; RV64-NEXT: lui a6, 209715 +; RV64-NEXT: lui a7, 349525 +; RV64-NEXT: addiw a5, a5, -241 +; RV64-NEXT: addiw a6, a6, 819 +; RV64-NEXT: addiw a7, a7, 1365 +; RV64-NEXT: slli t0, a5, 32 +; RV64-NEXT: add t0, a5, t0 +; RV64-NEXT: slli a5, a6, 32 +; RV64-NEXT: add a6, a6, a5 +; RV64-NEXT: slli a5, a7, 32 +; RV64-NEXT: add a5, a7, a5 +; RV64-NEXT: li a7, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vand.vx v10, v8, a1, v0.t +; RV64-NEXT: slli a3, a3, 24 +; RV64-NEXT: addiw a0, a4, -256 ; RV64-NEXT: vsll.vi v10, v10, 24, v0.t -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vand.vx v12, v8, a3, v0.t ; RV64-NEXT: vsll.vi v12, v12, 8, v0.t ; RV64-NEXT: vor.vv v10, v10, v12, v0.t -; RV64-NEXT: li a2, 56 ; RV64-NEXT: vsll.vx v12, v8, a2, v0.t -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v14, v8, a3, v0.t -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v14, v14, a4, v0.t +; RV64-NEXT: vand.vx v14, v8, a0, v0.t +; RV64-NEXT: vsll.vx v14, v14, a7, v0.t ; RV64-NEXT: vor.vv v12, v12, 
v14, v0.t ; RV64-NEXT: vor.vv v10, v12, v10, v0.t ; RV64-NEXT: vsrl.vx v12, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v14, v8, a4, v0.t -; RV64-NEXT: vand.vx v14, v14, a3, v0.t +; RV64-NEXT: vsrl.vx v14, v8, a7, v0.t +; RV64-NEXT: vand.vx v14, v14, a0, v0.t ; RV64-NEXT: vor.vv v12, v14, v12, v0.t ; RV64-NEXT: vsrl.vi v14, v8, 24, v0.t ; RV64-NEXT: vand.vx v14, v14, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a3, v0.t ; RV64-NEXT: vor.vv v8, v8, v14, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: vor.vv v8, v10, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v10, v10, t0, v0.t +; RV64-NEXT: vand.vx v8, v8, t0, v0.t ; RV64-NEXT: vsll.vi v8, v8, 4, v0.t ; RV64-NEXT: vor.vv v8, v10, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v10, v10, a6, v0.t +; RV64-NEXT: vand.vx v8, v8, a6, v0.t ; RV64-NEXT: vsll.vi v8, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v10, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v10, v10, a5, v0.t +; RV64-NEXT: vand.vx v8, v8, a5, v0.t ; RV64-NEXT: vsll.vi v8, v8, 1, v0.t ; RV64-NEXT: vor.vv v8, v10, v8, v0.t ; RV64-NEXT: ret @@ -1246,67 +1244,67 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v14, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsll.vx v10, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v12, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v12, v12, a3 -; RV32-NEXT: vor.vv v10, v10, v12 -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vsll.vx v12, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v10, v8, a2 +; RV32-NEXT: vsrl.vx v16, v8, a4 +; RV32-NEXT: vand.vx v18, v8, a1 +; RV32-NEXT: vand.vx v16, v16, a1 +; RV32-NEXT: vor.vv v10, v16, v10 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vlse64.v v12, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v16, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vx v14, v8, a4 -; RV32-NEXT: vsll.vi v14, v14, 24 -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vor.vv v14, v14, v16 -; RV32-NEXT: vor.vv v10, v10, v14 -; RV32-NEXT: vsrl.vx v14, v8, a1 -; RV32-NEXT: vsrl.vx v16, v8, a3 -; RV32-NEXT: vand.vx v16, v16, a2 -; RV32-NEXT: vor.vv v14, v16, v14 -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vor.vv 
v8, v8, v14 -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vsll.vx v18, v18, a4 +; RV32-NEXT: vor.vv v12, v12, v18 +; RV32-NEXT: vsrl.vi v18, v8, 8 +; RV32-NEXT: vand.vx v14, v14, a5 +; RV32-NEXT: vand.vv v18, v18, v16 +; RV32-NEXT: vor.vv v14, v18, v14 ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 349525 +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: addi a3, a3, 1365 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vor.vv v10, v14, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v14, a2 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a3 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v10, v10, v16 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v10, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vand.vv v10, v10, v14 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1316,59 +1314,59 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) ; RV64-LABEL: vp_bitreverse_v4i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vand.vx v10, v8, a1 -; RV64-NEXT: vsll.vi v10, v10, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsll.vi v12, v12, 8 -; RV64-NEXT: vor.vv v10, v10, v12 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v12, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v14, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v14, v14, a4 -; RV64-NEXT: vor.vv v12, v12, v14 -; RV64-NEXT: vor.vv v10, v12, v10 -; RV64-NEXT: vsrl.vx v12, v8, a2 -; RV64-NEXT: vsrl.vx v14, v8, a4 -; RV64-NEXT: vand.vx v14, v14, a3 +; RV64-NEXT: vsrl.vi v12, v8, 24 +; RV64-NEXT: vsrl.vi v14, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v10, v8, a3 +; RV64-NEXT: vsrl.vx v16, v8, a5 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vor.vv v10, v16, v10 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v12, v12, a1 +; RV64-NEXT: vsll.vi v16, v16, 24 +; RV64-NEXT: vand.vx 
v14, v14, a2 ; RV64-NEXT: vor.vv v12, v14, v12 -; RV64-NEXT: vsrl.vi v14, v8, 24 -; RV64-NEXT: vand.vx v14, v14, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v14, v8, a2 +; RV64-NEXT: vsll.vi v14, v14, 8 +; RV64-NEXT: vor.vv v14, v16, v14 +; RV64-NEXT: vsll.vx v16, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v14 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vor.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: vor.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v14 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v10, v10, a0 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v10, v10, a1 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v10, v10, a2 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: ret @@ -1383,70 +1381,69 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a1, 1044480 -; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a4, 1044480 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: lui a5, 16 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: sw a4, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsll.vx v12, v8, a1, v0.t -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v16, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v16, v16, a3, v0.t -; RV32-NEXT: vor.vv v16, v12, v16, v0.t -; RV32-NEXT: addi a4, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vlse64.v v12, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v12, (a6), zero +; RV32-NEXT: lui a4, 61681 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vx v20, v8, a4, v0.t +; RV32-NEXT: vsll.vx v16, v8, a3, v0.t +; RV32-NEXT: addi a5, a5, -256 +; RV32-NEXT: vand.vx v20, v8, a5, v0.t +; RV32-NEXT: vsll.vx v20, v20, a2, v0.t +; RV32-NEXT: vor.vv v16, v16, v20, v0.t +; RV32-NEXT: vand.vx v20, v8, a1, v0.t ; RV32-NEXT: vsll.vi v20, v20, 24, v0.t ; RV32-NEXT: vand.vv v24, v8, v12, v0.t ; RV32-NEXT: vsll.vi v24, v24, 8, v0.t ; RV32-NEXT: vor.vv v20, v20, v24, v0.t ; RV32-NEXT: vor.vv v16, v16, v20, v0.t -; RV32-NEXT: vsrl.vx v20, v8, a1, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: 
vand.vx v24, v24, a2, v0.t +; RV32-NEXT: vsrl.vx v20, v8, a3, v0.t +; RV32-NEXT: lui a3, 209715 +; RV32-NEXT: vsrl.vx v24, v8, a2, v0.t +; RV32-NEXT: lui a2, 349525 +; RV32-NEXT: addi a4, a4, -241 +; RV32-NEXT: addi a3, a3, 819 +; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: vand.vx v24, v24, a5, v0.t ; RV32-NEXT: vor.vv v20, v24, v20, v0.t ; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t -; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vand.vx v24, v24, a1, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t ; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vor.vv v8, v8, v20, v0.t -; RV32-NEXT: vor.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vmv.v.x v28, a4 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vor.vv v8, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vmv.v.x v12, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsll.vi v8, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vor.vv v20, v8, v20, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vmv.v.x v8, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vor.vv v16, v16, v20, v0.t +; RV32-NEXT: vsrl.vi v20, v16, 4, v0.t +; RV32-NEXT: vand.vv v20, v20, v28, v0.t +; RV32-NEXT: vand.vv v16, v16, v28, v0.t +; RV32-NEXT: vsll.vi v16, v16, 4, v0.t +; RV32-NEXT: vor.vv v16, v20, v16, v0.t +; RV32-NEXT: vsrl.vi v20, v16, 2, v0.t +; RV32-NEXT: vand.vv v20, v20, v12, v0.t +; RV32-NEXT: vand.vv v12, v16, v12, v0.t +; RV32-NEXT: vsll.vi v12, v12, 2, v0.t +; RV32-NEXT: vor.vv v12, v20, v12, v0.t +; RV32-NEXT: vsrl.vi v16, v12, 1, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vand.vv v8, v12, v8, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v12, v8, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -1454,59 +1451,59 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e ; RV64-LABEL: vp_bitreverse_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a3, 255 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: lui a5, 61681 +; RV64-NEXT: lui a6, 209715 +; RV64-NEXT: lui a7, 349525 +; RV64-NEXT: addiw a5, a5, -241 +; RV64-NEXT: addiw a6, a6, 819 +; RV64-NEXT: addiw a7, a7, 1365 +; RV64-NEXT: slli t0, a5, 32 +; RV64-NEXT: add t0, a5, t0 +; RV64-NEXT: slli a5, a6, 32 +; RV64-NEXT: add a6, a6, a5 +; RV64-NEXT: slli a5, a7, 32 +; RV64-NEXT: add a5, a7, a5 +; RV64-NEXT: li a7, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vand.vx v12, v8, a1, v0.t +; RV64-NEXT: slli a3, a3, 24 +; RV64-NEXT: addiw a0, a4, -256 ; RV64-NEXT: vsll.vi v12, v12, 24, v0.t -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; 
RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a3, v0.t ; RV64-NEXT: vsll.vi v16, v16, 8, v0.t ; RV64-NEXT: vor.vv v12, v12, v16, v0.t -; RV64-NEXT: li a2, 56 ; RV64-NEXT: vsll.vx v16, v8, a2, v0.t -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v20, v8, a3, v0.t -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v20, v20, a4, v0.t +; RV64-NEXT: vand.vx v20, v8, a0, v0.t +; RV64-NEXT: vsll.vx v20, v20, a7, v0.t ; RV64-NEXT: vor.vv v16, v16, v20, v0.t ; RV64-NEXT: vor.vv v12, v16, v12, v0.t ; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v20, v8, a4, v0.t -; RV64-NEXT: vand.vx v20, v20, a3, v0.t +; RV64-NEXT: vsrl.vx v20, v8, a7, v0.t +; RV64-NEXT: vand.vx v20, v20, a0, v0.t ; RV64-NEXT: vor.vv v16, v20, v16, v0.t ; RV64-NEXT: vsrl.vi v20, v8, 24, v0.t ; RV64-NEXT: vand.vx v20, v20, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a3, v0.t ; RV64-NEXT: vor.vv v8, v8, v20, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vor.vv v8, v12, v8, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v12, v12, t0, v0.t +; RV64-NEXT: vand.vx v8, v8, t0, v0.t ; RV64-NEXT: vsll.vi v8, v8, 4, v0.t ; RV64-NEXT: vor.vv v8, v12, v8, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v12, v12, a6, v0.t +; RV64-NEXT: vand.vx v8, v8, a6, v0.t ; RV64-NEXT: vsll.vi v8, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v12, v8, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v12, v12, a5, v0.t +; RV64-NEXT: vand.vx v8, v8, a5, v0.t ; RV64-NEXT: vsll.vi v8, v8, 1, v0.t ; RV64-NEXT: vor.vv v8, v12, v8, v0.t ; RV64-NEXT: ret @@ -1520,67 +1517,67 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v20, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsll.vx v12, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v16, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v16, v16, a3 -; RV32-NEXT: vor.vv v12, v12, v16 -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vsll.vx v16, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v12, v8, a2 +; RV32-NEXT: vsrl.vx v24, v8, a4 +; RV32-NEXT: vand.vx v28, v8, a1 +; RV32-NEXT: vand.vx v24, v24, a1 +; RV32-NEXT: vor.vv v12, v24, v12 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v24, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vx v20, v8, a4 -; 
RV32-NEXT: vsll.vi v20, v20, 24 -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v20, v20, v24 -; RV32-NEXT: vor.vv v12, v12, v20 -; RV32-NEXT: vsrl.vx v20, v8, a1 -; RV32-NEXT: vsrl.vx v24, v8, a3 -; RV32-NEXT: vand.vx v24, v24, a2 -; RV32-NEXT: vor.vv v20, v24, v20 -; RV32-NEXT: vsrl.vi v24, v8, 24 -; RV32-NEXT: vand.vx v24, v24, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v8, v20 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vsll.vx v28, v28, a4 +; RV32-NEXT: vor.vv v16, v16, v28 +; RV32-NEXT: vsrl.vi v28, v8, 8 +; RV32-NEXT: vand.vx v20, v20, a5 +; RV32-NEXT: vand.vv v28, v28, v24 +; RV32-NEXT: vor.vv v20, v28, v20 ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 349525 +; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: addi a3, a3, 1365 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vor.vv v12, v20, v12 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v20, a2 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v12, v12, v24 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v12, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v20 +; RV32-NEXT: vand.vv v12, v12, v20 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1590,59 +1587,59 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) ; RV64-LABEL: vp_bitreverse_v8i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vand.vx v12, v8, a1 -; RV64-NEXT: vsll.vi v12, v12, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsll.vi v16, v16, 8 -; RV64-NEXT: vor.vv v12, v12, v16 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v16, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v20, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v20, v20, a4 -; RV64-NEXT: vor.vv v16, v16, v20 -; RV64-NEXT: vor.vv v12, 
v16, v12 -; RV64-NEXT: vsrl.vx v16, v8, a2 -; RV64-NEXT: vsrl.vx v20, v8, a4 -; RV64-NEXT: vand.vx v20, v20, a3 +; RV64-NEXT: vsrl.vi v16, v8, 24 +; RV64-NEXT: vsrl.vi v20, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v12, v8, a3 +; RV64-NEXT: vsrl.vx v24, v8, a5 +; RV64-NEXT: vand.vx v24, v24, a0 +; RV64-NEXT: vor.vv v12, v24, v12 +; RV64-NEXT: vand.vx v24, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vsll.vi v24, v24, 24 +; RV64-NEXT: vand.vx v20, v20, a2 ; RV64-NEXT: vor.vv v16, v20, v16 -; RV64-NEXT: vsrl.vi v20, v8, 24 -; RV64-NEXT: vand.vx v20, v20, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v20, v8, a2 +; RV64-NEXT: vsll.vi v20, v20, 8 +; RV64-NEXT: vor.vv v20, v24, v20 +; RV64-NEXT: vsll.vx v24, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v20 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vor.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v24, v8 ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: vor.vv v12, v16, v12 +; RV64-NEXT: vor.vv v8, v8, v20 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v12, v12, a1 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v12, v12, a2 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: ret @@ -1662,116 +1659,117 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb -; RV32-NEXT: lui a1, 1044480 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw zero, 20(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: vmv8r.v v24, v8 +; RV32-NEXT: lui a2, 1044480 +; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: lui a4, 209715 +; RV32-NEXT: lui a5, 349525 ; RV32-NEXT: li a1, 56 +; RV32-NEXT: lui a6, 16 +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw zero, 20(sp) +; RV32-NEXT: addi a2, a3, -241 +; RV32-NEXT: sw a2, 40(sp) +; RV32-NEXT: sw a2, 44(sp) +; RV32-NEXT: li 
a2, 40 +; RV32-NEXT: addi a3, a4, 819 +; RV32-NEXT: sw a3, 32(sp) +; RV32-NEXT: sw a3, 36(sp) +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: addi a4, a5, 1365 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v16, v8, a1, v0.t -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a5, a6, -256 +; RV32-NEXT: sw a4, 24(sp) +; RV32-NEXT: sw a4, 28(sp) +; RV32-NEXT: vand.vx v8, v8, a5, v0.t +; RV32-NEXT: vsll.vx v8, v8, a2, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero +; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: lui a3, 4080 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vx v16, v24, a3, v0.t +; RV32-NEXT: vsll.vi v16, v16, 24, v0.t +; RV32-NEXT: addi a4, sp, 48 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 48 +; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v24, v8, a4, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t -; RV32-NEXT: addi a5, sp, 48 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsll.vi v16, v24, 8, v0.t -; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 4 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 48 -; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 4 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 48 -; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: vand.vx v24, v24, a2, v0.t -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vsrl.vx v16, v24, a1, v0.t +; RV32-NEXT: vsrl.vx v8, v24, a2, v0.t +; RV32-NEXT: vand.vx v8, v8, a5, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t -; RV32-NEXT: vand.vx v24, v24, a4, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v24, 24, v0.t +; RV32-NEXT: vand.vx v16, v8, a3, v0.t +; RV32-NEXT: 
vsrl.vi v8, v24, 8, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v16, v16, 4, v0.t -; RV32-NEXT: vor.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsll.vi v24, v24, 4, v0.t +; RV32-NEXT: vor.vv v24, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v16, v16, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vsll.vi v24, v24, 2, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 @@ -1790,66 +1788,65 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex ; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vand.vx v16, v8, a1, v0.t +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: addiw a0, a4, -256 ; RV64-NEXT: vsll.vi v16, v16, 24, v0.t -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0, v0.t +; 
RV64-NEXT: vand.vx v24, v8, a2, v0.t ; RV64-NEXT: vsll.vi v24, v24, 8, v0.t ; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: addi a2, sp, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2, v0.t -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t -; RV64-NEXT: vsll.vx v16, v16, a4, v0.t +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vx v24, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vsll.vx v16, v16, a5, v0.t ; RV64-NEXT: vor.vv v16, v24, v16, v0.t -; RV64-NEXT: addi a5, sp, 16 -; RV64-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload ; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; RV64-NEXT: vsrl.vx v24, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t -; RV64-NEXT: vand.vx v16, v16, a3, v0.t +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vx v24, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a5, v0.t +; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vor.vv v24, v16, v24, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 24, v0.t ; RV64-NEXT: vand.vx v16, v16, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vor.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vand.vx v8, v8, a0, v0.t ; RV64-NEXT: vsll.vi v8, v8, 4, v0.t ; RV64-NEXT: vor.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vsll.vi v8, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vsll.vi v8, v8, 1, v0.t ; RV64-NEXT: vor.vv v8, v16, v8, v0.t ; RV64-NEXT: csrr a0, vlenb @@ -1869,91 +1866,98 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 -; 
RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: lui a2, 61681 +; RV32-NEXT: lui a3, 209715 +; RV32-NEXT: lui a4, 349525 +; RV32-NEXT: li a5, 56 +; RV32-NEXT: lui a6, 16 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vx v16, v8, a5 +; RV32-NEXT: vsrl.vx v24, v8, a5 +; RV32-NEXT: li a5, 40 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw zero, 20(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a1, a2, -241 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3 -; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: addi a2, a3, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: addi a3, a4, 1365 +; RV32-NEXT: addi a4, a6, -256 +; RV32-NEXT: vsrl.vx v0, v8, a5 +; RV32-NEXT: sw a3, 24(sp) +; RV32-NEXT: sw a3, 28(sp) +; RV32-NEXT: vand.vx v0, v0, a4 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a3, sp, 48 +; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a4 +; RV32-NEXT: vsll.vx v0, v0, a5 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v0, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: addi a5, sp, 48 -; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v0, v8, a3 -; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: vor.vv v0, v0, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a1 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vand.vx v8, v8, a1 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vor.vv v0, v8, v24 +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; 
RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v16, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vand.vv v24, v24, v16 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 2 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 2 -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 1 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 ; RV32-NEXT: addi sp, sp, 48 @@ -1962,62 +1966,78 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev ; ; RV64-LABEL: vp_bitreverse_v15i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsll.vi v16, v16, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0 -; RV64-NEXT: vsll.vi v24, v24, 8 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v0, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v0, v0, a4 -; RV64-NEXT: vor.vv v24, v24, v0 -; RV64-NEXT: vor.vv v16, v24, v16 -; RV64-NEXT: vsrl.vx v24, v8, a2 -; RV64-NEXT: vsrl.vx v0, v8, a4 -; RV64-NEXT: vand.vx v0, v0, a3 +; RV64-NEXT: vsrl.vi v24, v8, 24 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v16, v8, a3 +; RV64-NEXT: vsrl.vx v0, v8, a5 +; RV64-NEXT: vand.vx v0, v0, a0 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v0, v8, 8 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vand.vx v0, v0, a2 ; 
RV64-NEXT: vor.vv v24, v0, v24 -; RV64-NEXT: vsrl.vi v0, v8, 24 -; RV64-NEXT: vand.vx v0, v0, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v0, v8, a1 +; RV64-NEXT: vsll.vi v0, v0, 24 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v0, v0, v16 +; RV64-NEXT: vsll.vx v16, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v0 -; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsll.vx v8, v8, a5 ; RV64-NEXT: vor.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vor.vv v8, v8, v0 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v16, v16, a1 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v16, v16, a2 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %v = call <15 x i64> @llvm.vp.bitreverse.v15i64(<15 x i64> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x i64> %v @@ -2035,116 +2055,117 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb -; RV32-NEXT: lui a1, 1044480 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw zero, 20(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: vmv8r.v v24, v8 +; RV32-NEXT: lui a2, 1044480 +; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: lui a4, 209715 +; RV32-NEXT: lui a5, 349525 ; RV32-NEXT: li a1, 56 +; RV32-NEXT: lui a6, 16 +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw zero, 20(sp) +; RV32-NEXT: addi a2, a3, -241 +; RV32-NEXT: sw a2, 40(sp) +; RV32-NEXT: sw a2, 44(sp) +; RV32-NEXT: li a2, 40 +; RV32-NEXT: addi a3, a4, 819 +; RV32-NEXT: sw a3, 32(sp) +; RV32-NEXT: sw a3, 36(sp) +; 
RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: addi a4, a5, 1365 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v16, v8, a1, v0.t -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a5, a6, -256 +; RV32-NEXT: sw a4, 24(sp) +; RV32-NEXT: sw a4, 28(sp) +; RV32-NEXT: vand.vx v8, v8, a5, v0.t +; RV32-NEXT: vsll.vx v8, v8, a2, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero +; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: lui a3, 4080 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vx v16, v24, a3, v0.t +; RV32-NEXT: vsll.vi v16, v16, 24, v0.t +; RV32-NEXT: addi a4, sp, 48 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 48 +; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v24, v8, a4, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t -; RV32-NEXT: addi a5, sp, 48 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsll.vi v16, v24, 8, v0.t -; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 4 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 48 -; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 4 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 48 -; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: vand.vx v24, v24, a2, v0.t -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vsrl.vx v16, v24, a1, v0.t +; RV32-NEXT: vsrl.vx v8, v24, a2, v0.t +; RV32-NEXT: vand.vx v8, v8, a5, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t -; RV32-NEXT: vand.vx v24, v24, a4, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v24, 24, v0.t +; RV32-NEXT: vand.vx v16, v8, a3, v0.t +; RV32-NEXT: vsrl.vi v8, v24, 8, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add 
a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v16, v16, 4, v0.t -; RV32-NEXT: vor.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsll.vi v24, v24, 4, v0.t +; RV32-NEXT: vor.vv v24, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v16, v16, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vsll.vi v24, v24, 2, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 @@ -2163,66 +2184,65 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vand.vx v16, v8, a1, v0.t +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: addiw a0, a4, -256 ; RV64-NEXT: vsll.vi v16, v16, 24, v0.t -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0, v0.t +; RV64-NEXT: vand.vx v24, v8, a2, v0.t ; RV64-NEXT: vsll.vi v24, v24, 8, v0.t ; RV64-NEXT: vor.vv v16, 
v16, v24, v0.t -; RV64-NEXT: addi a2, sp, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2, v0.t -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t -; RV64-NEXT: vsll.vx v16, v16, a4, v0.t +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vx v24, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vsll.vx v16, v16, a5, v0.t ; RV64-NEXT: vor.vv v16, v24, v16, v0.t -; RV64-NEXT: addi a5, sp, 16 -; RV64-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload ; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; RV64-NEXT: vsrl.vx v24, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t -; RV64-NEXT: vand.vx v16, v16, a3, v0.t +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vx v24, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a5, v0.t +; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vor.vv v24, v16, v24, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 24, v0.t ; RV64-NEXT: vand.vx v16, v16, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vor.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vand.vx v8, v8, a0, v0.t ; RV64-NEXT: vsll.vi v8, v8, 4, v0.t ; RV64-NEXT: vor.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vsll.vi v8, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vsll.vi v8, v8, 1, v0.t ; RV64-NEXT: vor.vv v8, v16, v8, v0.t ; RV64-NEXT: csrr a0, vlenb @@ -2242,91 +2262,98 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 
0x1e, 0x22 # sp + 48 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: lui a2, 61681 +; RV32-NEXT: lui a3, 209715 +; RV32-NEXT: lui a4, 349525 +; RV32-NEXT: li a5, 56 +; RV32-NEXT: lui a6, 16 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vx v16, v8, a5 +; RV32-NEXT: vsrl.vx v24, v8, a5 +; RV32-NEXT: li a5, 40 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw zero, 20(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a1, a2, -241 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3 -; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: addi a2, a3, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: addi a3, a4, 1365 +; RV32-NEXT: addi a4, a6, -256 +; RV32-NEXT: vsrl.vx v0, v8, a5 +; RV32-NEXT: sw a3, 24(sp) +; RV32-NEXT: sw a3, 28(sp) +; RV32-NEXT: vand.vx v0, v0, a4 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a3, sp, 48 +; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a4 +; RV32-NEXT: vsll.vx v0, v0, a5 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v0, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: addi a5, sp, 48 -; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v0, v8, a3 -; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: vor.vv v0, v0, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a1 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vand.vx v8, v8, a1 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vor.vv v0, v8, v24 +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size 
Folded Reload +; RV32-NEXT: vor.vv v24, v16, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vand.vv v24, v24, v16 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 2 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 2 -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 1 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 ; RV32-NEXT: addi sp, sp, 48 @@ -2335,62 +2362,78 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev ; ; RV64-LABEL: vp_bitreverse_v16i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsll.vi v16, v16, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0 -; RV64-NEXT: vsll.vi v24, v24, 8 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v0, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v0, v0, a4 -; RV64-NEXT: vor.vv v24, v24, v0 -; RV64-NEXT: vor.vv v16, v24, v16 -; RV64-NEXT: vsrl.vx v24, v8, a2 -; RV64-NEXT: vsrl.vx v0, v8, a4 -; RV64-NEXT: vand.vx v0, v0, a3 +; RV64-NEXT: vsrl.vi v24, v8, 24 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v16, v8, a3 +; RV64-NEXT: vsrl.vx v0, v8, a5 +; RV64-NEXT: vand.vx v0, v0, a0 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v0, v8, 8 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vand.vx v0, v0, a2 ; RV64-NEXT: vor.vv v24, v0, v24 -; RV64-NEXT: vsrl.vi v0, v8, 24 -; RV64-NEXT: vand.vx v0, v0, a1 -; RV64-NEXT: 
vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v0, v8, a1 +; RV64-NEXT: vsll.vi v0, v0, 24 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v0, v0, v16 +; RV64-NEXT: vsll.vx v16, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v0 -; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsll.vx v8, v8, a5 ; RV64-NEXT: vor.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 349525 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vor.vv v8, v8, v0 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v16, v16, a1 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v16, v16, a2 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.bitreverse.v16i64(<16 x i64> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x i64> %v @@ -2422,58 +2465,58 @@ define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 ze ; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 8, v0.t +; CHECK-NEXT: lui a1, 1 +; CHECK-NEXT: lui a2, 3 +; CHECK-NEXT: addi a3, a0, -64 +; CHECK-NEXT: sltu a0, a0, a3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a3, a0, a3 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: addi a4, a1, -241 +; CHECK-NEXT: addi a1, a2, 819 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vor.vv v8, v8, v16, v0.t ; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t -; CHECK-NEXT: lui a1, 1 -; CHECK-NEXT: addi a1, a1, -241 -; CHECK-NEXT: vand.vx v16, v16, a1, v0.t -; CHECK-NEXT: vand.vx v8, v8, a1, v0.t +; CHECK-NEXT: vand.vx v16, v16, a4, v0.t +; CHECK-NEXT: vand.vx v8, v8, a4, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v16, v8, v0.t ; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t -; CHECK-NEXT: lui a2, 3 -; CHECK-NEXT: addi a2, a2, 819 -; CHECK-NEXT: vand.vx v16, v16, a2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a2, v0.t +; CHECK-NEXT: vand.vx v16, v16, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v16, v8, v0.t ; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t -; 
CHECK-NEXT: lui a3, 5 -; CHECK-NEXT: addi a3, a3, 1365 -; CHECK-NEXT: vand.vx v16, v16, a3, v0.t -; CHECK-NEXT: vand.vx v8, v8, a3, v0.t +; CHECK-NEXT: vand.vx v16, v16, a0, v0.t +; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vor.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a4, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; CHECK-NEXT: addi a4, a0, -64 -; CHECK-NEXT: sltu a0, a0, a4 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a4 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 3 -; CHECK-NEXT: add a4, sp, a4 -; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v8, 8, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t ; CHECK-NEXT: vor.vv v8, v8, v16, v0.t ; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t -; CHECK-NEXT: vand.vx v16, v16, a1, v0.t -; CHECK-NEXT: vand.vx v8, v8, a1, v0.t +; CHECK-NEXT: vand.vx v16, v16, a4, v0.t +; CHECK-NEXT: vand.vx v8, v8, a4, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v16, v8, v0.t ; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t -; CHECK-NEXT: vand.vx v16, v16, a2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a2, v0.t +; CHECK-NEXT: vand.vx v16, v16, a1, v0.t +; CHECK-NEXT: vand.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v16, v8, v0.t ; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t -; CHECK-NEXT: vand.vx v16, v16, a3, v0.t -; CHECK-NEXT: vand.vx v8, v8, a3, v0.t +; CHECK-NEXT: vand.vx v16, v16, a0, v0.t +; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vor.vv v16, v16, v8, v0.t ; CHECK-NEXT: addi a0, sp, 16 @@ -2501,49 +2544,53 @@ define <128 x i16> @vp_bitreverse_v128i16_unmasked(<128 x i16> %va, i32 zeroext ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v24, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v24 -; CHECK-NEXT: vsrl.vi v24, v8, 4 -; CHECK-NEXT: lui a1, 1 -; CHECK-NEXT: addi a1, a1, -241 -; CHECK-NEXT: vand.vx v24, v24, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v24, v8 -; CHECK-NEXT: vsrl.vi v24, v8, 2 -; CHECK-NEXT: lui a2, 3 -; CHECK-NEXT: addi a2, a2, 819 -; CHECK-NEXT: vand.vx v24, v24, a2 -; CHECK-NEXT: vand.vx v8, v8, a2 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v24, v8 -; CHECK-NEXT: vsrl.vi v24, v8, 1 -; CHECK-NEXT: lui a3, 5 -; CHECK-NEXT: addi a3, a3, 1365 -; CHECK-NEXT: vand.vx v24, v24, a3 -; CHECK-NEXT: vand.vx v8, v8, a3 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v24, v8 +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: lui a3, 3 ; CHECK-NEXT: addi a4, a0, -64 ; CHECK-NEXT: sltu a0, a0, a4 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a4 +; CHECK-NEXT: lui a4, 5 +; CHECK-NEXT: vor.vv v8, v8, v24 +; CHECK-NEXT: addi a2, a2, -241 +; CHECK-NEXT: addi a3, a3, 819 +; CHECK-NEXT: addi a4, a4, 1365 +; CHECK-NEXT: vsrl.vi v24, v8, 4 +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vand.vx v24, v24, a2 +; CHECK-NEXT: vsll.vi v8, v8, 4 +; CHECK-NEXT: vor.vv v8, v24, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, 
ta, ma ; CHECK-NEXT: vsrl.vi v24, v16, 8 ; CHECK-NEXT: vsll.vi v16, v16, 8 ; CHECK-NEXT: vor.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsrl.vi v24, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a3 +; CHECK-NEXT: vand.vx v24, v24, a3 +; CHECK-NEXT: vsll.vi v8, v8, 2 +; CHECK-NEXT: vor.vv v8, v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v24, v16, 4 -; CHECK-NEXT: vand.vx v24, v24, a1 -; CHECK-NEXT: vand.vx v16, v16, a1 +; CHECK-NEXT: vand.vx v16, v16, a2 +; CHECK-NEXT: vand.vx v24, v24, a2 ; CHECK-NEXT: vsll.vi v16, v16, 4 ; CHECK-NEXT: vor.vv v16, v24, v16 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsrl.vi v24, v8, 1 +; CHECK-NEXT: vand.vx v8, v8, a4 +; CHECK-NEXT: vand.vx v24, v24, a4 +; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vor.vv v8, v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v24, v16, 2 -; CHECK-NEXT: vand.vx v24, v24, a2 -; CHECK-NEXT: vand.vx v16, v16, a2 +; CHECK-NEXT: vand.vx v16, v16, a3 +; CHECK-NEXT: vand.vx v24, v24, a3 ; CHECK-NEXT: vsll.vi v16, v16, 2 ; CHECK-NEXT: vor.vv v16, v24, v16 ; CHECK-NEXT: vsrl.vi v24, v16, 1 -; CHECK-NEXT: vand.vx v24, v24, a3 -; CHECK-NEXT: vand.vx v16, v16, a3 +; CHECK-NEXT: vand.vx v16, v16, a4 +; CHECK-NEXT: vand.vx v24, v24, a4 ; CHECK-NEXT: vadd.vv v16, v16, v16 ; CHECK-NEXT: vor.vv v16, v24, v16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll index 584f8520ab62f..946ca4d1ab904 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll @@ -9,28 +9,28 @@ define void @bitreverse_v8i16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 1 +; CHECK-NEXT: addi a1, a1, -241 ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: lui a1, 1 -; CHECK-NEXT: addi a1, a1, -241 -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: lui a1, 3 +; CHECK-NEXT: addi a1, a1, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a1, 3 -; CHECK-NEXT: addi a1, a1, 819 -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: lui a1, 5 +; CHECK-NEXT: addi a1, a1, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a1, 5 -; CHECK-NEXT: addi a1, a1, 1365 -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vse16.v v8, (a0) @@ -56,36 +56,36 @@ define void @bitreverse_v4i32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a1, 16 ; CHECK-NEXT: addi a1, a1, -256 -; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsrl.vi v10, v8, 24 +; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vor.vv v9, v9, v10 ; CHECK-NEXT: vand.vx v10, v8, a1 -; CHECK-NEXT: vsll.vi v10, v10, 8 +; CHECK-NEXT: lui a1, 61681 +; CHECK-NEXT: addi a1, a1, -241 ; CHECK-NEXT: vsll.vi v8, v8, 24 +; CHECK-NEXT: vsll.vi v10, v10, 8 ; 
CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: lui a1, 61681 -; CHECK-NEXT: addi a1, a1, -241 -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: lui a1, 209715 +; CHECK-NEXT: addi a1, a1, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: lui a1, 209715 -; CHECK-NEXT: addi a1, a1, 819 -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: lui a1, 349525 +; CHECK-NEXT: addi a1, a1, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a1, 349525 -; CHECK-NEXT: addi a1, a1, 1365 -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: vse32.v v8, (a0) @@ -113,65 +113,65 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) { ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: lui a1, 1044480 -; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a2, 1044480 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 16 +; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: sw a2, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v9, v8, a1 -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v10, v8, a2 -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v10, v10, a3 -; RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vsrl.vi v10, v8, 24 -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vlse64.v v11, (a4), zero -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v10, v10, a4 -; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vand.vv v12, v12, v11 +; RV32-NEXT: addi a2, a5, -256 +; RV32-NEXT: vlse64.v v9, (a6), zero +; RV32-NEXT: vsrl.vx v10, v8, a3 +; RV32-NEXT: vsrl.vx v11, v8, a4 +; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vsll.vx v13, v8, a3 +; RV32-NEXT: vand.vx v11, v11, a2 +; RV32-NEXT: vor.vv v10, v11, v10 +; RV32-NEXT: vand.vx v11, v8, a2 +; RV32-NEXT: vsll.vx v11, v11, a4 +; RV32-NEXT: vor.vv v11, v13, v11 +; RV32-NEXT: vsrl.vi v13, v8, 8 +; RV32-NEXT: vand.vx v12, v12, a1 +; RV32-NEXT: vand.vv v13, v13, v9 +; RV32-NEXT: vor.vv v12, v13, v12 +; RV32-NEXT: lui a2, 61681 +; RV32-NEXT: lui a3, 209715 +; RV32-NEXT: lui a4, 349525 +; RV32-NEXT: addi a2, a2, -241 +; RV32-NEXT: addi a3, a3, 819 +; RV32-NEXT: addi a4, a4, 1365 ; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vsll.vx v10, v8, a1 -; RV32-NEXT: vand.vx v12, v8, a3 -; RV32-NEXT: vsll.vx v12, v12, a2 -; RV32-NEXT: vor.vv v10, v10, v12 -; RV32-NEXT: vand.vx v12, v8, a4 -; RV32-NEXT: vsll.vi v12, v12, 24 -; RV32-NEXT: vand.vv v8, v8, v11 -; RV32-NEXT: vsll.vi v8, v8, 8 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v12, a2 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vand.vv v9, v8, v9 +; RV32-NEXT: vand.vx v8, v8, a1 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v9, v9, 8 ; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: vmv.v.x v9, a3 ; RV32-NEXT: vsetivli zero, 2, e64, 
m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vor.vv v8, v11, v8 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: vmv.v.x v11, a4 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v10, v10, v12 +; RV32-NEXT: vsll.vi v8, v8, 4 +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v10, v9 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vand.vv v9, v9, v11 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vse64.v v8, (a0) @@ -184,58 +184,58 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) { ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a1, 56 -; RV64-NEXT: vsrl.vx v9, v8, a1 ; RV64-NEXT: li a2, 40 -; RV64-NEXT: vsrl.vx v10, v8, a2 ; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v10, v10, a3 -; RV64-NEXT: vor.vv v9, v10, v9 -; RV64-NEXT: vsrl.vi v10, v8, 24 ; RV64-NEXT: lui a4, 4080 -; RV64-NEXT: vand.vx v10, v10, a4 -; RV64-NEXT: vsrl.vi v11, v8, 8 ; RV64-NEXT: li a5, 255 +; RV64-NEXT: addiw a3, a3, -256 ; RV64-NEXT: slli a5, a5, 24 -; RV64-NEXT: vand.vx v11, v11, a5 -; RV64-NEXT: vor.vv v10, v11, v10 +; RV64-NEXT: vsrl.vx v9, v8, a1 +; RV64-NEXT: vsrl.vx v10, v8, a2 +; RV64-NEXT: vsrl.vi v11, v8, 24 +; RV64-NEXT: vsrl.vi v12, v8, 8 +; RV64-NEXT: vand.vx v10, v10, a3 ; RV64-NEXT: vor.vv v9, v10, v9 ; RV64-NEXT: vand.vx v10, v8, a5 +; RV64-NEXT: vand.vx v11, v11, a4 +; RV64-NEXT: vand.vx v12, v12, a5 +; RV64-NEXT: vor.vv v11, v12, v11 +; RV64-NEXT: vand.vx v12, v8, a4 ; RV64-NEXT: vsll.vi v10, v10, 8 -; RV64-NEXT: vand.vx v11, v8, a4 -; RV64-NEXT: vsll.vi v11, v11, 24 -; RV64-NEXT: vor.vv v10, v11, v10 -; RV64-NEXT: vsll.vx v11, v8, a1 +; RV64-NEXT: vsll.vi v12, v12, 24 +; RV64-NEXT: vor.vv v10, v12, v10 +; RV64-NEXT: vsll.vx v12, v8, a1 ; RV64-NEXT: vand.vx v8, v8, a3 ; RV64-NEXT: vsll.vx v8, v8, a2 -; RV64-NEXT: vor.vv v8, v11, v8 +; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 349525 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, 1365 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: vor.vv v9, v11, v9 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: lui a1, 61681 -; RV64-NEXT: addiw a1, a1, -241 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v9, v9, a1 ; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v9, v9, a1 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 2 -; 
RV64-NEXT: lui a1, 209715 -; RV64-NEXT: addiw a1, a1, 819 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v9, v9, a1 -; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v9, v9, a2 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v9, v9, a1 -; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vand.vx v9, v9, a3 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: vse64.v v8, (a0) @@ -261,28 +261,28 @@ define void @bitreverse_v16i16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 1 +; CHECK-NEXT: addi a1, a1, -241 ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vsrl.vi v10, v8, 4 -; CHECK-NEXT: lui a1, 1 -; CHECK-NEXT: addi a1, a1, -241 -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v10, v10, a1 +; CHECK-NEXT: lui a1, 3 +; CHECK-NEXT: addi a1, a1, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: lui a1, 3 -; CHECK-NEXT: addi a1, a1, 819 -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v10, v10, a1 +; CHECK-NEXT: lui a1, 5 +; CHECK-NEXT: addi a1, a1, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a1, 5 -; CHECK-NEXT: addi a1, a1, 1365 -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vse16.v v8, (a0) @@ -308,36 +308,36 @@ define void @bitreverse_v8i32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a1, 16 ; CHECK-NEXT: addi a1, a1, -256 -; CHECK-NEXT: vand.vx v10, v10, a1 +; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: vsrl.vi v12, v8, 24 +; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vor.vv v10, v10, v12 ; CHECK-NEXT: vand.vx v12, v8, a1 -; CHECK-NEXT: vsll.vi v12, v12, 8 +; CHECK-NEXT: lui a1, 61681 +; CHECK-NEXT: addi a1, a1, -241 ; CHECK-NEXT: vsll.vi v8, v8, 24 +; CHECK-NEXT: vsll.vi v12, v12, 8 ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vsrl.vi v10, v8, 4 -; CHECK-NEXT: lui a1, 61681 -; CHECK-NEXT: addi a1, a1, -241 -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v10, v10, a1 +; CHECK-NEXT: lui a1, 209715 +; CHECK-NEXT: addi a1, a1, 819 ; CHECK-NEXT: vsll.vi v8, v8, 4 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: lui a1, 209715 -; CHECK-NEXT: addi a1, a1, 819 -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v10, v10, a1 +; CHECK-NEXT: lui a1, 349525 +; CHECK-NEXT: addi a1, a1, 1365 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a1, 349525 -; CHECK-NEXT: addi a1, a1, 1365 -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, 
v10, v8 ; CHECK-NEXT: vse32.v v8, (a0) @@ -365,65 +365,65 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) { ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: lui a1, 1044480 -; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a2, 1044480 +; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 16 +; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: sw a2, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v10, v8, a1 -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v12, v8, a2 -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v12, v12, a3 -; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vsrl.vi v12, v8, 24 -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vlse64.v v14, (a4), zero -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v12, v12, a4 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v16, v16, v14 -; RV32-NEXT: vor.vv v12, v16, v12 -; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vsll.vx v12, v8, a1 -; RV32-NEXT: vand.vx v16, v8, a3 -; RV32-NEXT: vsll.vx v16, v16, a2 -; RV32-NEXT: vor.vv v12, v12, v16 -; RV32-NEXT: vand.vx v16, v8, a4 -; RV32-NEXT: vsll.vi v16, v16, 24 -; RV32-NEXT: vand.vv v8, v8, v14 -; RV32-NEXT: vsll.vi v8, v8, 8 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: addi a2, a5, -256 +; RV32-NEXT: vlse64.v v10, (a6), zero +; RV32-NEXT: vsrl.vx v12, v8, a3 +; RV32-NEXT: vsrl.vx v14, v8, a4 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vsll.vx v18, v8, a3 +; RV32-NEXT: vand.vx v14, v14, a2 +; RV32-NEXT: vor.vv v14, v14, v12 +; RV32-NEXT: vand.vx v12, v8, a2 +; RV32-NEXT: vsll.vx v12, v12, a4 +; RV32-NEXT: vor.vv v12, v18, v12 +; RV32-NEXT: vsrl.vi v18, v8, 8 +; RV32-NEXT: vand.vx v16, v16, a1 +; RV32-NEXT: vand.vv v18, v18, v10 +; RV32-NEXT: vor.vv v16, v18, v16 +; RV32-NEXT: lui a2, 61681 +; RV32-NEXT: lui a3, 209715 +; RV32-NEXT: lui a4, 349525 +; RV32-NEXT: addi a2, a2, -241 +; RV32-NEXT: addi a3, a3, 819 +; RV32-NEXT: addi a4, a4, 1365 +; RV32-NEXT: vor.vv v14, v16, v14 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vand.vv v10, v8, v10 +; RV32-NEXT: vand.vx v8, v8, a1 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v10, v10, 8 ; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v10, a3 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v12, a4 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v8, v14 +; RV32-NEXT: vsrl.vi v14, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v14, v14, v16 +; RV32-NEXT: vsll.vi v8, v8, 4 +; RV32-NEXT: vor.vv v8, v14, v8 +; RV32-NEXT: vsrl.vi v14, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v14, v10 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: 
lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vse64.v v8, (a0) @@ -434,60 +434,60 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) { ; RV64-LABEL: bitreverse_v4i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vle64.v v14, (a0) ; RV64-NEXT: li a1, 56 -; RV64-NEXT: vsrl.vx v10, v8, a1 ; RV64-NEXT: li a2, 40 -; RV64-NEXT: vsrl.vx v12, v8, a2 ; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v12, v12, a3 -; RV64-NEXT: vor.vv v10, v12, v10 -; RV64-NEXT: vsrl.vi v12, v8, 24 ; RV64-NEXT: lui a4, 4080 -; RV64-NEXT: vand.vx v12, v12, a4 -; RV64-NEXT: vsrl.vi v14, v8, 8 ; RV64-NEXT: li a5, 255 +; RV64-NEXT: addiw a3, a3, -256 ; RV64-NEXT: slli a5, a5, 24 -; RV64-NEXT: vand.vx v14, v14, a5 -; RV64-NEXT: vor.vv v12, v14, v12 +; RV64-NEXT: vsrl.vx v8, v14, a1 +; RV64-NEXT: vsrl.vx v10, v14, a2 +; RV64-NEXT: vsrl.vi v12, v14, 24 +; RV64-NEXT: vsrl.vi v16, v14, 8 +; RV64-NEXT: vand.vx v10, v10, a3 +; RV64-NEXT: vor.vv v8, v10, v8 +; RV64-NEXT: vand.vx v18, v14, a5 +; RV64-NEXT: vand.vx v10, v12, a4 +; RV64-NEXT: vand.vx v12, v16, a5 ; RV64-NEXT: vor.vv v10, v12, v10 -; RV64-NEXT: vand.vx v12, v8, a5 -; RV64-NEXT: vsll.vi v12, v12, 8 -; RV64-NEXT: vand.vx v14, v8, a4 -; RV64-NEXT: vsll.vi v14, v14, 24 -; RV64-NEXT: vor.vv v12, v14, v12 -; RV64-NEXT: vsll.vx v14, v8, a1 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vsll.vx v8, v8, a2 -; RV64-NEXT: vor.vv v8, v14, v8 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vand.vx v12, v14, a4 +; RV64-NEXT: vsll.vi v16, v18, 8 +; RV64-NEXT: vsll.vi v12, v12, 24 +; RV64-NEXT: vor.vv v12, v12, v16 +; RV64-NEXT: vsll.vx v16, v14, a1 +; RV64-NEXT: vand.vx v14, v14, a3 +; RV64-NEXT: vsll.vx v14, v14, a2 +; RV64-NEXT: vor.vv v14, v16, v14 ; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 349525 ; RV64-NEXT: addiw a1, a1, -241 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v10, v10, a1 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, 1365 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: vor.vv v8, v10, v8 +; RV64-NEXT: vor.vv v10, v14, v12 +; RV64-NEXT: vor.vv v8, v10, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v10, v10, a1 ; RV64-NEXT: vsll.vi v8, v8, 4 ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 2 -; RV64-NEXT: lui a1, 209715 -; RV64-NEXT: addiw a1, a1, 819 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v10, v10, a1 -; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vx v10, v10, a2 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v10, v10, a1 -; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vand.vx v10, v10, a3 ; RV64-NEXT: vadd.vv v8, v8, v8 ; 
RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: vse64.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll index 8bfdf9b6884a2..d765e4c0b8f6a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll @@ -136,9 +136,9 @@ define <2 x i32> @vp_bswap_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsll.vi v10, v10, 8 @@ -178,9 +178,9 @@ define <4 x i32> @vp_bswap_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: vor.vv v9, v9, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsll.vi v10, v10, 8 @@ -220,9 +220,9 @@ define <8 x i32> @vp_bswap_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: vor.vv v10, v10, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsll.vi v12, v12, 8 @@ -262,9 +262,9 @@ define <16 x i32> @vp_bswap_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 8 ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: vor.vv v12, v12, v16 ; CHECK-NEXT: vand.vx v16, v8, a0 ; CHECK-NEXT: vsll.vi v16, v16, 8 @@ -284,38 +284,38 @@ define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsll.vx v9, v8, a1, v0.t -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v10, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v10, v10, a3, v0.t -; RV32-NEXT: vor.vv v9, v9, v10, v0.t -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vsll.vx v9, v8, a2, v0.t +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vand.vx v10, v8, a1, v0.t ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vlse64.v v10, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v11, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vx v11, v8, a4, v0.t -; RV32-NEXT: vsll.vi v11, v11, 24, v0.t -; RV32-NEXT: vand.vv v12, v8, v10, v0.t +; RV32-NEXT: vsll.vx v10, v10, a4, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a5, v0.t +; RV32-NEXT: vsll.vi v10, v10, 24, v0.t +; RV32-NEXT: vand.vv v12, v8, v11, v0.t ; RV32-NEXT: vsll.vi v12, v12, 8, v0.t -; RV32-NEXT: vor.vv v11, v11, v12, v0.t -; 
RV32-NEXT: vor.vv v9, v9, v11, v0.t -; RV32-NEXT: vsrl.vx v11, v8, a1, v0.t -; RV32-NEXT: vsrl.vx v12, v8, a3, v0.t -; RV32-NEXT: vand.vx v12, v12, a2, v0.t -; RV32-NEXT: vor.vv v11, v12, v11, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t +; RV32-NEXT: vsrl.vx v10, v8, a2, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a4, v0.t +; RV32-NEXT: vand.vx v12, v12, a1, v0.t +; RV32-NEXT: vor.vv v10, v12, v10, v0.t ; RV32-NEXT: vsrl.vi v12, v8, 24, v0.t -; RV32-NEXT: vand.vx v12, v12, a4, v0.t +; RV32-NEXT: vand.vx v12, v12, a5, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v8, v8, v11, v0.t ; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vor.vv v8, v8, v11, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t ; RV32-NEXT: vor.vv v8, v9, v8, v0.t ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -324,31 +324,31 @@ define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV64-LABEL: vp_bswap_v2i64: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vand.vx v9, v8, a1, v0.t +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: addiw a0, a4, -256 ; RV64-NEXT: vsll.vi v9, v9, 24, v0.t -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vand.vx v10, v8, a2, v0.t ; RV64-NEXT: vsll.vi v10, v10, 8, v0.t ; RV64-NEXT: vor.vv v9, v9, v10, v0.t -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v10, v8, a2, v0.t -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v11, v8, a3, v0.t -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v11, v11, a4, v0.t +; RV64-NEXT: vsll.vx v10, v8, a3, v0.t +; RV64-NEXT: vand.vx v11, v8, a0, v0.t +; RV64-NEXT: vsll.vx v11, v11, a5, v0.t ; RV64-NEXT: vor.vv v10, v10, v11, v0.t ; RV64-NEXT: vor.vv v9, v10, v9, v0.t -; RV64-NEXT: vsrl.vx v10, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v11, v8, a4, v0.t -; RV64-NEXT: vand.vx v11, v11, a3, v0.t +; RV64-NEXT: vsrl.vx v10, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v11, v8, a5, v0.t +; RV64-NEXT: vand.vx v11, v11, a0, v0.t ; RV64-NEXT: vor.vv v10, v11, v10, v0.t ; RV64-NEXT: vsrl.vi v11, v8, 24, v0.t ; RV64-NEXT: vand.vx v11, v11, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vor.vv v8, v8, v11, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vor.vv v8, v9, v8, v0.t @@ -363,39 +363,39 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsll.vx v9, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v10, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v10, v10, a3 -; RV32-NEXT: vor.vv v9, v9, v10 -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vsll.vx v10, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v11, v8, a2 +; RV32-NEXT: vsrl.vx v12, v8, a4 +; RV32-NEXT: vand.vx v13, v8, a1 +; RV32-NEXT: vand.vx v12, 
v12, a1 +; RV32-NEXT: vor.vv v11, v12, v11 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vlse64.v v10, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v12, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vx v11, v8, a4 -; RV32-NEXT: vsll.vi v11, v11, 24 -; RV32-NEXT: vand.vv v12, v8, v10 +; RV32-NEXT: vsll.vx v13, v13, a4 +; RV32-NEXT: vor.vv v10, v10, v13 +; RV32-NEXT: vsrl.vi v13, v8, 8 +; RV32-NEXT: vand.vx v9, v9, a5 +; RV32-NEXT: vand.vv v13, v13, v12 +; RV32-NEXT: vor.vv v9, v13, v9 +; RV32-NEXT: vand.vv v12, v8, v12 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vor.vv v11, v11, v12 -; RV32-NEXT: vor.vv v9, v9, v11 -; RV32-NEXT: vsrl.vx v11, v8, a1 -; RV32-NEXT: vsrl.vx v12, v8, a3 -; RV32-NEXT: vand.vx v12, v12, a2 -; RV32-NEXT: vor.vv v11, v12, v11 -; RV32-NEXT: vsrl.vi v12, v8, 24 -; RV32-NEXT: vand.vx v12, v12, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v11 -; RV32-NEXT: vor.vv v8, v9, v8 +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vor.vv v9, v9, v11 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -403,34 +403,34 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV64-LABEL: vp_bswap_v2i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vand.vx v9, v8, a1 -; RV64-NEXT: vsll.vi v9, v9, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsll.vi v10, v10, 8 -; RV64-NEXT: vor.vv v9, v9, v10 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v10, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v11, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v11, v11, a4 -; RV64-NEXT: vor.vv v10, v10, v11 +; RV64-NEXT: vsrl.vi v9, v8, 24 +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v11, v8, a3 +; RV64-NEXT: vsrl.vx v12, v8, a5 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vor.vv v11, v12, v11 +; RV64-NEXT: vand.vx v12, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vsll.vi v12, v12, 24 +; RV64-NEXT: vand.vx v10, v10, a2 ; RV64-NEXT: vor.vv v9, v10, v9 -; RV64-NEXT: vsrl.vx v10, v8, a2 -; RV64-NEXT: vsrl.vx v11, v8, a4 -; RV64-NEXT: vand.vx v11, v11, a3 -; RV64-NEXT: vor.vv v10, v11, v10 -; RV64-NEXT: vsrl.vi v11, v8, 24 -; RV64-NEXT: vand.vx v11, v11, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v10, v8, a2 +; RV64-NEXT: vsll.vi v10, v10, 8 +; RV64-NEXT: vor.vv v10, v12, v10 +; RV64-NEXT: vsll.vx v12, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v11 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v12, v8 ; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vor.vv v8, v9, v8 +; RV64-NEXT: vor.vv v9, v9, v11 +; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: ret %v = call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x i64> %v @@ -444,38 +444,38 @@ define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: 
li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsll.vx v10, v8, a1, v0.t -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v12, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v12, v12, a3, v0.t -; RV32-NEXT: vor.vv v10, v10, v12, v0.t -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vsll.vx v10, v8, a2, v0.t +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vand.vx v12, v8, a1, v0.t ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vlse64.v v12, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v14, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vx v14, v8, a4, v0.t -; RV32-NEXT: vsll.vi v14, v14, 24, v0.t -; RV32-NEXT: vand.vv v16, v8, v12, v0.t +; RV32-NEXT: vsll.vx v12, v12, a4, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: vand.vx v12, v8, a5, v0.t +; RV32-NEXT: vsll.vi v12, v12, 24, v0.t +; RV32-NEXT: vand.vv v16, v8, v14, v0.t ; RV32-NEXT: vsll.vi v16, v16, 8, v0.t -; RV32-NEXT: vor.vv v14, v14, v16, v0.t -; RV32-NEXT: vor.vv v10, v10, v14, v0.t -; RV32-NEXT: vsrl.vx v14, v8, a1, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t -; RV32-NEXT: vand.vx v16, v16, a2, v0.t -; RV32-NEXT: vor.vv v14, v16, v14, v0.t +; RV32-NEXT: vor.vv v12, v12, v16, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a2, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a4, v0.t +; RV32-NEXT: vand.vx v16, v16, a1, v0.t +; RV32-NEXT: vor.vv v12, v16, v12, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 24, v0.t -; RV32-NEXT: vand.vx v16, v16, a4, v0.t +; RV32-NEXT: vand.vx v16, v16, a5, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v8, v8, v14, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vor.vv v8, v8, v14, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t ; RV32-NEXT: vor.vv v8, v10, v8, v0.t ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -484,31 +484,31 @@ define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV64-LABEL: vp_bswap_v4i64: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vand.vx v10, v8, a1, v0.t +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: addiw a0, a4, -256 ; RV64-NEXT: vsll.vi v10, v10, 24, v0.t -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vand.vx v12, v8, a2, v0.t ; RV64-NEXT: vsll.vi v12, v12, 8, v0.t ; RV64-NEXT: vor.vv v10, v10, v12, v0.t -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v12, v8, a2, v0.t -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v14, v8, a3, v0.t -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v14, v14, a4, v0.t +; RV64-NEXT: vsll.vx v12, v8, a3, v0.t +; RV64-NEXT: vand.vx v14, v8, a0, v0.t +; RV64-NEXT: vsll.vx v14, v14, a5, v0.t ; RV64-NEXT: vor.vv v12, v12, v14, v0.t ; RV64-NEXT: vor.vv v10, v12, v10, v0.t -; RV64-NEXT: vsrl.vx v12, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v14, v8, a4, v0.t -; RV64-NEXT: vand.vx v14, v14, a3, v0.t +; RV64-NEXT: vsrl.vx v12, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v14, v8, a5, v0.t +; RV64-NEXT: vand.vx v14, v14, a0, v0.t ; RV64-NEXT: vor.vv v12, v14, v12, v0.t ; RV64-NEXT: vsrl.vi v14, v8, 24, v0.t ; RV64-NEXT: vand.vx v14, v14, a1, v0.t ; 
RV64-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vor.vv v8, v8, v14, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: vor.vv v8, v10, v8, v0.t @@ -523,39 +523,39 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v10, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsll.vx v10, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v12, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v12, v12, a3 -; RV32-NEXT: vor.vv v10, v10, v12 -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vsll.vx v12, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v14, v8, a2 +; RV32-NEXT: vsrl.vx v16, v8, a4 +; RV32-NEXT: vand.vx v18, v8, a1 +; RV32-NEXT: vand.vx v16, v16, a1 +; RV32-NEXT: vor.vv v14, v16, v14 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vlse64.v v12, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v16, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vx v14, v8, a4 -; RV32-NEXT: vsll.vi v14, v14, 24 -; RV32-NEXT: vand.vv v16, v8, v12 +; RV32-NEXT: vsll.vx v18, v18, a4 +; RV32-NEXT: vor.vv v12, v12, v18 +; RV32-NEXT: vsrl.vi v18, v8, 8 +; RV32-NEXT: vand.vx v10, v10, a5 +; RV32-NEXT: vand.vv v18, v18, v16 +; RV32-NEXT: vor.vv v10, v18, v10 +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vor.vv v14, v14, v16 -; RV32-NEXT: vor.vv v10, v10, v14 -; RV32-NEXT: vsrl.vx v14, v8, a1 -; RV32-NEXT: vsrl.vx v16, v8, a3 -; RV32-NEXT: vand.vx v16, v16, a2 -; RV32-NEXT: vor.vv v14, v16, v14 -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vor.vv v8, v8, v14 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vor.vv v10, v10, v14 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -563,34 +563,34 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV64-LABEL: vp_bswap_v4i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vand.vx v10, v8, a1 -; RV64-NEXT: vsll.vi v10, v10, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsll.vi v12, v12, 8 -; RV64-NEXT: vor.vv v10, v10, v12 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v12, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v14, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v14, v14, a4 -; RV64-NEXT: vor.vv v12, v12, v14 +; RV64-NEXT: vsrl.vi v10, v8, 24 +; RV64-NEXT: vsrl.vi v12, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v14, v8, a3 +; RV64-NEXT: vsrl.vx v16, v8, a5 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vor.vv v14, v16, v14 +; 
RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v10, v10, a1 +; RV64-NEXT: vsll.vi v16, v16, 24 +; RV64-NEXT: vand.vx v12, v12, a2 ; RV64-NEXT: vor.vv v10, v12, v10 -; RV64-NEXT: vsrl.vx v12, v8, a2 -; RV64-NEXT: vsrl.vx v14, v8, a4 -; RV64-NEXT: vand.vx v14, v14, a3 -; RV64-NEXT: vor.vv v12, v14, v12 -; RV64-NEXT: vsrl.vi v14, v8, 24 -; RV64-NEXT: vand.vx v14, v14, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v12, v8, a2 +; RV64-NEXT: vsll.vi v12, v12, 8 +; RV64-NEXT: vor.vv v12, v16, v12 +; RV64-NEXT: vsll.vx v16, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v14 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v16, v8 ; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vor.vv v8, v10, v8 +; RV64-NEXT: vor.vv v10, v10, v14 +; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: ret %v = call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x i64> %v @@ -604,34 +604,34 @@ define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsll.vx v12, v8, a1, v0.t -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v16, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v16, v16, a3, v0.t -; RV32-NEXT: vor.vv v16, v12, v16, v0.t -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vsll.vx v16, v8, a2, v0.t +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vand.vx v20, v8, a1, v0.t ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vlse64.v v12, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v12, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vx v20, v8, a4, v0.t +; RV32-NEXT: vsll.vx v20, v20, a4, v0.t +; RV32-NEXT: vor.vv v16, v16, v20, v0.t +; RV32-NEXT: vand.vx v20, v8, a5, v0.t ; RV32-NEXT: vsll.vi v20, v20, 24, v0.t ; RV32-NEXT: vand.vv v24, v8, v12, v0.t ; RV32-NEXT: vsll.vi v24, v24, 8, v0.t ; RV32-NEXT: vor.vv v20, v20, v24, v0.t ; RV32-NEXT: vor.vv v16, v16, v20, v0.t -; RV32-NEXT: vsrl.vx v20, v8, a1, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: vand.vx v24, v24, a2, v0.t +; RV32-NEXT: vsrl.vx v20, v8, a2, v0.t +; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t +; RV32-NEXT: vand.vx v24, v24, a1, v0.t ; RV32-NEXT: vor.vv v20, v24, v20, v0.t ; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t -; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vand.vx v24, v24, a5, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t ; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: vor.vv v8, v8, v24, v0.t @@ -644,31 +644,31 @@ define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV64-LABEL: vp_bswap_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vand.vx v12, v8, a1, v0.t +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: addiw a0, a4, -256 ; RV64-NEXT: vsll.vi v12, v12, 24, v0.t -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a2, v0.t ; RV64-NEXT: vsll.vi v16, v16, 8, v0.t ; RV64-NEXT: vor.vv v12, v12, v16, 
v0.t -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v16, v8, a2, v0.t -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v20, v8, a3, v0.t -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v20, v20, a4, v0.t +; RV64-NEXT: vsll.vx v16, v8, a3, v0.t +; RV64-NEXT: vand.vx v20, v8, a0, v0.t +; RV64-NEXT: vsll.vx v20, v20, a5, v0.t ; RV64-NEXT: vor.vv v16, v16, v20, v0.t ; RV64-NEXT: vor.vv v12, v16, v12, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v20, v8, a4, v0.t -; RV64-NEXT: vand.vx v20, v20, a3, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v20, v8, a5, v0.t +; RV64-NEXT: vand.vx v20, v20, a0, v0.t ; RV64-NEXT: vor.vv v16, v20, v16, v0.t ; RV64-NEXT: vsrl.vi v20, v8, 24, v0.t ; RV64-NEXT: vand.vx v20, v20, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vor.vv v8, v8, v20, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vor.vv v8, v12, v8, v0.t @@ -683,39 +683,39 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v12, v8, 24 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsll.vx v12, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v16, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v16, v16, a3 -; RV32-NEXT: vor.vv v12, v12, v16 -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vsll.vx v16, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v20, v8, a2 +; RV32-NEXT: vsrl.vx v24, v8, a4 +; RV32-NEXT: vand.vx v28, v8, a1 +; RV32-NEXT: vand.vx v24, v24, a1 +; RV32-NEXT: vor.vv v20, v24, v20 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v24, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vx v20, v8, a4 -; RV32-NEXT: vsll.vi v20, v20, 24 -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsll.vx v28, v28, a4 +; RV32-NEXT: vor.vv v16, v16, v28 +; RV32-NEXT: vsrl.vi v28, v8, 8 +; RV32-NEXT: vand.vx v12, v12, a5 +; RV32-NEXT: vand.vv v28, v28, v24 +; RV32-NEXT: vor.vv v12, v28, v12 +; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v20, v20, v24 -; RV32-NEXT: vor.vv v12, v12, v20 -; RV32-NEXT: vsrl.vx v20, v8, a1 -; RV32-NEXT: vsrl.vx v24, v8, a3 -; RV32-NEXT: vand.vx v24, v24, a2 -; RV32-NEXT: vor.vv v20, v24, v20 -; RV32-NEXT: vsrl.vi v24, v8, 24 -; RV32-NEXT: vand.vx v24, v24, a4 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v8, v20 -; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v12, v12, v20 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -723,34 +723,34 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV64-LABEL: vp_bswap_v8i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: 
lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vand.vx v12, v8, a1 -; RV64-NEXT: vsll.vi v12, v12, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsll.vi v16, v16, 8 -; RV64-NEXT: vor.vv v12, v12, v16 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v16, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v20, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v20, v20, a4 -; RV64-NEXT: vor.vv v16, v16, v20 +; RV64-NEXT: vsrl.vi v12, v8, 24 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v20, v8, a3 +; RV64-NEXT: vsrl.vx v24, v8, a5 +; RV64-NEXT: vand.vx v24, v24, a0 +; RV64-NEXT: vor.vv v20, v24, v20 +; RV64-NEXT: vand.vx v24, v8, a1 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v12, v12, a1 +; RV64-NEXT: vsll.vi v24, v24, 24 +; RV64-NEXT: vand.vx v16, v16, a2 ; RV64-NEXT: vor.vv v12, v16, v12 -; RV64-NEXT: vsrl.vx v16, v8, a2 -; RV64-NEXT: vsrl.vx v20, v8, a4 -; RV64-NEXT: vand.vx v20, v20, a3 -; RV64-NEXT: vor.vv v16, v20, v16 -; RV64-NEXT: vsrl.vi v20, v8, 24 -; RV64-NEXT: vand.vx v20, v20, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vsll.vx v24, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v20 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v24, v8 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: vor.vv v12, v12, v20 +; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: ret %v = call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x i64> %v @@ -769,33 +769,33 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a1, v0.t -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3, v0.t +; RV32-NEXT: vsll.vx v16, v8, a2, v0.t +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vand.vx v24, v8, a1, v0.t +; RV32-NEXT: vsll.vx v24, v24, a4, v0.t ; RV32-NEXT: vor.vv v16, v16, v24, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size 
Folded Spill +; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v24, v8, a4, v0.t +; RV32-NEXT: vand.vx v24, v8, a3, v0.t ; RV32-NEXT: vsll.vi v24, v24, 24, v0.t ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill @@ -814,14 +814,14 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: vand.vx v24, v24, a2, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t +; RV32-NEXT: vand.vx v24, v24, a1, v0.t ; RV32-NEXT: vor.vv v16, v24, v16, v0.t ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t -; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vand.vx v24, v24, a3, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 @@ -857,36 +857,35 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev ; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vand.vx v16, v8, a1, v0.t +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: addiw a0, a4, -256 ; RV64-NEXT: vsll.vi v16, v16, 24, v0.t -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0, v0.t +; RV64-NEXT: vand.vx v24, v8, a2, v0.t ; RV64-NEXT: vsll.vi v24, v24, 8, v0.t ; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: addi a2, sp, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2, v0.t -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t -; RV64-NEXT: vsll.vx v16, v16, a4, v0.t +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vx v24, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vsll.vx v16, v16, a5, v0.t ; RV64-NEXT: vor.vv v16, v24, v16, v0.t -; RV64-NEXT: addi a5, sp, 16 -; RV64-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload ; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; RV64-NEXT: vsrl.vx v24, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t -; RV64-NEXT: vand.vx v16, v16, a3, v0.t +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vx v24, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a5, v0.t +; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vor.vv v24, v16, v24, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 24, v0.t ; RV64-NEXT: vand.vx v16, v16, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vor.vv v8, v8, v24, v0.t ; RV64-NEXT: addi a0, sp, 16 @@ -909,51 +908,59 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: 
slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3 -; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vsll.vx v24, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: vsrl.vx v0, v8, a4 +; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a1 +; RV32-NEXT: vsll.vx v0, v0, a4 +; RV32-NEXT: vor.vv v16, v24, v0 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v0, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a5 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v0, v8, a3 -; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -962,35 +969,51 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_bswap_v15i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; 
RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsll.vi v16, v16, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0 -; RV64-NEXT: vsll.vi v24, v24, 8 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v0, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v0, v0, a4 -; RV64-NEXT: vor.vv v24, v24, v0 -; RV64-NEXT: vor.vv v16, v24, v16 -; RV64-NEXT: vsrl.vx v24, v8, a2 -; RV64-NEXT: vsrl.vx v0, v8, a4 -; RV64-NEXT: vand.vx v0, v0, a3 +; RV64-NEXT: vsrl.vi v24, v8, 24 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v16, v8, a3 +; RV64-NEXT: vsrl.vx v0, v8, a5 +; RV64-NEXT: vand.vx v0, v0, a0 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v0, v8, 8 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vand.vx v0, v0, a2 ; RV64-NEXT: vor.vv v24, v0, v24 -; RV64-NEXT: vsrl.vi v0, v8, 24 -; RV64-NEXT: vand.vx v0, v0, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v0, v8, a1 +; RV64-NEXT: vsll.vi v0, v0, 24 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: vsll.vx v0, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v0 -; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v0, v8 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %v = call <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x i64> %v @@ -1009,33 +1032,33 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a1, v0.t -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3, v0.t +; RV32-NEXT: vsll.vx v16, v8, a2, v0.t +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vand.vx v24, v8, a1, v0.t +; RV32-NEXT: vsll.vx v24, v24, a4, v0.t ; RV32-NEXT: vor.vv v16, v16, v24, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 8 +; 
RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v24, v8, a4, v0.t +; RV32-NEXT: vand.vx v24, v8, a3, v0.t ; RV32-NEXT: vsll.vi v24, v24, 24, v0.t ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill @@ -1054,14 +1077,14 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: vand.vx v24, v24, a2, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t +; RV32-NEXT: vand.vx v24, v24, a1, v0.t ; RV32-NEXT: vor.vv v16, v24, v16, v0.t ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t -; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vand.vx v24, v24, a3, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 @@ -1097,36 +1120,35 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev ; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vand.vx v16, v8, a1, v0.t +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: addiw a0, a4, -256 ; RV64-NEXT: vsll.vi v16, v16, 24, v0.t -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0, v0.t +; RV64-NEXT: vand.vx v24, v8, a2, v0.t ; RV64-NEXT: vsll.vi v24, v24, 8, v0.t ; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: addi a2, sp, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2, v0.t -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t -; RV64-NEXT: vsll.vx v16, v16, a4, v0.t +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vx v24, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vsll.vx v16, v16, a5, v0.t ; RV64-NEXT: vor.vv v16, v24, v16, v0.t -; RV64-NEXT: addi a5, sp, 16 -; RV64-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload ; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; RV64-NEXT: vsrl.vx v24, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t -; RV64-NEXT: vand.vx v16, v16, a3, v0.t +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill 
+; RV64-NEXT: vsrl.vx v24, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a5, v0.t +; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vor.vv v24, v16, v24, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 24, v0.t ; RV64-NEXT: vand.vx v16, v16, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vor.vv v8, v8, v24, v0.t ; RV64-NEXT: addi a0, sp, 16 @@ -1149,51 +1171,59 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2 -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3 -; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vsll.vx v24, v8, a2 +; RV32-NEXT: addi a1, a3, -256 +; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: vsrl.vx v0, v8, a4 +; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a1 +; RV32-NEXT: vsll.vx v0, v0, a4 +; RV32-NEXT: vor.vv v16, v24, v0 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a4), zero -; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vlse64.v v0, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a5 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v0, v8, a3 -; RV32-NEXT: vand.vx v0, v0, a2 -; RV32-NEXT: vsrl.vx v24, v8, a1 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: 
vor.vv v8, v24, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -1202,35 +1232,51 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_bswap_v16i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: lui a4, 16 +; RV64-NEXT: li a5, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsll.vi v16, v16, 24 -; RV64-NEXT: li a0, 255 -; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: vand.vx v24, v8, a0 -; RV64-NEXT: vsll.vi v24, v24, 8 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: li a2, 56 -; RV64-NEXT: vsll.vx v24, v8, a2 -; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v0, v8, a3 -; RV64-NEXT: li a4, 40 -; RV64-NEXT: vsll.vx v0, v0, a4 -; RV64-NEXT: vor.vv v24, v24, v0 -; RV64-NEXT: vor.vv v16, v24, v16 -; RV64-NEXT: vsrl.vx v24, v8, a2 -; RV64-NEXT: vsrl.vx v0, v8, a4 -; RV64-NEXT: vand.vx v0, v0, a3 +; RV64-NEXT: vsrl.vi v24, v8, 24 +; RV64-NEXT: addiw a0, a4, -256 +; RV64-NEXT: vsrl.vx v16, v8, a3 +; RV64-NEXT: vsrl.vx v0, v8, a5 +; RV64-NEXT: vand.vx v0, v0, a0 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v0, v8, 8 +; RV64-NEXT: slli a2, a2, 24 +; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vand.vx v0, v0, a2 ; RV64-NEXT: vor.vv v24, v0, v24 -; RV64-NEXT: vsrl.vi v0, v8, 24 -; RV64-NEXT: vand.vx v0, v0, a1 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v0, v8, a1 +; RV64-NEXT: vsll.vi v0, v0, 24 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v16, v0, v16 +; RV64-NEXT: vsll.vx v0, v8, a3 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v0 -; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: vsll.vx v8, v8, a5 +; RV64-NEXT: vor.vv v8, v0, v8 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x i64> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll index 1dff8aed06054..5e491f21e6213 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -35,15 +35,15 @@ define void @bswap_v4i32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsrl.vi v9, 
v8, 8 ; CHECK-NEXT: lui a1, 16 ; CHECK-NEXT: addi a1, a1, -256 -; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsrl.vi v10, v8, 24 +; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: vor.vv v9, v9, v10 ; CHECK-NEXT: vand.vx v10, v8, a1 -; CHECK-NEXT: vsll.vi v10, v10, 8 ; CHECK-NEXT: vsll.vi v8, v8, 24 +; CHECK-NEXT: vsll.vi v10, v10, 8 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vse32.v v8, (a0) @@ -72,36 +72,36 @@ define void @bswap_v2i64(ptr %x, ptr %y) { ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: lui a4, 16 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v9, v8, a1 -; RV32-NEXT: li a2, 40 +; RV32-NEXT: addi a1, a4, -256 +; RV32-NEXT: vlse64.v v9, (a6), zero ; RV32-NEXT: vsrl.vx v10, v8, a2 -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v10, v10, a3 -; RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vsrl.vi v10, v8, 24 -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vlse64.v v11, (a4), zero -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v10, v10, a4 -; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vand.vv v12, v12, v11 +; RV32-NEXT: vsrl.vx v11, v8, a3 +; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vsll.vx v13, v8, a2 +; RV32-NEXT: vand.vx v11, v11, a1 +; RV32-NEXT: vor.vv v10, v11, v10 +; RV32-NEXT: vand.vx v11, v8, a1 +; RV32-NEXT: vsll.vx v11, v11, a3 +; RV32-NEXT: vor.vv v11, v13, v11 +; RV32-NEXT: vsrl.vi v13, v8, 8 +; RV32-NEXT: vand.vx v12, v12, a5 +; RV32-NEXT: vand.vv v13, v13, v9 +; RV32-NEXT: vor.vv v12, v13, v12 +; RV32-NEXT: vand.vv v9, v8, v9 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v9, v9, 8 ; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vsll.vx v10, v8, a1 -; RV32-NEXT: vand.vx v12, v8, a3 -; RV32-NEXT: vsll.vx v12, v12, a2 -; RV32-NEXT: vor.vv v10, v10, v12 -; RV32-NEXT: vand.vx v12, v8, a4 -; RV32-NEXT: vsll.vi v12, v12, 24 -; RV32-NEXT: vand.vv v8, v8, v11 -; RV32-NEXT: vsll.vi v8, v8, 8 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v11, v8 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -112,31 +112,31 @@ define void @bswap_v2i64(ptr %x, ptr %y) { ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a1, 56 -; RV64-NEXT: vsrl.vx v9, v8, a1 ; RV64-NEXT: li a2, 40 -; RV64-NEXT: vsrl.vx v10, v8, a2 ; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; RV64-NEXT: vand.vx v10, v10, a3 -; RV64-NEXT: vor.vv v9, v10, v9 -; RV64-NEXT: vsrl.vi v10, v8, 24 ; RV64-NEXT: lui a4, 4080 -; RV64-NEXT: vand.vx v10, v10, a4 -; RV64-NEXT: vsrl.vi v11, v8, 8 ; RV64-NEXT: li a5, 255 +; RV64-NEXT: addiw a3, a3, -256 ; RV64-NEXT: slli a5, a5, 24 -; RV64-NEXT: vand.vx v11, v11, a5 -; RV64-NEXT: vor.vv v10, v11, v10 +; RV64-NEXT: vsrl.vx v9, v8, a1 +; RV64-NEXT: vsrl.vx v10, v8, a2 +; RV64-NEXT: vsrl.vi v11, v8, 24 +; RV64-NEXT: vsrl.vi v12, v8, 8 +; RV64-NEXT: vand.vx v10, v10, a3 ; RV64-NEXT: vor.vv v9, v10, v9 ; RV64-NEXT: vand.vx v10, v8, a5 +; RV64-NEXT: vand.vx v11, v11, a4 +; RV64-NEXT: vand.vx v12, v12, a5 +; RV64-NEXT: vor.vv v11, v12, v11 +; RV64-NEXT: vand.vx v12, v8, a4 ; RV64-NEXT: 
vsll.vi v10, v10, 8 -; RV64-NEXT: vand.vx v11, v8, a4 -; RV64-NEXT: vsll.vi v11, v11, 24 -; RV64-NEXT: vor.vv v10, v11, v10 -; RV64-NEXT: vsll.vx v11, v8, a1 +; RV64-NEXT: vsll.vi v12, v12, 24 +; RV64-NEXT: vor.vv v10, v12, v10 +; RV64-NEXT: vsll.vx v12, v8, a1 ; RV64-NEXT: vand.vx v8, v8, a3 ; RV64-NEXT: vsll.vx v8, v8, a2 -; RV64-NEXT: vor.vv v8, v11, v8 +; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: vor.vv v9, v11, v9 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vse64.v v8, (a0) @@ -188,15 +188,15 @@ define void @bswap_v8i32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a1, 16 ; CHECK-NEXT: addi a1, a1, -256 -; CHECK-NEXT: vand.vx v10, v10, a1 +; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: vsrl.vi v12, v8, 24 +; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: vor.vv v10, v10, v12 ; CHECK-NEXT: vand.vx v12, v8, a1 -; CHECK-NEXT: vsll.vi v12, v12, 8 ; CHECK-NEXT: vsll.vi v8, v8, 24 +; CHECK-NEXT: vsll.vi v12, v12, 8 ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vse32.v v8, (a0) @@ -225,36 +225,36 @@ define void @bswap_v4i64(ptr %x, ptr %y) { ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: lui a4, 16 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v10, v8, a1 -; RV32-NEXT: li a2, 40 +; RV32-NEXT: addi a1, a4, -256 +; RV32-NEXT: vlse64.v v10, (a6), zero ; RV32-NEXT: vsrl.vx v12, v8, a2 -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v12, v12, a3 -; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vsrl.vi v12, v8, 24 -; RV32-NEXT: addi a4, sp, 8 -; RV32-NEXT: vlse64.v v14, (a4), zero -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v12, v12, a4 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v16, v16, v14 +; RV32-NEXT: vsrl.vx v14, v8, a3 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vsll.vx v18, v8, a2 +; RV32-NEXT: vand.vx v14, v14, a1 +; RV32-NEXT: vor.vv v12, v14, v12 +; RV32-NEXT: vand.vx v14, v8, a1 +; RV32-NEXT: vsll.vx v14, v14, a3 +; RV32-NEXT: vor.vv v14, v18, v14 +; RV32-NEXT: vsrl.vi v18, v8, 8 +; RV32-NEXT: vand.vx v16, v16, a5 +; RV32-NEXT: vand.vv v18, v18, v10 +; RV32-NEXT: vor.vv v16, v18, v16 +; RV32-NEXT: vand.vv v10, v8, v10 +; RV32-NEXT: vand.vx v8, v8, a5 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v10, v10, 8 ; RV32-NEXT: vor.vv v12, v16, v12 -; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vsll.vx v12, v8, a1 -; RV32-NEXT: vand.vx v16, v8, a3 -; RV32-NEXT: vsll.vx v16, v16, a2 -; RV32-NEXT: vor.vv v12, v12, v16 -; RV32-NEXT: vand.vx v16, v8, a4 -; RV32-NEXT: vsll.vi v16, v16, 24 -; RV32-NEXT: vand.vv v8, v8, v14 -; RV32-NEXT: vsll.vi v8, v8, 8 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v14, v8 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -265,31 +265,31 @@ define void @bswap_v4i64(ptr %x, ptr %y) { ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a1, 56 -; RV64-NEXT: vsrl.vx v10, v8, a1 ; RV64-NEXT: li a2, 40 -; RV64-NEXT: vsrl.vx v12, v8, a2 ; RV64-NEXT: lui a3, 16 -; RV64-NEXT: addiw a3, a3, -256 -; 
RV64-NEXT: vand.vx v12, v12, a3 -; RV64-NEXT: vor.vv v10, v12, v10 -; RV64-NEXT: vsrl.vi v12, v8, 24 ; RV64-NEXT: lui a4, 4080 -; RV64-NEXT: vand.vx v12, v12, a4 -; RV64-NEXT: vsrl.vi v14, v8, 8 ; RV64-NEXT: li a5, 255 +; RV64-NEXT: addiw a3, a3, -256 ; RV64-NEXT: slli a5, a5, 24 -; RV64-NEXT: vand.vx v14, v14, a5 -; RV64-NEXT: vor.vv v12, v14, v12 +; RV64-NEXT: vsrl.vx v10, v8, a1 +; RV64-NEXT: vsrl.vx v12, v8, a2 +; RV64-NEXT: vsrl.vi v14, v8, 24 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: vand.vx v12, v12, a3 ; RV64-NEXT: vor.vv v10, v12, v10 ; RV64-NEXT: vand.vx v12, v8, a5 +; RV64-NEXT: vand.vx v14, v14, a4 +; RV64-NEXT: vand.vx v16, v16, a5 +; RV64-NEXT: vor.vv v14, v16, v14 +; RV64-NEXT: vand.vx v16, v8, a4 ; RV64-NEXT: vsll.vi v12, v12, 8 -; RV64-NEXT: vand.vx v14, v8, a4 -; RV64-NEXT: vsll.vi v14, v14, 24 -; RV64-NEXT: vor.vv v12, v14, v12 -; RV64-NEXT: vsll.vx v14, v8, a1 +; RV64-NEXT: vsll.vi v16, v16, 24 +; RV64-NEXT: vor.vv v12, v16, v12 +; RV64-NEXT: vsll.vx v16, v8, a1 ; RV64-NEXT: vand.vx v8, v8, a3 ; RV64-NEXT: vsll.vx v8, v8, a2 -; RV64-NEXT: vor.vv v8, v14, v8 +; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: vor.vv v10, v14, v10 ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vse64.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll index 5d75efe681af7..dbbb8362144ca 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll @@ -31,12 +31,12 @@ define <8 x i32> @add_constant_rhs_8xi32(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: lui a0, %hi(.LCPI1_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI1_0) ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI1_0) ; CHECK-NEXT: vle32.v v10, (a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a5 ; CHECK-NEXT: vslide1down.vx v8, v8, a6 @@ -118,22 +118,22 @@ define <4 x i32> @udiv_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0) -; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 -; CHECK-NEXT: vslide1down.vx v8, v8, a2 -; CHECK-NEXT: vslide1down.vx v8, v8, a3 -; CHECK-NEXT: vmulhu.vv v9, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9 -; CHECK-NEXT: vmv.v.i v11, 0 -; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vslide1down.vx v11, v11, a0 +; CHECK-NEXT: lui a1, 524288 +; CHECK-NEXT: vle32.v v10, (a0) ; CHECK-NEXT: lui a0, %hi(.LCPI4_1) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_1) -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vmulhu.vv v10, v10, v11 -; CHECK-NEXT: vadd.vv v9, v10, v9 +; CHECK-NEXT: vslide1down.vx v9, v9, a1 +; CHECK-NEXT: vle32.v v11, (a0) +; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vmulhu.vv v10, v8, v10 +; CHECK-NEXT: vsub.vv v12, v8, v10 +; CHECK-NEXT: vmulhu.vv v9, v12, v9 +; CHECK-NEXT: vadd.vv v9, v9, v10 ; CHECK-NEXT: vmv.v.i v0, 4 -; CHECK-NEXT: vsrl.vv v9, v9, v12 +; CHECK-NEXT: vsrl.vv v9, v9, v11 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %e0 = udiv i32 %a, 23 @@ -224,12 +224,12 @@ define <4 x i32> 
@add_constant_rhs_with_identity(i32 %a, i32 %b, i32 %c, i32 %d) ; RV32-NEXT: addi a1, a1, 25 ; RV32-NEXT: addi a2, a2, 1 ; RV32-NEXT: addi a3, a3, 2047 -; RV32-NEXT: addi a3, a3, 308 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: addi a0, a3, 308 ; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: add_constant_rhs_with_identity: @@ -237,12 +237,12 @@ define <4 x i32> @add_constant_rhs_with_identity(i32 %a, i32 %b, i32 %c, i32 %d) ; RV64-NEXT: addiw a1, a1, 25 ; RV64-NEXT: addiw a2, a2, 1 ; RV64-NEXT: addi a3, a3, 2047 -; RV64-NEXT: addiw a3, a3, 308 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: addiw a0, a3, 308 ; RV64-NEXT: vslide1down.vx v8, v8, a1 ; RV64-NEXT: vslide1down.vx v8, v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a3 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: ret %e0 = add i32 %a, 0 %e1 = add i32 %b, 25 @@ -261,12 +261,12 @@ define <4 x i32> @add_constant_rhs_identity(i32 %a, i32 %b, i32 %c, i32 %d) { ; RV32-NEXT: addi a1, a1, 25 ; RV32-NEXT: addi a2, a2, 1 ; RV32-NEXT: addi a3, a3, 2047 -; RV32-NEXT: addi a3, a3, 308 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: addi a0, a3, 308 ; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: add_constant_rhs_identity: @@ -274,12 +274,12 @@ define <4 x i32> @add_constant_rhs_identity(i32 %a, i32 %b, i32 %c, i32 %d) { ; RV64-NEXT: addiw a1, a1, 25 ; RV64-NEXT: addiw a2, a2, 1 ; RV64-NEXT: addi a3, a3, 2047 -; RV64-NEXT: addiw a3, a3, 308 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: addiw a0, a3, 308 ; RV64-NEXT: vslide1down.vx v8, v8, a1 ; RV64-NEXT: vslide1down.vx v8, v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a3 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: ret %e0 = add i32 %a, 0 %e1 = add i32 %b, 25 @@ -562,20 +562,21 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vmv.s.x v12, a1 ; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vmv.s.x v10, a1 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 5 ; CHECK-NEXT: vmv.s.x v10, a2 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 6 ; CHECK-NEXT: lui a0, %hi(.LCPI19_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0) +; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v12, 5 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vmv.s.x v12, a3 -; CHECK-NEXT: vslideup.vi v8, v12, 7 -; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v10, 6 +; CHECK-NEXT: vmv.s.x v10, a3 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %vadd = add <8 x i32> %vin, %e0 = add i32 %a, 23 @@ -598,9 +599,9 @@ define <2 x i32> @build_vec_of_trunc_op(i64 %a, i64 %b) { ; RV32: # %bb.0: # %entry ; RV32-NEXT: slli a1, a1, 31 ; RV32-NEXT: srli a0, a0, 1 -; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: slli a3, 
a3, 31 ; RV32-NEXT: srli a2, a2, 1 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: or a2, a2, a3 ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.x v8, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll index 3c090bb900311..ee953a66a004f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll @@ -85,16 +85,16 @@ define fastcc <128 x i32> @ret_split_v128i32(ptr %x) { ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) ; CHECK-NEXT: addi a2, a1, 256 -; CHECK-NEXT: vle32.v v16, (a1) -; CHECK-NEXT: addi a1, a1, 384 +; CHECK-NEXT: vle32.v v16, (a2) +; CHECK-NEXT: addi a2, a1, 384 ; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vle32.v v0, (a2) -; CHECK-NEXT: vse32.v v16, (a0) ; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vse32.v v24, (a1) -; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vse32.v v0, (a1) +; CHECK-NEXT: vle32.v v0, (a2) +; CHECK-NEXT: addi a2, a0, 256 +; CHECK-NEXT: vse32.v v24, (a0) ; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vse32.v v0, (a1) +; CHECK-NEXT: vse32.v v16, (a2) ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %v = load <128 x i32>, ptr %x @@ -257,9 +257,7 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i3 ; CHECK-NEXT: .cfi_def_cfa s0, 0 ; CHECK-NEXT: andi sp, sp, -128 ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: mv t0, sp ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: li a2, 2 ; CHECK-NEXT: li a3, 3 @@ -268,8 +266,10 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i3 ; CHECK-NEXT: li a6, 6 ; CHECK-NEXT: li a7, 7 ; CHECK-NEXT: mv t3, sp +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: li t4, 8 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vse32.v v8, (t0) ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: call vector_arg_indirect_stack @@ -306,19 +306,17 @@ define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3 define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) { ; CHECK-LABEL: pass_vector_arg_direct_stack: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -160 -; CHECK-NEXT: .cfi_def_cfa_offset 160 -; CHECK-NEXT: sd ra, 152(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 176 +; CHECK-NEXT: sd ra, 168(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 160(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 +; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: sd a0, 144(sp) -; CHECK-NEXT: li a0, 13 -; CHECK-NEXT: li t0, 12 +; CHECK-NEXT: addi t0, sp, 16 +; CHECK-NEXT: li t1, 1 +; CHECK-NEXT: li t2, 13 +; CHECK-NEXT: li s0, 12 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: li a2, 2 ; CHECK-NEXT: li a3, 3 @@ -327,17 +325,23 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> ; CHECK-NEXT: li a6, 6 ; CHECK-NEXT: li a7, 7 ; CHECK-NEXT: li t3, 8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vse32.v v8, (t0) ; CHECK-NEXT: li t4, 9 ; CHECK-NEXT: li t5, 10 +; CHECK-NEXT: sd 
t1, 144(sp) ; CHECK-NEXT: li t6, 11 -; CHECK-NEXT: sd t0, 0(sp) -; CHECK-NEXT: sd a0, 8(sp) +; CHECK-NEXT: sd s0, 0(sp) +; CHECK-NEXT: sd t2, 8(sp) ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: call vector_arg_direct_stack -; CHECK-NEXT: ld ra, 152(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld ra, 168(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 160(sp) # 8-byte Folded Reload ; CHECK-NEXT: .cfi_restore ra -; CHECK-NEXT: addi sp, sp, 160 +; CHECK-NEXT: .cfi_restore s0 +; CHECK-NEXT: addi sp, sp, 176 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %s = call fastcc <32 x i32> @vector_arg_direct_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll index fcdb5d5cb6aef..73e148edbe2d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll @@ -85,16 +85,16 @@ define <128 x i32> @ret_split_v128i32(ptr %x) { ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) ; CHECK-NEXT: addi a2, a1, 256 -; CHECK-NEXT: vle32.v v16, (a1) -; CHECK-NEXT: addi a1, a1, 384 +; CHECK-NEXT: vle32.v v16, (a2) +; CHECK-NEXT: addi a2, a1, 384 ; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vle32.v v0, (a2) -; CHECK-NEXT: vse32.v v16, (a0) ; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vse32.v v24, (a1) -; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vse32.v v0, (a1) +; CHECK-NEXT: vle32.v v0, (a2) +; CHECK-NEXT: addi a2, a0, 256 +; CHECK-NEXT: vse32.v v24, (a0) ; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vse32.v v0, (a1) +; CHECK-NEXT: vse32.v v16, (a2) ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %v = load <128 x i32>, ptr %x @@ -312,18 +312,18 @@ define <32 x i32> @pass_vector_arg_via_stack(<32 x i32> %x, <32 x i32> %y, <32 x ; CHECK-NEXT: sd ra, 136(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vse32.v v8, (sp) -; CHECK-NEXT: li a0, 8 +; CHECK-NEXT: li t0, 8 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: li a2, 2 ; CHECK-NEXT: li a3, 3 ; CHECK-NEXT: li a4, 4 ; CHECK-NEXT: li a5, 5 ; CHECK-NEXT: li a6, 6 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vse32.v v8, (sp) ; CHECK-NEXT: li a7, 7 -; CHECK-NEXT: sd a0, 128(sp) +; CHECK-NEXT: sd t0, 128(sp) ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: call vector_arg_via_stack @@ -358,25 +358,27 @@ define <4 x i1> @pass_vector_mask_arg_via_stack(<4 x i1> %v) { ; CHECK-NEXT: sd ra, 152(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vse32.v v8, (sp) -; CHECK-NEXT: li a0, 8 -; CHECK-NEXT: sd a0, 128(sp) +; CHECK-NEXT: li a1, 8 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.i v16, 0 -; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: addi a2, sp, 136 +; CHECK-NEXT: li a5, 5 +; CHECK-NEXT: li a6, 6 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: sd a1, 128(sp) +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: 
vmerge.vim v16, v16, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (sp) ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma ; CHECK-NEXT: vmv.v.v v17, v16 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmsne.vi v16, v17, 0 -; CHECK-NEXT: addi a0, sp, 136 -; CHECK-NEXT: li a5, 5 -; CHECK-NEXT: li a6, 6 ; CHECK-NEXT: li a7, 7 -; CHECK-NEXT: vsm.v v16, (a0) +; CHECK-NEXT: vsm.v v16, (a2) ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: li a2, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll index 29f437829f3be..511242aa677c2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll @@ -32,10 +32,10 @@ define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 3 @@ -72,10 +72,10 @@ define <2 x half> @vp_ceil_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -114,10 +114,10 @@ define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 3 @@ -154,10 +154,10 @@ define <4 x half> @vp_ceil_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -197,10 +197,10 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vmv1r.v v9, v0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v9, v12, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 3 @@ -238,10 +238,10 @@ define <8 x half> 
@vp_ceil_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -261,9 +261,9 @@ declare <16 x half> @llvm.vp.ceil.v16f16(<16 x half>, <16 x i1>, i32) define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -283,10 +283,10 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e ; ZVFHMIN-NEXT: vmv1r.v v10, v0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v10, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 3 @@ -324,10 +324,10 @@ define <16 x half> @vp_ceil_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -561,9 +561,9 @@ declare <4 x double> @llvm.vp.ceil.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_ceil_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -605,9 +605,9 @@ declare <8 x double> @llvm.vp.ceil.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_ceil_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -649,9 +649,9 @@ declare <15 x double> @llvm.vp.ceil.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_ceil_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v15f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli 
zero, zero, e64, m8, ta, mu @@ -693,9 +693,9 @@ declare <16 x double> @llvm.vp.ceil.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_ceil_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v16f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.ceil.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v6, v0 +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: lui a1, %hi(.LCPI26_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a1, 3 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: 
vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -808,27 +818,30 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: lui a2, %hi(.LCPI27_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; CHECK-NEXT: addi a2, a0, -16 +; CHECK-NEXT: sltu a0, a0, a2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: fsrmi a2, 3 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a1, 3 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v7, v24, fa5 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: fsrm a2 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll index e2d7ed55c4601..9d0d42cf754c5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll @@ -11,6 +11,7 @@ define <2 x i8> @vp_ctlz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t @@ -18,10 +19,9 @@ define <2 x i8> @vp_ctlz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 
+; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -39,6 +39,7 @@ define <2 x i8> @vp_ctlz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 @@ -46,10 +47,9 @@ define <2 x i8> @vp_ctlz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -69,6 +69,7 @@ define <4 x i8> @vp_ctlz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t @@ -76,10 +77,9 @@ define <4 x i8> @vp_ctlz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -97,6 +97,7 @@ define <4 x i8> @vp_ctlz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 @@ -104,10 +105,9 @@ define <4 x i8> @vp_ctlz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -127,6 +127,7 @@ define <8 x i8> @vp_ctlz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t @@ -134,10 +135,9 @@ define <8 x i8> @vp_ctlz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -155,6 +155,7 @@ define <8 x i8> @vp_ctlz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9 ; 
CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 @@ -162,10 +163,9 @@ define <8 x i8> @vp_ctlz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -185,6 +185,7 @@ define <16 x i8> @vp_ctlz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t @@ -192,10 +193,9 @@ define <16 x i8> @vp_ctlz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -213,6 +213,7 @@ define <16 x i8> @vp_ctlz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 @@ -220,10 +221,9 @@ define <16 x i8> @vp_ctlz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -243,7 +243,9 @@ define <2 x i16> @vp_ctlz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t @@ -252,20 +254,18 @@ define <2 x i16> @vp_ctlz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -280,7 +280,9 @@ define <2 x i16> @vp_ctlz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, 
ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 @@ -289,20 +291,18 @@ define <2 x i16> @vp_ctlz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -319,7 +319,9 @@ define <4 x i16> @vp_ctlz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t @@ -328,20 +330,18 @@ define <4 x i16> @vp_ctlz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -356,7 +356,9 @@ define <4 x i16> @vp_ctlz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 @@ -365,20 +367,18 @@ define <4 x i16> @vp_ctlz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -395,7 +395,9 @@ 
define <8 x i16> @vp_ctlz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t @@ -404,20 +406,18 @@ define <8 x i16> @vp_ctlz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -432,7 +432,9 @@ define <8 x i16> @vp_ctlz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 @@ -441,20 +443,18 @@ define <8 x i16> @vp_ctlz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -471,7 +471,9 @@ define <16 x i16> @vp_ctlz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v10, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t @@ -480,20 +482,18 @@ define <16 x i16> @vp_ctlz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, 
v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -508,7 +508,9 @@ define <16 x i16> @vp_ctlz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v10, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vsrl.vi v10, v8, 4 @@ -517,20 +519,18 @@ define <16 x i16> @vp_ctlz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -547,7 +547,9 @@ define <2 x i32> @vp_ctlz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t @@ -558,20 +560,18 @@ define <2 x i32> @vp_ctlz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -587,7 +587,9 @@ define <2 x i32> @vp_ctlz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 @@ -598,20 +600,18 @@ define <2 x i32> @vp_ctlz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; 
CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -629,7 +629,9 @@ define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t @@ -640,20 +642,18 @@ define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -669,7 +669,9 @@ define <4 x i32> @vp_ctlz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 @@ -680,20 +682,18 @@ define <4 x i32> @vp_ctlz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -711,7 +711,9 @@ define <8 x i32> @vp_ctlz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v10, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t @@ -722,20 +724,18 @@ define <8 x i32> @vp_ctlz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; 
CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -751,7 +751,9 @@ define <8 x i32> @vp_ctlz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v10, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vsrl.vi v10, v8, 4 @@ -762,20 +764,18 @@ define <8 x i32> @vp_ctlz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -793,7 +793,9 @@ define <16 x i32> @vp_ctlz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v12, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t @@ -804,20 +806,18 @@ define <16 x i32> @vp_ctlz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t ; CHECK-NEXT: vand.vx v12, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v12, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -833,7 +833,9 @@ define <16 x i32> @vp_ctlz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: 
lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v12, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vsrl.vi v12, v8, 4 @@ -844,20 +846,18 @@ define <16 x i32> @vp_ctlz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -875,6 +875,12 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v8, v9, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v9, v0.t @@ -884,49 +890,60 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: vor.vv v8, v8, v9, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v9, v8, a1, v0.t +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vor.vv v8, v8, v9, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v8, v10, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9, v0.t +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v9, v0.t ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t -; 
RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v2i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v9, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v9, v0.t @@ -936,38 +953,21 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v9, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v9, v8, a4, v0.t ; RV64-NEXT: vor.vv v8, v8, v9, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v9, v9, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t +; RV64-NEXT: vand.vx v9, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v9, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> %va, i1 false, <2 x i1> %m, i32 %evl) @@ -979,6 +979,12 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vsrl.vi v9, v8, 2 ; RV32-NEXT: vor.vv v8, v8, v9 @@ -988,40 +994,34 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vsrl.vi v9, v8, 16 ; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v9, v8, a1 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: lui a1, 
61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v10 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1031,6 +1031,23 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v9 @@ -1040,37 +1057,20 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0 +; RV64-NEXT: vsrl.vx v9, v8, a4 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v9, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1084,59 +1084,76 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 
zeroext %evl) { ; RV32-LABEL: vp_ctlz_v4i64: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 16, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vand.vv v10, v12, v10, v0.t ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v4i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 
+; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t @@ -1146,38 +1163,21 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v10, v8, a4, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v10, v10, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vand.vx v10, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v10, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> %va, i1 false, <4 x i1> %m, i32 %evl) @@ -1189,6 +1189,12 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsrl.vi v10, v8, 2 ; RV32-NEXT: vor.vv v8, v8, v10 @@ -1198,40 +1204,34 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsrl.vi v10, v8, 16 ; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v10, v8, a1 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v12 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, 
v10 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1241,6 +1241,23 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v10 @@ -1250,37 +1267,20 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: vsrl.vx v10, v8, a4 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v10, v10, a0 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vx v10, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1294,59 +1294,76 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v8i64: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: li a1, 32 -; RV32-NEXT: 
vsrl.vx v12, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v12, v8, v12, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsub.vv v12, v16, v12, v0.t +; RV32-NEXT: vand.vv v16, v12, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v12, 2, v0.t +; RV32-NEXT: vand.vv v8, v12, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v12, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t @@ -1356,38 +1373,21 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: li 
a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v12, v8, a4, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v12, v12, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vand.vx v12, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v12, v8, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> %va, i1 false, <8 x i1> %m, i32 %evl) @@ -1399,6 +1399,12 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vsrl.vi v12, v8, 2 ; RV32-NEXT: vor.vv v8, v8, v12 @@ -1408,40 +1414,34 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vsrl.vi v12, v8, 16 ; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v12, v8, a1 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, 
v12 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1451,6 +1451,23 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v12 @@ -1460,37 +1477,20 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0 +; RV64-NEXT: vsrl.vx v12, v8, a4 ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vx v12, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1522,11 +1522,21 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -1536,58 +1546,52 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: li a1, 32 
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vnot.v v24, v8, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vor.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t +; RV32-NEXT: vnot.v v16, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v24, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: li a0, 56 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -1601,6 +1605,23 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; 
RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t @@ -1610,38 +1631,21 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl) @@ -1666,46 +1670,48 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vor.vv 
v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vx v0, v8, a1 ; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: vsrl.vi v8, v0, 1 +; RV32-NEXT: vand.vv v24, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v24, v0, v24 +; RV32-NEXT: vand.vv v0, v24, v16 +; RV32-NEXT: vsrl.vi v24, v24, 2 +; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: vand.vv v8, v16, v8 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1717,6 +1723,23 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v16 @@ -1726,37 +1749,20 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: vsrl.vx v16, v8, a4 ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1788,11 +1794,21 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 ; 
RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -1802,58 +1818,52 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vnot.v v24, v8, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vor.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t +; RV32-NEXT: vnot.v v16, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v24, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: li a0, 56 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, 
a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -1867,6 +1877,23 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t @@ -1876,38 +1903,21 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl) @@ -1932,46 +1942,48 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: vor.vv 
v8, v8, v16 -; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vx v0, v8, a1 ; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: vsrl.vi v8, v0, 1 +; RV32-NEXT: vand.vv v24, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v24, v0, v24 +; RV32-NEXT: vand.vv v0, v24, v16 +; RV32-NEXT: vsrl.vi v24, v24, 2 +; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: vand.vv v8, v16, v8 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1983,6 +1995,23 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v16 @@ -1992,37 +2021,20 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: vsrl.vx v16, v8, a4 ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v16, 
v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -2050,29 +2062,32 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v24, v0, 2 ; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: li a3, 16 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: li a1, 16 +; RV32-NEXT: addi a2, a2, 257 +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a3, .LBB34_2 +; RV32-NEXT: bltu a0, a1, .LBB34_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB34_2: ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: li a1, 32 +; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: addi a4, sp, 32 ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -2082,37 +2097,31 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a5, 40 +; RV32-NEXT: mul a3, a3, a5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vnot.v v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: csrr a3, vlenb @@ -2373,6 +2382,28 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: .LBB34_2: ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: addiw a3, a3, 819 +; RV64-NEXT: addiw a6, a4, -241 +; RV64-NEXT: addiw a7, a5, 257 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a5, a2, a5 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a4, a3, a4 +; RV64-NEXT: slli a2, a6, 32 +; RV64-NEXT: add a2, a6, a2 +; RV64-NEXT: slli a3, a7, 32 +; RV64-NEXT: add a3, a7, a3 +; RV64-NEXT: addi a6, a0, -16 +; RV64-NEXT: sltu a0, a0, a6 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a6, a0, a6 +; RV64-NEXT: li a0, 56 ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t @@ -2382,52 +2413,30 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a1, 32 ; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v16, v16, a5, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, v8, a4, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: vand.vx v8, v8, a4, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: addi a7, sp, 16 ; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 ; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: csrr a7, vlenb ; RV64-NEXT: slli a7, a7, 3 ; RV64-NEXT: add a7, sp, a7 ; RV64-NEXT: addi a7, a7, 16 ; RV64-NEXT: vl8r.v v8, (a7) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV64-NEXT: vor.vv v16, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v8, v16, 2, v0.t @@ -2442,17 +2451,17 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: 
vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v16, v16, a5, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, v8, a4, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: vand.vx v8, v8, a4, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: csrr a0, vlenb @@ -2475,113 +2484,144 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: addi a1, a2, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB35_2 +; RV32-NEXT: bltu a0, a3, .LBB35_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vnot.v v0, v8 ; RV32-NEXT: addi a3, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: vlse64.v v24, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, a0, -16 +; RV32-NEXT: sltu a0, a0, a3 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsetvli zero, 
a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 2 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v0, v8, a2 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 8 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v16, 16 +; RV32-NEXT: vor.vv v16, v16, v8 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v0, 1 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 48 +; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v0, 1 -; RV32-NEXT: vand.vv v24, v24, v16 ; RV32-NEXT: vsub.vv v24, v0, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v0, v16, a2 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v0, v24, v8 ; RV32-NEXT: vsrl.vi v24, v24, 2 ; RV32-NEXT: vand.vv v24, v24, v8 ; RV32-NEXT: vadd.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v24, 4 -; RV32-NEXT: vadd.vv v24, v24, v0 -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, a0, -16 -; RV32-NEXT: sltu a0, a0, a3 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v0, (a3) # Unknown-size Folded Reload +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v0, 1 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v24, 2 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vsrl.vi v0, v24, 4 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vsrl.vi v0, v24, 8 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vsrl.vi v0, v24, 16 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vsrl.vx v0, v24, a2 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vnot.v v24, v24 -; RV32-NEXT: vsrl.vi v0, v24, 1 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vnot.v v16, v16 +; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v24 ; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: vsub.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v16, v8 -; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsub.vv v0, v16, v0 +; RV32-NEXT: addi a4, sp, 48 +; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v24, 4 +; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 48 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; 
RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v0, v8 +; RV32-NEXT: vsrl.vi v0, v0, 2 +; RV32-NEXT: vand.vv v8, v0, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vlse64.v v0, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a2), zero +; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v0, v24 +; RV32-NEXT: vmul.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v24, v8, v24 ; RV32-NEXT: li a2, 56 @@ -2607,78 +2647,100 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: .LBB35_2: ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: li a2, 32 +; RV64-NEXT: lui a3, 349525 +; RV64-NEXT: lui a4, 209715 +; RV64-NEXT: lui a5, 61681 +; RV64-NEXT: lui a6, 4112 +; RV64-NEXT: addiw a7, a3, 1365 +; RV64-NEXT: addiw a3, a4, 819 +; RV64-NEXT: addiw a4, a5, -241 +; RV64-NEXT: addiw a6, a6, 257 +; RV64-NEXT: slli a5, a7, 32 +; RV64-NEXT: add a7, a7, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a5, a3, a5 +; RV64-NEXT: slli a3, a4, 32 +; RV64-NEXT: add a3, a4, a3 +; RV64-NEXT: slli a4, a6, 32 +; RV64-NEXT: add a4, a6, a4 +; RV64-NEXT: addi a6, a0, -16 +; RV64-NEXT: sltu a0, a0, a6 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a6, a0, a6 +; RV64-NEXT: li a0, 56 ; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vsrl.vi v24, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vsrl.vi v24, v8, 4 ; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: vsrl.vi v24, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: vsrl.vi v24, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsrl.vx v24, v8, a1 -; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v24, v24, a2 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v8, a3 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4 -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5 -; RV64-NEXT: li a6, 56 -; RV64-NEXT: 
vsrl.vx v8, v8, a6 -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 1 ; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 2 ; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 4 ; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vx v24, v8, a2 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 8 ; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 16 ; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: vsrl.vx v24, v16, a1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vx v24, v24, a7 +; RV64-NEXT: vsub.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma +; RV64-NEXT: vsrl.vx v24, v16, a2 ; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v8, a5 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vnot.v v16, v16 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v8, v8, a5 +; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a2 +; RV64-NEXT: vand.vx v24, v24, a7 ; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx v24, v16, a3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v16, a5 ; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma +; RV64-NEXT: vand.vx v16, v16, a5 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a4 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 4 ; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a4 -; RV64-NEXT: vmul.vx v16, v16, a5 -; RV64-NEXT: vsrl.vx v16, v16, a6 +; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vmul.vx v16, v16, a4 +; RV64-NEXT: vsrl.vx v16, v16, a0 ; RV64-NEXT: ret %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v @@ -2689,6 +2751,7 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t @@ -2696,10 +2759,9 @@ define <2 x i8> 
@vp_ctlz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -2717,6 +2779,7 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 @@ -2724,10 +2787,9 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -2745,6 +2807,7 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t @@ -2752,10 +2815,9 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -2773,6 +2835,7 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 @@ -2780,10 +2843,9 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -2801,6 +2863,7 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t @@ -2808,10 +2871,9 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, 
v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -2829,6 +2891,7 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 @@ -2836,10 +2899,9 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -2857,6 +2919,7 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t @@ -2864,10 +2927,9 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -2885,6 +2947,7 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 @@ -2892,10 +2955,9 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext % ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -2913,7 +2975,9 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t @@ -2922,20 +2986,18 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: 
vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -2950,7 +3012,9 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 @@ -2959,20 +3023,18 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext % ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -2987,7 +3049,9 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t @@ -2996,20 +3060,18 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -3024,7 +3086,9 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 @@ -3033,20 +3097,18 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext % ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; 
CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -3061,7 +3123,9 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t @@ -3070,20 +3134,18 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -3098,7 +3160,9 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 @@ -3107,20 +3171,18 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext % ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -3135,7 +3197,9 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v10, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t @@ -3144,20 +3208,18 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; 
CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -3172,7 +3234,9 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v10, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vsrl.vi v10, v8, 4 @@ -3181,20 +3245,18 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -3209,7 +3271,9 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t @@ -3220,20 +3284,18 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -3249,7 +3311,9 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v9 +; 
CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 @@ -3260,20 +3324,18 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext % ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -3289,7 +3351,9 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t @@ -3300,20 +3364,18 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -3329,7 +3391,9 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v9, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v9, v8, 4 @@ -3340,20 +3404,18 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext % ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -3369,7 +3431,9 @@ define <8 x i32> 
@vp_ctlz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v10, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t @@ -3380,20 +3444,18 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -3409,7 +3471,9 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v10, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vsrl.vi v10, v8, 4 @@ -3420,20 +3484,18 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext % ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -3449,7 +3511,9 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v12, v8, 2, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t @@ -3460,20 +3524,18 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t ; CHECK-NEXT: vand.vx v12, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t 
+; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v12, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -3489,7 +3551,9 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vsrl.vi v12, v8, 2 ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vsrl.vi v12, v8, 4 @@ -3500,20 +3564,18 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vnot.v v8, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -3529,6 +3591,12 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v8, v9, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v9, v0.t @@ -3538,49 +3606,60 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe ; RV32-NEXT: vor.vv v8, v8, v9, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v9, v8, a1, v0.t +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vor.vv v8, v8, v9, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v8, v10, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; 
RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9, v0.t +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v9, v0.t ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v2i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v9, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v9, v0.t @@ -3590,38 +3669,21 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe ; RV64-NEXT: vor.vv v8, v8, v9, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v9, v8, a4, v0.t ; RV64-NEXT: vor.vv v8, v8, v9, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v9, v9, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t +; RV64-NEXT: vand.vx v9, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v9, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> %va, i1 true, <2 x i1> %m, i32 %evl) @@ -3633,6 +3695,12 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vsrl.vi v9, v8, 2 ; RV32-NEXT: vor.vv v8, v8, v9 @@ 
-3642,40 +3710,34 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vsrl.vi v9, v8, 16 ; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v9, v8, a1 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v10 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3685,6 +3747,23 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v9 @@ -3694,37 +3773,20 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0 +; RV64-NEXT: vsrl.vx v9, v8, a4 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v9, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, 
v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3736,59 +3798,76 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe ; RV32-LABEL: vp_ctlz_zero_undef_v4i64: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 16, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vand.vv v10, v12, v10, v0.t ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t -; 
RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v4i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t @@ -3798,38 +3877,21 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v10, v8, a4, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v10, v10, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vand.vx v10, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v10, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> %va, i1 true, <4 x i1> %m, i32 %evl) @@ -3840,7 +3902,13 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % ; RV32-LABEL: vp_ctlz_zero_undef_v4i64_unmasked: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsrl.vi v10, v8, 2 ; RV32-NEXT: vor.vv v8, v8, v10 @@ -3850,40 +3918,34 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsrl.vi v10, v8, 16 ; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v10, v8, a1 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, 
a1, 1365 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v12 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3893,6 +3955,23 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v10 @@ -3902,37 +3981,20 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: vsrl.vx v10, v8, a4 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v10, v10, a0 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vx v10, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; 
RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3944,59 +4006,76 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe ; RV32-LABEL: vp_ctlz_zero_undef_v8i64: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v12, v8, v12, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsub.vv v12, v16, v12, v0.t +; RV32-NEXT: vand.vv v16, v12, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v12, 2, v0.t +; RV32-NEXT: vand.vv v8, v12, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v12, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: 
lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t @@ -4006,38 +4085,21 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe ; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v12, v8, a4, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v12, v12, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vand.vx v12, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v12, v8, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> %va, i1 true, <8 x i1> %m, i32 %evl) @@ -4049,6 +4111,12 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % ; RV32: # %bb.0: ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vsrl.vi v12, v8, 2 ; RV32-NEXT: vor.vv v8, v8, v12 @@ -4058,40 +4126,34 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vsrl.vi v12, v8, 16 ; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v12, v8, a1 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; 
RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -4101,6 +4163,23 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v12 @@ -4110,37 +4189,20 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0 +; RV64-NEXT: vsrl.vx v12, v8, a4 ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vx v12, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -4170,11 +4232,21 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: 
sw a1, 20(sp) +; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -4184,58 +4256,52 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vnot.v v24, v8, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vor.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t +; RV32-NEXT: vnot.v v16, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v24, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: li a0, 56 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; 
RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -4249,6 +4315,23 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t @@ -4258,38 +4341,21 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl) @@ -4314,46 +4380,48 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vnot.v v8, v8 
; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vx v0, v8, a1 ; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: vsrl.vi v8, v0, 1 +; RV32-NEXT: vand.vv v24, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v24, v0, v24 +; RV32-NEXT: vand.vv v0, v24, v16 +; RV32-NEXT: vsrl.vi v24, v24, 2 +; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: vand.vv v8, v16, v8 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -4365,6 +4433,23 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v16 @@ -4374,37 +4459,20 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: vsrl.vx v16, v8, a4 ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; 
RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -4434,11 +4502,21 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -4448,58 +4526,52 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vnot.v v24, v8, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vor.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t +; RV32-NEXT: vnot.v v16, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v24, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, 
(a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: li a0, 56 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -4513,6 +4585,23 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t @@ -4522,38 +4611,21 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, 
i32 %evl) @@ -4578,46 +4650,48 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vx v0, v8, a1 ; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: vsrl.vi v8, v0, 1 +; RV32-NEXT: vand.vv v24, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v24, v0, v24 +; RV32-NEXT: vand.vv v0, v24, v16 +; RV32-NEXT: vsrl.vi v24, v24, 2 +; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: vand.vv v8, v16, v8 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -4629,6 +4703,23 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ; RV64: # %bb.0: ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: lui a1, 209715 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: lui a3, 4112 +; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: addiw a1, a1, 819 +; RV64-NEXT: addiw a2, a2, -241 +; RV64-NEXT: addiw a3, a3, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: li a4, 32 ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v16 @@ -4638,37 +4729,20 @@ define <16 x i64> 
@vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: vsrl.vx v16, v8, a4 ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -4694,29 +4768,32 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v24, v0, 2 ; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: li a3, 16 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: li a1, 16 +; RV32-NEXT: addi a2, a2, 257 +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a3, .LBB70_2 +; RV32-NEXT: bltu a0, a1, .LBB70_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB70_2: ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: li a1, 32 +; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: addi a4, sp, 32 ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -4726,37 +4803,31 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a5, 40 +; RV32-NEXT: mul a3, a3, a5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: vs8r.v v16, (a3) # 
Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vnot.v v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: csrr a3, vlenb @@ -5017,6 +5088,28 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: .LBB70_2: ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: addiw a3, a3, 819 +; RV64-NEXT: addiw a6, a4, -241 +; RV64-NEXT: addiw a7, a5, 257 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a5, a2, a5 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a4, a3, a4 +; RV64-NEXT: slli a2, a6, 32 +; RV64-NEXT: add a2, a6, a2 +; RV64-NEXT: slli a3, a7, 32 +; RV64-NEXT: add a3, a7, a3 +; RV64-NEXT: addi a6, a0, -16 +; RV64-NEXT: sltu a0, a0, a6 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a6, a0, a6 +; RV64-NEXT: li a0, 56 ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t @@ -5026,52 +5119,30 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a1, 32 ; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v16, v16, a5, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, v8, a4, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: vand.vx v8, v8, a4, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v8, 
v8, a0, v0.t ; RV64-NEXT: addi a7, sp, 16 ; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 ; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: csrr a7, vlenb ; RV64-NEXT: slli a7, a7, 3 ; RV64-NEXT: add a7, sp, a7 ; RV64-NEXT: addi a7, a7, 16 ; RV64-NEXT: vl8r.v v8, (a7) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV64-NEXT: vor.vv v16, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v8, v16, 2, v0.t @@ -5086,17 +5157,17 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v16, v16, a5, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, v8, a4, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: vand.vx v8, v8, a4, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: csrr a0, vlenb @@ -5119,113 +5190,144 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: addi a1, a2, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB71_2 +; RV32-NEXT: bltu a0, a3, .LBB71_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB71_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vnot.v v0, v8 ; RV32-NEXT: addi a3, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; 
RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: vlse64.v v24, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, a0, -16 +; RV32-NEXT: sltu a0, a0, a3 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 2 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v0, v8, a2 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 8 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v16, 16 +; RV32-NEXT: vor.vv v16, v16, v8 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v0, 1 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 48 +; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v0, 1 -; RV32-NEXT: vand.vv v24, v24, v16 ; RV32-NEXT: vsub.vv v24, v0, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v0, v16, a2 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v0, v24, v8 ; RV32-NEXT: vsrl.vi v24, v24, 2 ; RV32-NEXT: vand.vv v24, v24, v8 ; RV32-NEXT: vadd.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v24, 4 -; RV32-NEXT: vadd.vv v24, v24, v0 -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, a0, -16 -; RV32-NEXT: sltu a0, a0, a3 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v0, (a3) # Unknown-size Folded Reload +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v0, 1 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v24, 2 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vsrl.vi v0, v24, 4 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vsrl.vi v0, v24, 8 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vsrl.vi v0, v24, 16 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vsrl.vx v0, v24, a2 -; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: vnot.v v24, v24 -; RV32-NEXT: vsrl.vi v0, v24, 1 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vnot.v v16, v16 +; 
RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v24 ; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: vsub.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v16, v8 -; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsub.vv v0, v16, v0 +; RV32-NEXT: addi a4, sp, 48 +; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v24, 4 +; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 48 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v0, v8 +; RV32-NEXT: vsrl.vi v0, v0, 2 +; RV32-NEXT: vand.vv v8, v0, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vlse64.v v0, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a2), zero +; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v0, v24 +; RV32-NEXT: vmul.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v24, v8, v24 ; RV32-NEXT: li a2, 56 @@ -5251,78 +5353,100 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV64-NEXT: .LBB71_2: ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: li a2, 32 +; RV64-NEXT: lui a3, 349525 +; RV64-NEXT: lui a4, 209715 +; RV64-NEXT: lui a5, 61681 +; RV64-NEXT: lui a6, 4112 +; RV64-NEXT: addiw a7, a3, 1365 +; RV64-NEXT: addiw a3, a4, 819 +; RV64-NEXT: addiw a4, a5, -241 +; RV64-NEXT: addiw a6, a6, 257 +; RV64-NEXT: slli a5, a7, 32 +; RV64-NEXT: add a7, a7, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a5, a3, a5 +; RV64-NEXT: slli a3, a4, 32 +; RV64-NEXT: add a3, a4, a3 +; RV64-NEXT: slli a4, a6, 32 +; RV64-NEXT: add a4, a6, a4 +; RV64-NEXT: addi a6, a0, -16 +; RV64-NEXT: sltu a0, a0, a6 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a6, a0, a6 +; RV64-NEXT: li a0, 56 ; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vsrl.vi v24, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vsrl.vi v24, v8, 4 ; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: vsrl.vi v24, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: vsrl.vi v24, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsrl.vx v24, v8, a1 -; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vsrl.vi 
v24, v8, 1 -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v24, v24, a2 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v8, a3 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4 -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5 -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6 -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 1 ; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 2 ; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 4 ; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vx v24, v8, a2 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 8 ; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 16 ; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: vsrl.vx v24, v16, a1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vx v24, v24, a7 +; RV64-NEXT: vsub.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma +; RV64-NEXT: vsrl.vx v24, v16, a2 ; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v8, a5 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vnot.v v16, v16 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v8, v8, a5 +; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a2 +; RV64-NEXT: vand.vx v24, v24, a7 ; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx v24, v16, a3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v16, a5 ; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma +; RV64-NEXT: vand.vx v16, v16, a5 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a4 +; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vx v8, v8, a0 +; 
RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 4 ; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a4 -; RV64-NEXT: vmul.vx v16, v16, a5 -; RV64-NEXT: vsrl.vx v16, v16, a6 +; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vmul.vx v16, v16, a4 +; RV64-NEXT: vsrl.vx v16, v16, a0 ; RV64-NEXT: ret %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll index 228a9f0d6d522..4bd4a9a854f36 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -13,6 +13,7 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind { ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RVI-NEXT: vle8.v v8, (a0) +; RVI-NEXT: li a1, 85 ; RVI-NEXT: vsrl.vi v9, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, v8, 2 @@ -21,10 +22,9 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v9, v8, 1 -; RVI-NEXT: li a1, 85 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: li a1, 51 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 @@ -39,12 +39,12 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVF-NEXT: vle8.v v8, (a0) +; RVF-NEXT: li a1, 134 ; RVF-NEXT: vzext.vf2 v10, v8 ; RVF-NEXT: vfwcvt.f.xu.v v12, v10 ; RVF-NEXT: vnsrl.wi v8, v12, 23 ; RVF-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVF-NEXT: vnsrl.wi v10, v8, 0 -; RVF-NEXT: li a1, 134 ; RVF-NEXT: vrsub.vx v8, v10, a1 ; RVF-NEXT: li a1, 8 ; RVF-NEXT: vminu.vx v8, v8, a1 @@ -55,12 +55,12 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVD-NEXT: vle8.v v8, (a0) +; RVD-NEXT: li a1, 134 ; RVD-NEXT: vzext.vf2 v10, v8 ; RVD-NEXT: vfwcvt.f.xu.v v12, v10 ; RVD-NEXT: vnsrl.wi v8, v12, 23 ; RVD-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVD-NEXT: vnsrl.wi v10, v8, 0 -; RVD-NEXT: li a1, 134 ; RVD-NEXT: vrsub.vx v8, v10, a1 ; RVD-NEXT: li a1, 8 ; RVD-NEXT: vminu.vx v8, v8, a1 @@ -87,6 +87,8 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind { ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVI-NEXT: vle16.v v8, (a0) +; RVI-NEXT: lui a1, 5 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vsrl.vi v9, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, v8, 2 @@ -97,20 +99,18 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v9, v8, 1 -; RVI-NEXT: lui a1, 5 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: lui a1, 3 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 1 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v9, v8 ; RVI-NEXT: vsrl.vi v9, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v9 -; RVI-NEXT: lui a1, 1 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: li a1, 257 ; RVI-NEXT: vmul.vx v8, v8, a1 @@ -122,9 +122,9 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVF-NEXT: vle16.v v8, (a0) +; RVF-NEXT: li 
a1, 142 ; RVF-NEXT: vfwcvt.f.xu.v v10, v8 ; RVF-NEXT: vnsrl.wi v8, v10, 23 -; RVF-NEXT: li a1, 142 ; RVF-NEXT: vrsub.vx v8, v8, a1 ; RVF-NEXT: li a1, 16 ; RVF-NEXT: vminu.vx v8, v8, a1 @@ -135,9 +135,9 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVD-NEXT: vle16.v v8, (a0) +; RVD-NEXT: li a1, 142 ; RVD-NEXT: vfwcvt.f.xu.v v10, v8 ; RVD-NEXT: vnsrl.wi v8, v10, 23 -; RVD-NEXT: li a1, 142 ; RVD-NEXT: vrsub.vx v8, v8, a1 ; RVD-NEXT: li a1, 16 ; RVD-NEXT: vminu.vx v8, v8, a1 @@ -164,6 +164,8 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind { ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVI-NEXT: vle32.v v8, (a0) +; RVI-NEXT: lui a1, 349525 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vsrl.vi v9, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, v8, 2 @@ -176,20 +178,18 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v9, v8, 1 -; RVI-NEXT: lui a1, 349525 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: lui a1, 209715 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 61681 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v9, v8 ; RVI-NEXT: vsrl.vi v9, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v9 -; RVI-NEXT: lui a1, 61681 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: lui a1, 4112 ; RVI-NEXT: addi a1, a1, 257 @@ -205,8 +205,8 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind { ; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vfcvt.f.xu.v v8, v8 ; RVF-NEXT: fsrm a1 -; RVF-NEXT: vsrl.vi v8, v8, 23 ; RVF-NEXT: li a1, 158 +; RVF-NEXT: vsrl.vi v8, v8, 23 ; RVF-NEXT: vrsub.vx v8, v8, a1 ; RVF-NEXT: li a1, 32 ; RVF-NEXT: vminu.vx v8, v8, a1 @@ -217,8 +217,8 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVD-NEXT: vle32.v v8, (a0) -; RVD-NEXT: vfwcvt.f.xu.v v10, v8 ; RVD-NEXT: li a1, 52 +; RVD-NEXT: vfwcvt.f.xu.v v10, v8 ; RVD-NEXT: vnsrl.wx v8, v10, a1 ; RVD-NEXT: li a1, 1054 ; RVD-NEXT: vrsub.vx v8, v8, a1 @@ -247,50 +247,50 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 2 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 8 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 16 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: li a1, 32 -; RV32I-NEXT: vsrl.vx v9, v8, a1 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 1 ; RV32I-NEXT: lui a1, 349525 ; RV32I-NEXT: addi a1, a1, 1365 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 +; RV32I-NEXT: vmv.v.x v9, a1 +; RV32I-NEXT: li a1, 32 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 16 +; 
RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vx v10, v8, a1 ; RV32I-NEXT: lui a1, 209715 ; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vand.vv v9, v10, v9 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: lui a1, 4112 ; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -301,6 +301,23 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: addiw a4, a4, 257 +; RV64I-NEXT: slli a5, a1, 32 +; RV64I-NEXT: add a1, a1, a5 +; RV64I-NEXT: slli a5, a2, 32 +; RV64I-NEXT: add a2, a2, a5 +; RV64I-NEXT: slli a5, a3, 32 +; RV64I-NEXT: add a3, a3, a5 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: li a5, 32 ; RV64I-NEXT: vsrl.vi v9, v8, 1 ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 2 @@ -311,37 +328,20 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v9 -; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: vsrl.vx v9, v8, a1 +; RV64I-NEXT: vsrl.vx v9, v8, a5 ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v9, v9, a1 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v8, a1 +; RV64I-NEXT: vand.vx v9, v8, a2 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a2 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; 
RV64I-NEXT: vand.vx v8, v8, a3 +; RV64I-NEXT: vmul.vx v8, v8, a4 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -401,6 +401,7 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: li a1, 32 ; RVI-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; RVI-NEXT: vle8.v v8, (a0) +; RVI-NEXT: li a1, 85 ; RVI-NEXT: vsrl.vi v10, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 2 @@ -409,10 +410,9 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: li a1, 85 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: li a1, 51 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 @@ -428,12 +428,12 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: li a1, 32 ; RVF-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; RVF-NEXT: vle8.v v8, (a0) +; RVF-NEXT: li a1, 134 ; RVF-NEXT: vzext.vf2 v12, v8 ; RVF-NEXT: vfwcvt.f.xu.v v16, v12 ; RVF-NEXT: vnsrl.wi v8, v16, 23 ; RVF-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVF-NEXT: vnsrl.wi v12, v8, 0 -; RVF-NEXT: li a1, 134 ; RVF-NEXT: vrsub.vx v8, v12, a1 ; RVF-NEXT: li a1, 8 ; RVF-NEXT: vminu.vx v8, v8, a1 @@ -445,12 +445,12 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: li a1, 32 ; RVD-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; RVD-NEXT: vle8.v v8, (a0) +; RVD-NEXT: li a1, 134 ; RVD-NEXT: vzext.vf2 v12, v8 ; RVD-NEXT: vfwcvt.f.xu.v v16, v12 ; RVD-NEXT: vnsrl.wi v8, v16, 23 ; RVD-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVD-NEXT: vnsrl.wi v12, v8, 0 -; RVD-NEXT: li a1, 134 ; RVD-NEXT: vrsub.vx v8, v12, a1 ; RVD-NEXT: li a1, 8 ; RVD-NEXT: vminu.vx v8, v8, a1 @@ -478,6 +478,8 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind { ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVI-NEXT: vle16.v v8, (a0) +; RVI-NEXT: lui a1, 5 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vsrl.vi v10, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 2 @@ -488,20 +490,18 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: lui a1, 5 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: lui a1, 3 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 1 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v10, v8 ; RVI-NEXT: vsrl.vi v10, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v10 -; RVI-NEXT: lui a1, 1 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: li a1, 257 ; RVI-NEXT: vmul.vx v8, v8, a1 @@ -513,9 +513,9 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVF-NEXT: vle16.v v8, (a0) +; RVF-NEXT: li a1, 142 ; RVF-NEXT: vfwcvt.f.xu.v v12, v8 ; RVF-NEXT: vnsrl.wi v8, v12, 23 -; RVF-NEXT: li a1, 142 ; RVF-NEXT: vrsub.vx v8, v8, a1 ; RVF-NEXT: li a1, 16 ; RVF-NEXT: vminu.vx v8, v8, a1 @@ -526,9 +526,9 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVD-NEXT: vle16.v v8, (a0) +; RVD-NEXT: li a1, 142 ; RVD-NEXT: vfwcvt.f.xu.v v12, v8 ; RVD-NEXT: vnsrl.wi v8, v12, 23 -; RVD-NEXT: li a1, 142 ; RVD-NEXT: vrsub.vx v8, v8, a1 ; RVD-NEXT: li a1, 16 ; 
RVD-NEXT: vminu.vx v8, v8, a1 @@ -555,6 +555,8 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind { ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVI-NEXT: vle32.v v8, (a0) +; RVI-NEXT: lui a1, 349525 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vsrl.vi v10, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 2 @@ -567,20 +569,18 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: lui a1, 349525 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: lui a1, 209715 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 61681 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v10, v8 ; RVI-NEXT: vsrl.vi v10, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v10 -; RVI-NEXT: lui a1, 61681 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: lui a1, 4112 ; RVI-NEXT: addi a1, a1, 257 @@ -596,8 +596,8 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind { ; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vfcvt.f.xu.v v8, v8 ; RVF-NEXT: fsrm a1 -; RVF-NEXT: vsrl.vi v8, v8, 23 ; RVF-NEXT: li a1, 158 +; RVF-NEXT: vsrl.vi v8, v8, 23 ; RVF-NEXT: vrsub.vx v8, v8, a1 ; RVF-NEXT: li a1, 32 ; RVF-NEXT: vminu.vx v8, v8, a1 @@ -608,8 +608,8 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVD-NEXT: vle32.v v8, (a0) -; RVD-NEXT: vfwcvt.f.xu.v v12, v8 ; RVD-NEXT: li a1, 52 +; RVD-NEXT: vfwcvt.f.xu.v v12, v8 ; RVD-NEXT: vnsrl.wx v8, v12, a1 ; RVD-NEXT: li a1, 1054 ; RVD-NEXT: vrsub.vx v8, v8, a1 @@ -638,50 +638,50 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 2 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 8 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 16 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: li a1, 32 -; RV32I-NEXT: vsrl.vx v10, v8, a1 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 1 ; RV32I-NEXT: lui a1, 349525 ; RV32I-NEXT: addi a1, a1, 1365 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v12, a1 +; RV32I-NEXT: vmv.v.x v10, a1 +; RV32I-NEXT: li a1, 32 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vx v12, v8, a1 ; RV32I-NEXT: lui a1, 209715 ; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vand.vv v10, v12, v10 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, 
v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vmv.v.x v12, a1 ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32I-NEXT: vmv.v.x v12, a1 ; RV32I-NEXT: lui a1, 4112 ; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -692,6 +692,23 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: addiw a4, a4, 257 +; RV64I-NEXT: slli a5, a1, 32 +; RV64I-NEXT: add a1, a1, a5 +; RV64I-NEXT: slli a5, a2, 32 +; RV64I-NEXT: add a2, a2, a5 +; RV64I-NEXT: slli a5, a3, 32 +; RV64I-NEXT: add a3, a3, a5 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: li a5, 32 ; RV64I-NEXT: vsrl.vi v10, v8, 1 ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 2 @@ -702,37 +719,20 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v10 -; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: vsrl.vx v10, v8, a1 +; RV64I-NEXT: vsrl.vx v10, v8, a5 ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v10, v10, a1 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v8, a1 +; RV64I-NEXT: vand.vx v10, v8, a2 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a2 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a3 +; RV64I-NEXT: vmul.vx v8, v8, a4 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -791,6 +791,7 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RVI-NEXT: vle8.v v8, (a0) +; RVI-NEXT: li a1, 85 ; RVI-NEXT: vsrl.vi v9, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, v8, 2 @@ -799,10 
+800,9 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v9, v8, 1 -; RVI-NEXT: li a1, 85 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: li a1, 51 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 @@ -860,6 +860,8 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVI-NEXT: vle16.v v8, (a0) +; RVI-NEXT: lui a1, 5 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vsrl.vi v9, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, v8, 2 @@ -870,20 +872,18 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v9, v8, 1 -; RVI-NEXT: lui a1, 5 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: lui a1, 3 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 1 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v9, v8 ; RVI-NEXT: vsrl.vi v9, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v9 -; RVI-NEXT: lui a1, 1 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: li a1, 257 ; RVI-NEXT: vmul.vx v8, v8, a1 @@ -932,6 +932,8 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVI-NEXT: vle32.v v8, (a0) +; RVI-NEXT: lui a1, 349525 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vsrl.vi v9, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, v8, 2 @@ -944,20 +946,18 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v9 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v9, v8, 1 -; RVI-NEXT: lui a1, 349525 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: lui a1, 209715 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 61681 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v9, v8 ; RVI-NEXT: vsrl.vi v9, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v9 -; RVI-NEXT: lui a1, 61681 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: lui a1, 4112 ; RVI-NEXT: addi a1, a1, 257 @@ -983,8 +983,8 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVD-NEXT: vle32.v v8, (a0) -; RVD-NEXT: vfwcvt.f.xu.v v10, v8 ; RVD-NEXT: li a1, 52 +; RVD-NEXT: vfwcvt.f.xu.v v10, v8 ; RVD-NEXT: vnsrl.wx v8, v10, a1 ; RVD-NEXT: li a1, 1054 ; RVD-NEXT: vrsub.vx v8, v8, a1 @@ -1010,50 +1010,50 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 2 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 8 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 16 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: li a1, 32 -; RV32I-NEXT: vsrl.vx v9, v8, a1 -; RV32I-NEXT: vor.vv v8, v8, v9 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vsrl.vi 
v9, v8, 1 ; RV32I-NEXT: lui a1, 349525 ; RV32I-NEXT: addi a1, a1, 1365 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 +; RV32I-NEXT: vmv.v.x v9, a1 +; RV32I-NEXT: li a1, 32 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vx v10, v8, a1 ; RV32I-NEXT: lui a1, 209715 ; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vand.vv v9, v10, v9 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: lui a1, 4112 ; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -1064,6 +1064,23 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: addiw a4, a4, 257 +; RV64I-NEXT: slli a5, a1, 32 +; RV64I-NEXT: add a1, a1, a5 +; RV64I-NEXT: slli a5, a2, 32 +; RV64I-NEXT: add a2, a2, a5 +; RV64I-NEXT: slli a5, a3, 32 +; RV64I-NEXT: add a3, a3, a5 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: li a5, 32 ; RV64I-NEXT: vsrl.vi v9, v8, 1 ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 2 @@ -1074,37 +1091,20 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v9 -; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: vsrl.vx v9, v8, a1 +; RV64I-NEXT: vsrl.vx v9, v8, a5 ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v9, v9, a1 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: 
addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v8, a1 +; RV64I-NEXT: vand.vx v9, v8, a2 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a2 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a3 +; RV64I-NEXT: vmul.vx v8, v8, a4 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -1158,6 +1158,7 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: li a1, 32 ; RVI-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; RVI-NEXT: vle8.v v8, (a0) +; RVI-NEXT: li a1, 85 ; RVI-NEXT: vsrl.vi v10, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 2 @@ -1166,10 +1167,9 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: li a1, 85 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: li a1, 51 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 @@ -1230,6 +1230,8 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVI-NEXT: vle16.v v8, (a0) +; RVI-NEXT: lui a1, 5 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vsrl.vi v10, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 2 @@ -1240,20 +1242,18 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: lui a1, 5 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: lui a1, 3 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 1 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v10, v8 ; RVI-NEXT: vsrl.vi v10, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v10 -; RVI-NEXT: lui a1, 1 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: li a1, 257 ; RVI-NEXT: vmul.vx v8, v8, a1 @@ -1302,6 +1302,8 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVI: # %bb.0: ; RVI-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVI-NEXT: vle32.v v8, (a0) +; RVI-NEXT: lui a1, 349525 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vsrl.vi v10, v8, 1 ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 2 @@ -1314,20 +1316,18 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vor.vv v8, v8, v10 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: lui a1, 349525 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: lui a1, 209715 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 61681 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v10, v8 ; RVI-NEXT: vsrl.vi v10, v8, 4 ; RVI-NEXT: vadd.vv 
v8, v8, v10 -; RVI-NEXT: lui a1, 61681 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: lui a1, 4112 ; RVI-NEXT: addi a1, a1, 257 @@ -1353,8 +1353,8 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVD-NEXT: vle32.v v8, (a0) -; RVD-NEXT: vfwcvt.f.xu.v v12, v8 ; RVD-NEXT: li a1, 52 +; RVD-NEXT: vfwcvt.f.xu.v v12, v8 ; RVD-NEXT: vnsrl.wx v8, v12, a1 ; RVD-NEXT: li a1, 1054 ; RVD-NEXT: vrsub.vx v8, v8, a1 @@ -1380,50 +1380,50 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 2 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 8 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 16 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: li a1, 32 -; RV32I-NEXT: vsrl.vx v10, v8, a1 -; RV32I-NEXT: vor.vv v8, v8, v10 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 1 ; RV32I-NEXT: lui a1, 349525 ; RV32I-NEXT: addi a1, a1, 1365 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v12, a1 +; RV32I-NEXT: vmv.v.x v10, a1 +; RV32I-NEXT: li a1, 32 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vx v12, v8, a1 ; RV32I-NEXT: lui a1, 209715 ; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vand.vv v10, v12, v10 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vmv.v.x v12, a1 ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32I-NEXT: vmv.v.x v12, a1 ; RV32I-NEXT: lui a1, 4112 ; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -1434,6 +1434,23 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) +; RV64I-NEXT: lui a1, 
349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: addiw a4, a4, 257 +; RV64I-NEXT: slli a5, a1, 32 +; RV64I-NEXT: add a1, a1, a5 +; RV64I-NEXT: slli a5, a2, 32 +; RV64I-NEXT: add a2, a2, a5 +; RV64I-NEXT: slli a5, a3, 32 +; RV64I-NEXT: add a3, a3, a5 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: li a5, 32 ; RV64I-NEXT: vsrl.vi v10, v8, 1 ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 2 @@ -1444,37 +1461,20 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 16 ; RV64I-NEXT: vor.vv v8, v8, v10 -; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: vsrl.vx v10, v8, a1 +; RV64I-NEXT: vsrl.vx v10, v8, a5 ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v10, v10, a1 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v8, a1 +; RV64I-NEXT: vand.vx v10, v8, a2 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a2 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a3 +; RV64I-NEXT: vmul.vx v8, v8, a4 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll index a8ae3389fb2a5..5e73e6df9170c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll @@ -13,8 +13,8 @@ define <2 x i8> @vp_ctpop_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -34,8 +34,8 @@ define <2 x i8> @vp_ctpop_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -57,8 +57,8 @@ define <4 x i8> @vp_ctpop_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -78,8 +78,8 @@ define <4 x i8> 
@vp_ctpop_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -101,8 +101,8 @@ define <8 x i8> @vp_ctpop_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -122,8 +122,8 @@ define <8 x i8> @vp_ctpop_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -145,8 +145,8 @@ define <16 x i8> @vp_ctpop_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -166,8 +166,8 @@ define <16 x i8> @vp_ctpop_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { ; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -190,17 +190,17 @@ define <2 x i16> @vp_ctpop_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -218,17 +218,17 @@ define <2 x i16> @vp_ctpop_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -248,17 +248,17 @@ define <4 x i16> @vp_ctpop_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 5 ; 
CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -276,17 +276,17 @@ define <4 x i16> @vp_ctpop_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -306,17 +306,17 @@ define <8 x i16> @vp_ctpop_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -334,17 +334,17 @@ define <8 x i16> @vp_ctpop_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -364,17 +364,17 @@ define <16 x i16> @vp_ctpop_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %ev ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, 
-241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -392,17 +392,17 @@ define <16 x i16> @vp_ctpop_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -422,17 +422,17 @@ define <2 x i32> @vp_ctpop_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -451,17 +451,17 @@ define <2 x i32> @vp_ctpop_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -482,17 +482,17 @@ define <4 x i32> @vp_ctpop_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -511,17 +511,17 @@ define <4 x i32> @vp_ctpop_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; 
CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -542,17 +542,17 @@ define <8 x i32> @vp_ctpop_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -571,17 +571,17 @@ define <8 x i32> @vp_ctpop_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -602,17 +602,17 @@ define <16 x i32> @vp_ctpop_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %ev ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t ; CHECK-NEXT: vand.vx v12, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v12, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -631,17 +631,17 @@ define <16 x i32> @vp_ctpop_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -663,67 +663,67 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: 
addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v9, v10, v0.t +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9, v0.t +; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v9, v0.t ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_v2i64: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t +; RV64-NEXT: vand.vx v9, v9, a1, v0.t ; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t +; RV64-NEXT: vand.vx v9, v8, a2, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vadd.vv v8, v9, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a3, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a4, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <2 x i64> 
@llvm.vp.ctpop.v2i64(<2 x i64> %va, <2 x i1> %m, i32 %evl) @@ -739,31 +739,31 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsub.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -771,34 +771,34 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctpop_v2i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vand.vx v9, v9, a1 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v9, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -817,67 +817,67 @@ define <4 x i64> @vp_ctpop_v4i64(<4 
x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v10, v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_v4i64: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t +; RV64-NEXT: vand.vx v10, v10, a1, v0.t ; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vand.vx v10, v8, a2, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vadd.vv v8, v10, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a3, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a4, v0.t 
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <4 x i64> @llvm.vp.ctpop.v4i64(<4 x i64> %va, <4 x i1> %m, i32 %evl) @@ -893,31 +893,31 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v10, v12 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -925,34 +925,34 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctpop_v4i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vand.vx v10, v10, a1 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vx v10, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: li a0, 56 ; 
RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -971,67 +971,67 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v16, v12, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: vand.vv v16, v8, v12, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v12, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_v8i64: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t +; RV64-NEXT: vand.vx v12, v12, a1, v0.t ; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vand.vx v12, v8, a2, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vadd.vv v8, v12, v8, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a3, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, 
a4, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <8 x i64> @llvm.vp.ctpop.v8i64(<8 x i64> %va, <8 x i1> %m, i32 %evl) @@ -1047,31 +1047,31 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsub.vv v8, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1079,34 +1079,34 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctpop_v8i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vand.vx v12, v12, a1 ; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vx v12, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; 
RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1122,10 +1122,11 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) @@ -1144,66 +1145,41 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev ; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, 
v0.t -; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v24, v24, v16, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: li a0, 56 +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 ; RV32-NEXT: addi sp, sp, 48 @@ -1212,35 +1188,35 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev ; ; RV64-LABEL: vp_ctpop_v15i64: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a2, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a3, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a4, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl) @@ -1265,28 +1241,31 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v0, (a1), zero ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: addi a1, 
sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v0, v16, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vadd.vv v8, v0, v8 ; RV32-NEXT: vsrl.vi v0, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: vand.vv v8, v8, v16 @@ -1299,34 +1278,34 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctpop_v15i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vx v16, v16, a1 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1342,10 +1321,11 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) @@ -1364,66 +1344,41 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev ; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v 
v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v24, v24, v16, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: li a0, 56 +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 ; RV32-NEXT: addi sp, sp, 48 @@ -1432,35 +1387,35 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev ; ; RV64-LABEL: vp_ctpop_v16i64: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 
32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a2, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a3, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a4, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64> %va, <16 x i1> %m, i32 %evl) @@ -1485,28 +1440,31 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a1), zero +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v0, v16, v0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: mv a1, sp ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vadd.vv v8, v0, v8 ; RV32-NEXT: vsrl.vi v0, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: vand.vv v8, v8, v16 @@ -1519,34 +1477,34 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctpop_v16i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: 
add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vx v16, v16, a1 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1567,111 +1525,110 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v7, v0, 2 ; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: addi a1, a2, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB34_2 +; RV32-NEXT: bltu a0, a3, .LBB34_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB34_2: ; RV32-NEXT: addi a2, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a2), zero +; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, 
a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v16, v24, v8, v0.t +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: addi a2, sp, 24 ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a2), zero +; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 40 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v16, 4, v0.t -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 @@ -1690,36 +1647,36 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: 
addi a2, a2, 48 ; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v24, v8, v0.t ; RV32-NEXT: vsub.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 @@ -1764,58 +1721,58 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v16, v16, a1, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 +; RV64-NEXT: addiw a1, a1, 1365 ; RV64-NEXT: addiw a2, a2, 819 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v8, a2, v0.t +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a5, a1, a5 +; RV64-NEXT: slli a1, a2, 32 +; RV64-NEXT: add a6, a2, a1 +; RV64-NEXT: slli a1, a3, 32 +; RV64-NEXT: add a1, a3, a1 +; RV64-NEXT: slli a2, a4, 32 +; RV64-NEXT: add a2, a4, a2 +; RV64-NEXT: addi a3, a0, -16 +; RV64-NEXT: sltu a0, a0, a3 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: vand.vx v16, v16, a5, v0.t +; RV64-NEXT: vsub.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v8, a6, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a2, v0.t +; RV64-NEXT: vand.vx v8, v8, a6, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a3, 61681 -; RV64-NEXT: addiw a3, a3, -241 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v8, v8, a3, v0.t -; RV64-NEXT: lui a4, 4112 -; RV64-NEXT: addiw a4, a4, 257 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vmul.vx v8, v8, a4, v0.t -; RV64-NEXT: li a5, 56 -; RV64-NEXT: vsrl.vx v8, v8, a5, v0.t -; RV64-NEXT: addi a6, sp, 16 -; RV64-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill -; RV64-NEXT: addi a6, a0, -16 -; RV64-NEXT: sltu a0, a0, a6 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a6 +; RV64-NEXT: 
vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vmul.vx v8, v8, a2, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a3, v0.t +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: csrr a6, vlenb -; RV64-NEXT: slli a6, a6, 3 -; RV64-NEXT: add a6, sp, a6 -; RV64-NEXT: addi a6, a6, 16 -; RV64-NEXT: vl8r.v v8, (a6) # Unknown-size Folded Reload +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 3 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vand.vx v16, v16, a5, v0.t ; RV64-NEXT: vsub.vv v16, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v16, a2, v0.t +; RV64-NEXT: vand.vx v8, v16, a6, v0.t ; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v16, v16, a6, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t -; RV64-NEXT: vmul.vx v8, v8, a4, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a5, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vmul.vx v8, v8, a2, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a3, v0.t ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: csrr a0, vlenb @@ -1835,88 +1792,141 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb +; RV32-NEXT: vmv8r.v v24, v16 ; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: addi a1, a2, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB35_2 +; RV32-NEXT: bltu a0, a3, .LBB35_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: addi a2, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: addi a2, sp, 32 -; RV32-NEXT: vlse64.v v24, (a2), zero -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v0, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v0 -; RV32-NEXT: vand.vv v0, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 
-; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v0, v8 -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: addi a2, a0, -16 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a2), zero ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vmv8r.v v8, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: vsub.vv v16, v8, v16 -; RV32-NEXT: vand.vv v0, v16, v24 +; RV32-NEXT: vsrl.vi v24, v24, 1 +; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v16, v24, v16 +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v16, v0 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v0, v8, v0 +; RV32-NEXT: addi a2, sp, 24 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vadd.vv v16, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a2), zero +; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v0, v16 +; RV32-NEXT: vadd.vv v24, v24, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 4 
+; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v0, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 4 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v24, 4 +; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vand.vv v24, v24, v8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v8, v16, v8 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v16, v0 +; RV32-NEXT: vmul.vv v16, v24, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v24, v8, v0 ; RV32-NEXT: li a2, 56 @@ -1925,7 +1935,8 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v16, v24, a2 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 ; RV32-NEXT: addi sp, sp, 48 @@ -1942,51 +1953,61 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: .LBB35_2: ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v24, v24, a1 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a2, 209715 -; RV64-NEXT: addiw a2, a2, 819 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v24, v8, a2 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a3, 61681 -; RV64-NEXT: addiw a3, a3, -241 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: lui a4, 4112 -; RV64-NEXT: addiw a4, a4, 257 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vmul.vx v8, v8, a4 -; RV64-NEXT: li a5, 56 -; RV64-NEXT: vsrl.vx v8, v8, a5 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: addiw a3, a3, 819 +; RV64-NEXT: addiw a4, a4, -241 +; RV64-NEXT: addiw a5, a5, 257 +; RV64-NEXT: slli a6, a2, 32 +; RV64-NEXT: add a2, a2, a6 +; RV64-NEXT: slli a6, a3, 32 +; RV64-NEXT: add a3, a3, a6 +; RV64-NEXT: slli a6, a4, 32 +; RV64-NEXT: add a4, a4, a6 +; RV64-NEXT: slli a6, a5, 32 +; RV64-NEXT: add a5, a5, a6 ; RV64-NEXT: addi a6, a0, -16 ; RV64-NEXT: sltu a0, a0, a6 ; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: and a0, a0, a6 +; RV64-NEXT: li a6, 56 +; RV64-NEXT: vand.vx v24, v24, a2 +; RV64-NEXT: vsub.vv v8, v8, v24 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vand.vx v24, v24, a2 ; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx 
v24, v16, a2 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v8, a3 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v16, a3 ; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a2 +; RV64-NEXT: vand.vx v16, v16, a3 ; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v8, v8, a4 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a3 -; RV64-NEXT: vmul.vx v16, v16, a4 -; RV64-NEXT: vsrl.vx v16, v16, a5 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a5 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vand.vx v16, v16, a4 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vx v8, v8, a6 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmul.vx v16, v16, a5 +; RV64-NEXT: vsrl.vx v16, v16, a6 ; RV64-NEXT: ret %v = call <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll index b5114bbe49189..4fbe67cfcd642 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll @@ -9,11 +9,11 @@ define void @ctpop_v16i8(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a1, 85 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a1, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a1 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a1 @@ -44,21 +44,21 @@ define void @ctpop_v8i16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: lui a1, 5 ; CHECK-NEXT: addi a1, a1, 1365 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a1, 3 ; CHECK-NEXT: addi a1, a1, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a1 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: lui a1, 1 +; CHECK-NEXT: addi a1, a1, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a1, 1 -; CHECK-NEXT: addi a1, a1, -241 ; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: li a1, 257 ; CHECK-NEXT: vmul.vx v8, v8, a1 @@ -86,21 +86,21 @@ define void @ctpop_v4i32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: lui a1, 349525 ; CHECK-NEXT: addi a1, a1, 1365 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a1, 209715 ; CHECK-NEXT: addi a1, a1, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a1 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: lui a1, 61681 +; CHECK-NEXT: addi 
a1, a1, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a1, 61681 -; CHECK-NEXT: addi a1, a1, -241 ; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: lui a1, 4112 ; CHECK-NEXT: addi a1, a1, 257 @@ -133,32 +133,32 @@ define void @ctpop_v2i64(ptr %x, ptr %y) { ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: vand.vv v9, v10, v9 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsub.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1 @@ -169,33 +169,33 @@ define void @ctpop_v2i64(ptr %x, ptr %y) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 ; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: vand.vx v9, v9, a1 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a1, 209715 -; RV64-NEXT: addiw a1, a1, 819 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v9, v8, a1 +; RV64-NEXT: vand.vx v9, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a1, 61681 -; RV64-NEXT: addiw a1, a1, -241 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: lui a1, 4112 -; RV64-NEXT: addiw a1, a1, 257 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vmul.vx v8, v8, a1 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: li a1, 56 ; RV64-NEXT: vsrl.vx v8, v8, a1 ; 
RV64-NEXT: vse64.v v8, (a0) @@ -222,11 +222,11 @@ define void @ctpop_v32i8(ptr %x, ptr %y) { ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: li a1, 85 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: vand.vx v10, v10, a1 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: li a1, 51 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a1 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a1 @@ -258,21 +258,21 @@ define void @ctpop_v16i16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: lui a1, 5 ; CHECK-NEXT: addi a1, a1, 1365 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: vand.vx v10, v10, a1 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a1, 3 ; CHECK-NEXT: addi a1, a1, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a1 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: lui a1, 1 +; CHECK-NEXT: addi a1, a1, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a1, 1 -; CHECK-NEXT: addi a1, a1, -241 ; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: li a1, 257 ; CHECK-NEXT: vmul.vx v8, v8, a1 @@ -300,21 +300,21 @@ define void @ctpop_v8i32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: lui a1, 349525 ; CHECK-NEXT: addi a1, a1, 1365 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: vand.vx v10, v10, a1 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a1, 209715 ; CHECK-NEXT: addi a1, a1, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a1 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: lui a1, 61681 +; CHECK-NEXT: addi a1, a1, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a1, 61681 -; CHECK-NEXT: addi a1, a1, -241 ; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: lui a1, 4112 ; CHECK-NEXT: addi a1, a1, 257 @@ -439,32 +439,32 @@ define void @ctpop_v4i64(ptr %x, ptr %y) { ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: vand.vv v10, v12, v10 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetivli zero, 4, 
e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1 @@ -475,33 +475,33 @@ define void @ctpop_v4i64(ptr %x, ptr %y) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 209715 +; RV64-NEXT: lui a3, 61681 +; RV64-NEXT: lui a4, 4112 ; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: addiw a3, a3, -241 +; RV64-NEXT: addiw a4, a4, 257 +; RV64-NEXT: slli a5, a1, 32 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a4, 32 +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: vand.vx v10, v10, a1 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a1, 209715 -; RV64-NEXT: addiw a1, a1, 819 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v10, v8, a1 +; RV64-NEXT: vand.vx v10, v8, a2 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vand.vx v8, v8, a2 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a1, 61681 -; RV64-NEXT: addiw a1, a1, -241 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: lui a1, 4112 -; RV64-NEXT: addiw a1, a1, 257 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vmul.vx v8, v8, a1 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vmul.vx v8, v8, a4 ; RV64-NEXT: li a1, 56 ; RV64-NEXT: vsrl.vx v8, v8, a1 ; RV64-NEXT: vse64.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index d4c4ea7fee184..cd4b19f11d160 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -12,13 +12,13 @@ define <2 x i8> @vp_cttz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -36,14 +36,14 @@ define <2 x i8> @vp_cttz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, 
v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -64,13 +64,13 @@ define <4 x i8> @vp_cttz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -88,14 +88,14 @@ define <4 x i8> @vp_cttz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -116,13 +116,13 @@ define <8 x i8> @vp_cttz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -140,14 +140,14 @@ define <8 x i8> @vp_cttz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -168,13 +168,13 @@ define <16 x i8> @vp_cttz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -192,14 +192,14 @@ define <16 x i8> @vp_cttz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vsub.vx v9, 
v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -220,23 +220,23 @@ define <2 x i16> @vp_cttz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -251,24 +251,24 @@ define <2 x i16> @vp_cttz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -286,23 +286,23 @@ define <4 x i16> @vp_cttz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: 
addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -317,24 +317,24 @@ define <4 x i16> @vp_cttz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -352,23 +352,23 @@ define <8 x i16> @vp_cttz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -383,24 +383,24 @@ define <8 x i16> @vp_cttz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -418,23 +418,23 @@ define <16 x i16> @vp_cttz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vsub.vx v10, v8, a1, v0.t +; CHECK-NEXT: lui a0, 5 ; 
CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v10, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -449,24 +449,24 @@ define <16 x i16> @vp_cttz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vsub.vx v10, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vnot.v v10, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v10, v8 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -484,23 +484,23 @@ define <2 x i32> @vp_cttz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -516,24 +516,24 @@ define <2 x i32> @vp_cttz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: 
vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -552,23 +552,23 @@ define <4 x i32> @vp_cttz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -584,24 +584,24 @@ define <4 x i32> @vp_cttz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -620,23 +620,23 @@ define <8 x i32> @vp_cttz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vsub.vx v10, v8, a1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v10, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8, v0.t ; CHECK-NEXT: 
vsrl.vi v10, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -652,24 +652,24 @@ define <8 x i32> @vp_cttz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vsub.vx v10, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vnot.v v10, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v10, v8 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -688,23 +688,23 @@ define <16 x i32> @vp_cttz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vsub.vx v12, v8, a1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v12, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t ; CHECK-NEXT: vand.vx v12, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v12, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -720,24 +720,24 @@ define <16 x i32> @vp_cttz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vsub.vx v12, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: vnot.v v12, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v12, v8 +; CHECK-NEXT: vsrl.vi v12, v8, 1 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -756,78 +756,78 @@ define <2 x i64> 
@vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t ; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vand.vv v9, v9, v10, v0.t +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9, v0.t +; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v9, v0.t ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v2i64: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsub.vx v9, v8, a1, v0.t +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v9, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v9, v9, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t +; RV64-NEXT: vand.vx v9, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v9, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, 
v9, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> %va, i1 false, <2 x i1> %m, i32 %evl) @@ -839,39 +839,39 @@ define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32: # %bb.0: ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vnot.v v9, v8 +; RV32-NEXT: vsub.vx v8, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vand.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsub.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -880,37 +880,37 @@ define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV64-LABEL: vp_cttz_v2i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, 
a0, a1 ; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v9, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -926,78 +926,78 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsub.vx v10, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t ; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vand.vv v10, v10, v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v4i64: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsub.vx v10, v8, a1, v0.t +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: 
addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v10, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v10, v10, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vand.vx v10, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v10, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> %va, i1 false, <4 x i1> %m, i32 %evl) @@ -1009,39 +1009,39 @@ define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32: # %bb.0: ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vnot.v v10, v8 +; RV32-NEXT: vsub.vx v8, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vand.vv v10, v10, v12 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12 ; 
RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1050,37 +1050,37 @@ define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV64-LABEL: vp_cttz_v4i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v10, v10, a0 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vx v10, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1096,78 +1096,78 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsub.vx v12, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vand.vv v12, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v12, 1, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsub.vv v12, v12, v16, v0.t +; RV32-NEXT: vand.vv v16, v12, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v12, 2, v0.t +; RV32-NEXT: vand.vv v8, v12, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, 
m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v12, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsub.vx v12, v8, a1, v0.t +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v12, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v12, v12, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vand.vx v12, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v12, v8, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> %va, i1 false, <8 x i1> %m, i32 %evl) @@ -1179,39 +1179,39 @@ define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32: # %bb.0: ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vnot.v v12, v8 +; RV32-NEXT: vsub.vx v8, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; 
RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsub.vv v8, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1220,37 +1220,37 @@ define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV64-LABEL: vp_cttz_v8i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsub.vx v12, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vx v12, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1269,6 +1269,9 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) @@ -1285,59 +1288,60 @@ define <15 x i64> 
@vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v24, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: li a0, 56 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -1350,38 +1354,38 @@ define <15 x 
i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl ; RV64-LABEL: vp_cttz_v15i64: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl) @@ -1393,6 +1397,9 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -32 ; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 24(sp) @@ -1406,36 +1413,35 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v0, (a1), zero ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vsrl.vi v8, v16, 1 +; RV32-NEXT: vand.vv v0, v8, v0 ; RV32-NEXT: vsetivli zero, 16, 
e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v16, v16, v0 +; RV32-NEXT: vand.vv v0, v16, v24 +; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: vand.vv v8, v16, v8 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1446,37 +1452,37 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV64-LABEL: vp_cttz_v15i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1495,6 +1501,9 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) @@ -1511,59 +1520,60 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size 
Folded Spill ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v24, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: li a0, 56 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -1576,38 +1586,38 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl ; RV64-LABEL: vp_cttz_v16i64: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; 
RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl) @@ -1619,6 +1629,9 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -32 ; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 24(sp) @@ -1632,36 +1645,35 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v0, (a1), zero ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vsrl.vi v8, v16, 1 +; RV32-NEXT: vand.vv v0, v8, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v16, v16, v0 +; RV32-NEXT: vand.vv v0, v16, v24 +; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: 
vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: vand.vv v8, v16, v8 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1672,37 +1684,37 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV64-LABEL: vp_cttz_v16i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1718,50 +1730,51 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v24, v0, 2 +; RV32-NEXT: vslidedown.vi v7, v0, 2 ; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 
28(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: li a3, 16 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: li a1, 16 +; RV32-NEXT: addi a2, a2, 257 +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a3, .LBB34_2 +; RV32-NEXT: bltu a0, a1, .LBB34_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB34_2: ; RV32-NEXT: li a1, 1 +; RV32-NEXT: addi a3, sp, 40 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 24 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 48 +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb @@ -1773,72 +1786,40 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a3, sp, 32 ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v24, v16, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t ; RV32-NEXT: 
csrr a3, vlenb ; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 24 @@ -1855,16 +1836,13 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a3, a3, a5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: addi a3, sp, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill @@ -1874,23 +1852,15 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v16, v8, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t @@ -1903,7 +1873,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: vmv1r.v v0, v24 +; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 @@ -1913,84 +1883,49 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsub.vx v8, v16, a1, v0.t ; RV32-NEXT: vnot.v v16, v16, v0.t ; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; 
RV32-NEXT: vand.vv v16, v24, v16, v0.t +; RV32-NEXT: vsub.vv v24, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v24, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2002,7 +1937,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 @@ -2026,73 +1961,73 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: li a1, 16 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a4, a0 ; RV64-NEXT: bltu a0, a1, .LBB34_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a2, 16 +; RV64-NEXT: li a4, 16 ; RV64-NEXT: .LBB34_2: ; RV64-NEXT: li a1, 1 -; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 
+; RV64-NEXT: lui a5, 61681 +; RV64-NEXT: lui a6, 4112 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: addiw a3, a3, 819 +; RV64-NEXT: addiw a7, a5, -241 +; RV64-NEXT: addiw t0, a6, 257 +; RV64-NEXT: slli a6, a2, 32 +; RV64-NEXT: add a6, a2, a6 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a5, a3, a5 +; RV64-NEXT: slli a2, a7, 32 +; RV64-NEXT: add a2, a7, a2 +; RV64-NEXT: slli a3, t0, 32 +; RV64-NEXT: add a3, t0, a3 +; RV64-NEXT: addi a7, a0, -16 +; RV64-NEXT: sltu a0, a0, a7 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a7, a0, a7 +; RV64-NEXT: li a0, 56 +; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v16, v16, a6, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, v8, a5, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: vand.vx v8, v8, a5, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t -; RV64-NEXT: addi a7, sp, 16 -; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 +; RV64-NEXT: vand.vx v8, v8, a2, v0.t +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: slli a7, a7, 3 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vl8r.v v8, (a7) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 3 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, a7, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v16, v16, a6, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, v8, a5, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: vand.vx v8, v8, a5, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t ; RV64-NEXT: addi a0, sp, 16 ; 
RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: csrr a0, vlenb @@ -2112,105 +2047,102 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb -; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb ; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: addi a1, a2, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB35_2 +; RV32-NEXT: bltu a0, a3, .LBB35_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: ; RV32-NEXT: li a2, 1 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v24, v8, a2 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v0, v8, v24 +; RV32-NEXT: vnot.v v0, v8 ; RV32-NEXT: addi a3, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v0, 1 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vsub.vv v24, v0, v24 -; RV32-NEXT: vand.vv v0, v24, v8 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v24, v24, v8 -; RV32-NEXT: vadd.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v24, 4 -; RV32-NEXT: vadd.vv v24, v24, v0 -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v0, (a3) # Unknown-size Folded Reload +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v8, v8, a2 +; RV32-NEXT: vand.vv v8, v0, v8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v0, v16, a2 +; RV32-NEXT: vnot.v v16, v16 +; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vsub.vv v0, v8, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v24, v0, a2 -; RV32-NEXT: vnot.v v0, v0 -; RV32-NEXT: vand.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v24, 1 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v16, 1 +; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsetivli 
zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: vsub.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v16, v8 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v16, v16, v24 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v0, v8 +; RV32-NEXT: vsrl.vi v0, v0, 2 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: vadd.vv v24, v24, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v0, v16, v8 ; RV32-NEXT: vsrl.vi v16, v16, 2 ; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v24, 4 +; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: addi a4, sp, 48 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vadd.vv v8, v0, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a2), zero +; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v0, v24 +; RV32-NEXT: vmul.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v24, v8, v24 +; RV32-NEXT: vmul.vv v24, v8, v0 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v16, v24, a2 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 ; RV32-NEXT: addi sp, sp, 48 @@ -2227,58 +2159,68 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: .LBB35_2: ; RV64-NEXT: li a2, 1 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v8, a2 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v24 +; RV64-NEXT: vnot.v v24, v8 +; RV64-NEXT: lui a3, 349525 +; RV64-NEXT: lui a4, 209715 +; RV64-NEXT: lui a5, 61681 +; RV64-NEXT: lui a6, 4112 +; RV64-NEXT: addiw a3, a3, 1365 +; RV64-NEXT: addiw a4, a4, 819 +; RV64-NEXT: addiw a5, a5, -241 +; RV64-NEXT: addiw a6, a6, 257 +; RV64-NEXT: slli a7, a3, 32 +; RV64-NEXT: add a3, a3, a7 +; RV64-NEXT: slli a7, a4, 32 +; RV64-NEXT: add a4, a4, a7 +; RV64-NEXT: slli a7, a5, 32 +; RV64-NEXT: add a5, a5, a7 +; RV64-NEXT: slli a7, a6, 32 +; RV64-NEXT: add a6, a6, a7 +; RV64-NEXT: addi a7, a0, -16 +; RV64-NEXT: sltu a0, a0, a7 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a7 +; RV64-NEXT: li a7, 56 +; RV64-NEXT: vsub.vx v8, v8, a2 +; RV64-NEXT: vand.vv v8, v24, v8 ; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a3, a1, 32 -; RV64-NEXT: add a1, a1, a3 -; RV64-NEXT: vand.vx v24, 
v24, a1 +; RV64-NEXT: vand.vx v24, v24, a3 ; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v8, a3 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4 -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5 -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6 -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v24, v16, a2 ; RV64-NEXT: vnot.v v16, v16 ; RV64-NEXT: vand.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v8, a4 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a4 +; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vand.vx v24, v24, a3 ; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx v24, v16, a3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v16, a4 ; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vand.vx v16, v16, a4 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v8, v8, a5 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a6 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vx v8, v8, a7 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a4 -; RV64-NEXT: vmul.vx v16, v16, a5 -; RV64-NEXT: vsrl.vx v16, v16, a6 +; RV64-NEXT: vand.vx v16, v16, a5 +; RV64-NEXT: vmul.vx v16, v16, a6 +; RV64-NEXT: vsrl.vx v16, v16, a7 ; RV64-NEXT: ret %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v @@ -2290,13 +2232,13 @@ define <2 x i8> @vp_cttz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -2314,14 +2256,14 @@ define <2 x i8> @vp_cttz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; 
CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -2340,13 +2282,13 @@ define <4 x i8> @vp_cttz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -2364,14 +2306,14 @@ define <4 x i8> @vp_cttz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -2390,13 +2332,13 @@ define <8 x i8> @vp_cttz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -2414,14 +2356,14 @@ define <8 x i8> @vp_cttz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -2440,13 +2382,13 @@ define <16 x i8> @vp_cttz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vnot.v v8, v8, v0.t ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx 
v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t @@ -2464,14 +2406,14 @@ define <16 x i8> @vp_cttz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: li a0, 85 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 @@ -2490,23 +2432,23 @@ define <2 x i16> @vp_cttz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -2521,24 +2463,24 @@ define <2 x i16> @vp_cttz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -2554,23 +2496,23 @@ define <4 x i16> @vp_cttz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; 
CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -2585,24 +2527,24 @@ define <4 x i16> @vp_cttz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -2618,23 +2560,23 @@ define <8 x i16> @vp_cttz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -2649,24 +2591,24 @@ define <8 x i16> @vp_cttz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, 
a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -2682,23 +2624,23 @@ define <16 x i16> @vp_cttz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vsub.vx v10, v8, a1, v0.t +; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v10, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: lui a0, 5 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t @@ -2713,24 +2655,24 @@ define <16 x i16> @vp_cttz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vsub.vx v10, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vnot.v v10, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v10, v8 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 1 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: li a0, 257 ; CHECK-NEXT: vmul.vx v8, v8, a0 @@ -2746,23 +2688,23 @@ define <2 x i32> @vp_cttz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -2778,24 +2720,24 @@ define <2 x i32> @vp_cttz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; 
CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -2812,23 +2754,23 @@ define <4 x i32> @vp_cttz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vx v9, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t ; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -2844,24 +2786,24 @@ define <4 x i32> @vp_cttz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vnot.v v9, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: vand.vx v9, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsrl.vi v9, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -2878,23 +2820,23 @@ define <8 x i32> @vp_cttz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vsub.vx v10, v8, a1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v10, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v10, v10, a0, 
v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8, v0.t ; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -2910,24 +2852,24 @@ define <8 x i32> @vp_cttz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vsub.vx v10, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vnot.v v10, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v10, v8 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -2944,23 +2886,23 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vsub.vx v12, v8, a1, v0.t +; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: vnot.v v8, v8, v0.t +; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vv v8, v8, v12, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vand.vx v12, v12, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t ; CHECK-NEXT: vand.vx v12, v8, a0, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8, v0.t ; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t ; CHECK-NEXT: vadd.vv v8, v8, v12, v0.t -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -2976,24 +2918,24 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vsub.vx v12, v8, a1 -; CHECK-NEXT: vnot.v v8, v8 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: vnot.v v12, v8 +; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 +; CHECK-NEXT: vand.vv v8, v12, v8 +; CHECK-NEXT: vsrl.vi v12, v8, 1 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsrl.vi v8, v8, 2 ; 
CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vsrl.vi v12, v8, 4 ; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: addi a0, a0, 257 @@ -3010,78 +2952,78 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t ; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vand.vv v9, v9, v10, v0.t +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9, v0.t +; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v9, v0.t ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v2i64: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsub.vx v9, v8, a1, v0.t +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v9, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v9, v9, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: 
lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t +; RV64-NEXT: vand.vx v9, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v9, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> %va, i1 true, <2 x i1> %m, i32 %evl) @@ -3093,39 +3035,39 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % ; RV32: # %bb.0: ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vnot.v v9, v8 +; RV32-NEXT: vsub.vx v8, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vand.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsub.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3134,37 +3076,37 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % ; RV64-LABEL: vp_cttz_zero_undef_v2i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; 
RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v9, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3178,78 +3120,78 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsub.vx v10, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t ; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vand.vv v10, v10, v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: 
vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v4i64: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsub.vx v10, v8, a1, v0.t +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v10, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v10, v10, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vand.vx v10, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v10, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> %va, i1 true, <4 x i1> %m, i32 %evl) @@ -3261,39 +3203,39 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % ; RV32: # %bb.0: ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vnot.v v10, v8 +; RV32-NEXT: vsub.vx v8, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vand.vv v10, v10, v12 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: 
vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3302,37 +3244,37 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % ; RV64-LABEL: vp_cttz_zero_undef_v4i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v10, v10, a0 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vx v10, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3346,78 +3288,78 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsub.vx v12, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vand.vv v12, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v12, 1, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; 
RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsub.vv v12, v12, v16, v0.t +; RV32-NEXT: vand.vv v16, v12, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v12, 2, v0.t +; RV32-NEXT: vand.vv v8, v12, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v12, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsub.vx v12, v8, a1, v0.t +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v12, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v12, v12, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vand.vx v12, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v12, v8, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> %va, i1 true, <8 x i1> %m, i32 %evl) @@ -3429,39 +3371,39 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % ; RV32: # %bb.0: ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vnot.v v12, v8 +; RV32-NEXT: vsub.vx v8, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli 
zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsub.vv v8, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3470,37 +3412,37 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % ; RV64-LABEL: vp_cttz_zero_undef_v8i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vsub.vx v12, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vx v12, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3517,6 +3459,9 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x 
i64> %va, <15 x i1> %m, i32 z ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) @@ -3533,59 +3478,60 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v24, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v24, 
4, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: li a0, 56 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -3598,38 +3544,38 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ; RV64-LABEL: vp_cttz_zero_undef_v15i64: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl) @@ -3641,6 +3587,9 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -32 ; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 24(sp) @@ -3654,36 +3603,35 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v0, (a1), zero ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: 
vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vsrl.vi v8, v16, 1 +; RV32-NEXT: vand.vv v0, v8, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v16, v16, v0 +; RV32-NEXT: vand.vv v0, v16, v24 +; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: vand.vv v8, v16, v8 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3694,37 +3642,37 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ; RV64-LABEL: vp_cttz_zero_undef_v15i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3741,6 +3689,9 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi 
a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) @@ -3757,59 +3708,60 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 1, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v24, v8, v0.t -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: li a0, 56 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: 
csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -3822,38 +3774,38 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ; RV64-LABEL: vp_cttz_zero_undef_v16i64: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a1, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t ; RV64-NEXT: li a0, 56 +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl) @@ -3865,6 +3817,9 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -32 ; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 24(sp) @@ -3878,36 +3833,35 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v0, (a1), zero ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v8, v16 +; 
RV32-NEXT: vsrl.vi v8, v16, 1 +; RV32-NEXT: vand.vv v0, v8, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v16, v16, v0 +; RV32-NEXT: vand.vv v0, v16, v24 +; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: vand.vv v8, v16, v8 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3918,37 +3872,37 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ; RV64-LABEL: vp_cttz_zero_undef_v16i64_unmasked: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a4, 61681 +; RV64-NEXT: lui a5, 4112 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: addiw a0, a2, 1365 +; RV64-NEXT: addiw a1, a3, 819 +; RV64-NEXT: addiw a2, a4, -241 +; RV64-NEXT: addiw a3, a5, 257 +; RV64-NEXT: slli a4, a0, 32 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: slli a4, a2, 32 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a1 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vmul.vx v8, v8, a3 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3962,50 +3916,51 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v24, v0, 2 +; 
RV32-NEXT: vslidedown.vi v7, v0, 2 ; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: li a3, 16 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: li a1, 16 +; RV32-NEXT: addi a2, a2, 257 +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a3, .LBB70_2 +; RV32-NEXT: bltu a0, a1, .LBB70_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB70_2: ; RV32-NEXT: li a1, 1 +; RV32-NEXT: addi a3, sp, 40 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 24 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 48 +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb @@ -4017,72 +3972,40 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a3, sp, 32 ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v24, v16, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill 
-; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 24 @@ -4099,16 +4022,13 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a3, a3, a5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: addi a3, sp, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill @@ -4118,23 +4038,15 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v16, v8, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t @@ -4147,7 +4059,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: vmv1r.v v0, v24 +; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 @@ -4157,84 +4069,49 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsub.vx v8, v16, a1, v0.t ; RV32-NEXT: vnot.v v16, v16, v0.t ; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a0, sp, 
48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t +; RV32-NEXT: vsub.vv v24, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v24, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -4246,7 +4123,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 @@ 
-4270,73 +4147,73 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: li a1, 16 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a4, a0 ; RV64-NEXT: bltu a0, a1, .LBB70_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a2, 16 +; RV64-NEXT: li a4, 16 ; RV64-NEXT: .LBB70_2: ; RV64-NEXT: li a1, 1 -; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: lui a3, 209715 +; RV64-NEXT: lui a5, 61681 +; RV64-NEXT: lui a6, 4112 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: addiw a3, a3, 819 +; RV64-NEXT: addiw a7, a5, -241 +; RV64-NEXT: addiw t0, a6, 257 +; RV64-NEXT: slli a6, a2, 32 +; RV64-NEXT: add a6, a2, a6 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a5, a3, a5 +; RV64-NEXT: slli a2, a7, 32 +; RV64-NEXT: add a2, a7, a2 +; RV64-NEXT: slli a3, t0, 32 +; RV64-NEXT: add a3, t0, a3 +; RV64-NEXT: addi a7, a0, -16 +; RV64-NEXT: sltu a0, a0, a7 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a7, a0, a7 +; RV64-NEXT: li a0, 56 +; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v16, v16, a6, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, v8, a5, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: vand.vx v8, v8, a5, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t -; RV64-NEXT: addi a7, sp, 16 -; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 +; RV64-NEXT: vand.vx v8, v8, a2, v0.t +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV64-NEXT: vmv1r.v v0, v24 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: slli a7, a7, 3 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vl8r.v v8, (a7) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 3 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, a7, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vand.vx v16, v16, a6, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: vand.vx v16, 
v8, a5, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: vand.vx v8, v8, a5, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t +; RV64-NEXT: vand.vx v8, v8, a2, v0.t +; RV64-NEXT: vmul.vx v8, v8, a3, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: csrr a0, vlenb @@ -4356,105 +4233,102 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb -; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb ; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) ; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: addi a1, a2, 257 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB71_2 +; RV32-NEXT: bltu a0, a3, .LBB71_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB71_2: ; RV32-NEXT: li a2, 1 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v24, v8, a2 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v0, v8, v24 +; RV32-NEXT: vnot.v v0, v8 ; RV32-NEXT: addi a3, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v0, 1 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vsub.vv v24, v0, v24 -; RV32-NEXT: vand.vv v0, v24, v8 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v24, v24, v8 -; RV32-NEXT: vadd.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v24, 4 -; RV32-NEXT: vadd.vv v24, v24, v0 -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v0, (a3) # Unknown-size Folded Reload +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v8, v8, a2 +; RV32-NEXT: vand.vv v8, v0, v8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v0, v16, a2 +; 
RV32-NEXT: vnot.v v16, v16 +; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vsub.vv v0, v8, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v24, v0, a2 -; RV32-NEXT: vnot.v v0, v0 -; RV32-NEXT: vand.vv v24, v0, v24 -; RV32-NEXT: vsrl.vi v0, v24, 1 -; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v16, 1 +; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: vsub.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v16, v8 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v16, v16, v24 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v0, v8 +; RV32-NEXT: vsrl.vi v0, v0, 2 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: vadd.vv v24, v24, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v0, v16, v8 ; RV32-NEXT: vsrl.vi v16, v16, 2 ; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v24, 4 +; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: addi a4, sp, 48 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vadd.vv v8, v0, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a2), zero +; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v0, v24 +; RV32-NEXT: vmul.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v24, v8, v24 +; RV32-NEXT: vmul.vv v24, v8, v0 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v16, v24, a2 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 ; RV32-NEXT: addi sp, sp, 48 @@ -4471,58 +4345,68 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV64-NEXT: .LBB71_2: ; RV64-NEXT: li a2, 1 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v8, a2 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v24 -; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a3, a1, 32 -; RV64-NEXT: add a1, a1, a3 -; RV64-NEXT: vand.vx v24, v24, a1 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v8, a3 -; 
RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4 -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5 -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6 +; RV64-NEXT: vnot.v v24, v8 +; RV64-NEXT: lui a3, 349525 +; RV64-NEXT: lui a4, 209715 +; RV64-NEXT: lui a5, 61681 +; RV64-NEXT: lui a6, 4112 +; RV64-NEXT: addiw a3, a3, 1365 +; RV64-NEXT: addiw a4, a4, 819 +; RV64-NEXT: addiw a5, a5, -241 +; RV64-NEXT: addiw a6, a6, 257 +; RV64-NEXT: slli a7, a3, 32 +; RV64-NEXT: add a3, a3, a7 +; RV64-NEXT: slli a7, a4, 32 +; RV64-NEXT: add a4, a4, a7 +; RV64-NEXT: slli a7, a5, 32 +; RV64-NEXT: add a5, a5, a7 +; RV64-NEXT: slli a7, a6, 32 +; RV64-NEXT: add a6, a6, a7 ; RV64-NEXT: addi a7, a0, -16 ; RV64-NEXT: sltu a0, a0, a7 ; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: and a0, a0, a7 +; RV64-NEXT: li a7, 56 +; RV64-NEXT: vsub.vx v8, v8, a2 +; RV64-NEXT: vand.vv v8, v24, v8 +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vx v24, v24, a3 +; RV64-NEXT: vsub.vv v8, v8, v24 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsub.vx v24, v16, a2 ; RV64-NEXT: vnot.v v16, v16 ; RV64-NEXT: vand.vv v16, v16, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v8, a4 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a4 +; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vand.vx v24, v24, a3 ; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx v24, v16, a3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vand.vx v24, v16, a4 ; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vand.vx v16, v16, a4 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vand.vx v8, v8, a5 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a6 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsrl.vx v8, v8, a7 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a4 -; RV64-NEXT: vmul.vx v16, v16, a5 -; RV64-NEXT: vsrl.vx v16, v16, a6 +; RV64-NEXT: vand.vx v16, v16, a5 +; RV64-NEXT: vmul.vx v16, v16, a6 +; RV64-NEXT: vsrl.vx v16, v16, a7 ; RV64-NEXT: ret %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl) ret <32 x i64> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll index 4b1691aada5be..57e0eeb92ee2f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -15,13 +15,13 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle8.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v9, v8, a1 +; RVI-NEXT: li a1, 85 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, 
v8, 1 -; RVI-NEXT: li a1, 85 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: li a1, 51 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 @@ -36,6 +36,7 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RVF-NEXT: vle8.v v8, (a0) +; RVF-NEXT: li a1, 127 ; RVF-NEXT: vrsub.vi v9, v8, 0 ; RVF-NEXT: vand.vv v9, v8, v9 ; RVF-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -44,7 +45,6 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vnsrl.wi v10, v12, 23 ; RVF-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVF-NEXT: vnsrl.wi v9, v10, 0 -; RVF-NEXT: li a1, 127 ; RVF-NEXT: vmseq.vi v0, v8, 0 ; RVF-NEXT: vsub.vx v8, v9, a1 ; RVF-NEXT: vmerge.vim v8, v8, 8, v0 @@ -55,6 +55,7 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RVD-NEXT: vle8.v v8, (a0) +; RVD-NEXT: li a1, 127 ; RVD-NEXT: vrsub.vi v9, v8, 0 ; RVD-NEXT: vand.vv v9, v8, v9 ; RVD-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -63,7 +64,6 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vnsrl.wi v10, v12, 23 ; RVD-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVD-NEXT: vnsrl.wi v9, v10, 0 -; RVD-NEXT: li a1, 127 ; RVD-NEXT: vmseq.vi v0, v8, 0 ; RVD-NEXT: vsub.vx v8, v9, a1 ; RVD-NEXT: vmerge.vim v8, v8, 8, v0 @@ -92,23 +92,23 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle16.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v9, v8, a1 +; RVI-NEXT: lui a1, 5 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, v8, 1 -; RVI-NEXT: lui a1, 5 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: lui a1, 3 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 1 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v9, v8 ; RVI-NEXT: vsrl.vi v9, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v9 -; RVI-NEXT: lui a1, 1 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: li a1, 257 ; RVI-NEXT: vmul.vx v8, v8, a1 @@ -120,15 +120,15 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVF-NEXT: vle16.v v8, (a0) -; RVF-NEXT: vrsub.vi v9, v8, 0 -; RVF-NEXT: vand.vv v9, v8, v9 -; RVF-NEXT: vfwcvt.f.xu.v v10, v9 -; RVF-NEXT: vnsrl.wi v9, v10, 23 ; RVF-NEXT: li a1, 127 -; RVF-NEXT: vsub.vx v9, v9, a1 +; RVF-NEXT: vrsub.vi v9, v8, 0 ; RVF-NEXT: vmseq.vi v0, v8, 0 +; RVF-NEXT: vand.vv v8, v8, v9 +; RVF-NEXT: vfwcvt.f.xu.v v10, v8 +; RVF-NEXT: vnsrl.wi v8, v10, 23 +; RVF-NEXT: vsub.vx v8, v8, a1 ; RVF-NEXT: li a1, 16 -; RVF-NEXT: vmerge.vxm v8, v9, a1, v0 +; RVF-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVF-NEXT: vse16.v v8, (a0) ; RVF-NEXT: ret ; @@ -136,15 +136,15 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVD-NEXT: vle16.v v8, (a0) -; RVD-NEXT: vrsub.vi v9, v8, 0 -; RVD-NEXT: vand.vv v9, v8, v9 -; RVD-NEXT: vfwcvt.f.xu.v v10, v9 -; RVD-NEXT: vnsrl.wi v9, v10, 23 ; RVD-NEXT: li a1, 127 -; RVD-NEXT: vsub.vx v9, v9, a1 +; RVD-NEXT: vrsub.vi v9, v8, 0 ; RVD-NEXT: vmseq.vi v0, v8, 0 +; RVD-NEXT: vand.vv v8, v8, v9 +; RVD-NEXT: vfwcvt.f.xu.v v10, v8 +; RVD-NEXT: vnsrl.wi v8, v10, 23 +; RVD-NEXT: vsub.vx v8, v8, a1 ; 
RVD-NEXT: li a1, 16 -; RVD-NEXT: vmerge.vxm v8, v9, a1, v0 +; RVD-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVD-NEXT: vse16.v v8, (a0) ; RVD-NEXT: ret ; @@ -170,23 +170,23 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle32.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v9, v8, a1 +; RVI-NEXT: lui a1, 349525 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, v8, 1 -; RVI-NEXT: lui a1, 349525 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: lui a1, 209715 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 61681 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v9, v8 ; RVI-NEXT: vsrl.vi v9, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v9 -; RVI-NEXT: lui a1, 61681 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: lui a1, 4112 ; RVI-NEXT: addi a1, a1, 257 @@ -199,17 +199,17 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVF-NEXT: vle32.v v8, (a0) +; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vrsub.vi v9, v8, 0 ; RVF-NEXT: vand.vv v9, v8, v9 -; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vfcvt.f.xu.v v9, v9 ; RVF-NEXT: fsrm a1 -; RVF-NEXT: vsrl.vi v9, v9, 23 ; RVF-NEXT: li a1, 127 -; RVF-NEXT: vsub.vx v9, v9, a1 ; RVF-NEXT: vmseq.vi v0, v8, 0 +; RVF-NEXT: vsrl.vi v8, v9, 23 +; RVF-NEXT: vsub.vx v8, v8, a1 ; RVF-NEXT: li a1, 32 -; RVF-NEXT: vmerge.vxm v8, v9, a1, v0 +; RVF-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVF-NEXT: vse32.v v8, (a0) ; RVF-NEXT: ret ; @@ -217,16 +217,16 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVD-NEXT: vle32.v v8, (a0) +; RVD-NEXT: li a1, 52 ; RVD-NEXT: vrsub.vi v9, v8, 0 ; RVD-NEXT: vand.vv v9, v8, v9 ; RVD-NEXT: vfwcvt.f.xu.v v10, v9 -; RVD-NEXT: li a1, 52 ; RVD-NEXT: vnsrl.wx v9, v10, a1 ; RVD-NEXT: li a1, 1023 -; RVD-NEXT: vsub.vx v9, v9, a1 ; RVD-NEXT: vmseq.vi v0, v8, 0 +; RVD-NEXT: vsub.vx v8, v9, a1 ; RVD-NEXT: li a1, 32 -; RVD-NEXT: vmerge.vxm v8, v9, a1, v0 +; RVD-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVD-NEXT: vse32.v v8, (a0) ; RVD-NEXT: ret ; @@ -250,40 +250,40 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: li a1, 1 -; RV32I-NEXT: vsub.vx v9, v8, a1 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 1 ; RV32I-NEXT: lui a1, 349525 ; RV32I-NEXT: addi a1, a1, 1365 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 +; RV32I-NEXT: vmv.v.x v9, a1 +; RV32I-NEXT: li a1, 1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vsub.vx v10, v8, a1 ; RV32I-NEXT: lui a1, 209715 ; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vand.vv v9, v10, v9 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: lui a1, 61681 
; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: lui a1, 4112 ; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -294,37 +294,37 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) -; RV64I-NEXT: li a1, 1 -; RV64I-NEXT: vsub.vx v9, v8, a1 +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: addiw a4, a4, 257 +; RV64I-NEXT: slli a5, a1, 32 +; RV64I-NEXT: add a1, a1, a5 +; RV64I-NEXT: slli a5, a2, 32 +; RV64I-NEXT: add a2, a2, a5 +; RV64I-NEXT: slli a5, a3, 32 +; RV64I-NEXT: add a3, a3, a5 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: li a5, 1 +; RV64I-NEXT: vsub.vx v9, v8, a5 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v9, v9, a1 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v8, a1 +; RV64I-NEXT: vand.vx v9, v8, a2 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a2 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a3 +; RV64I-NEXT: vmul.vx v8, v8, a4 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -334,19 +334,21 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vrsub.vi v9, v8, 0 ; RVF-NEXT: vand.vv v9, v8, v9 -; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RVF-NEXT: vfncvt.f.xu.w v10, v9 ; RVF-NEXT: fsrm a1 -; RVF-NEXT: vsrl.vi v9, v10, 23 ; RVF-NEXT: li a1, 127 -; RVF-NEXT: vwsubu.vx v10, v9, a1 ; RVF-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; RVF-NEXT: vmseq.vi v0, v8, 0 +; RVF-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; RVF-NEXT: vsrl.vi v8, v10, 23 +; RVF-NEXT: vwsubu.vx v9, v8, a1 ; RVF-NEXT: li a1, 64 -; RVF-NEXT: vmerge.vxm v8, v10, a1, v0 +; RVF-NEXT: vsetvli zero, zero, 
e64, m1, ta, ma +; RVF-NEXT: vmerge.vxm v8, v9, a1, v0 ; RVF-NEXT: vse64.v v8, (a0) ; RVF-NEXT: ret ; @@ -354,18 +356,18 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVD-NEXT: vle64.v v8, (a0) +; RVD-NEXT: fsrmi a1, 1 ; RVD-NEXT: vrsub.vi v9, v8, 0 ; RVD-NEXT: vand.vv v9, v8, v9 -; RVD-NEXT: fsrmi a1, 1 ; RVD-NEXT: vfcvt.f.xu.v v9, v9 ; RVD-NEXT: fsrm a1 ; RVD-NEXT: li a1, 52 ; RVD-NEXT: vsrl.vx v9, v9, a1 ; RVD-NEXT: li a1, 1023 -; RVD-NEXT: vsub.vx v9, v9, a1 ; RVD-NEXT: vmseq.vi v0, v8, 0 +; RVD-NEXT: vsub.vx v8, v9, a1 ; RVD-NEXT: li a1, 64 -; RVD-NEXT: vmerge.vxm v8, v9, a1, v0 +; RVD-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVD-NEXT: vse64.v v8, (a0) ; RVD-NEXT: ret ; @@ -392,13 +394,13 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle8.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v10, v8, a1 +; RVI-NEXT: li a1, 85 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: li a1, 85 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: li a1, 51 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 @@ -414,6 +416,7 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: li a1, 32 ; RVF-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; RVF-NEXT: vle8.v v8, (a0) +; RVF-NEXT: li a1, 127 ; RVF-NEXT: vrsub.vi v10, v8, 0 ; RVF-NEXT: vand.vv v10, v8, v10 ; RVF-NEXT: vsetvli zero, zero, e16, m4, ta, ma @@ -422,7 +425,6 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vnsrl.wi v12, v16, 23 ; RVF-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVF-NEXT: vnsrl.wi v10, v12, 0 -; RVF-NEXT: li a1, 127 ; RVF-NEXT: vmseq.vi v0, v8, 0 ; RVF-NEXT: vsub.vx v8, v10, a1 ; RVF-NEXT: vmerge.vim v8, v8, 8, v0 @@ -434,6 +436,7 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: li a1, 32 ; RVD-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; RVD-NEXT: vle8.v v8, (a0) +; RVD-NEXT: li a1, 127 ; RVD-NEXT: vrsub.vi v10, v8, 0 ; RVD-NEXT: vand.vv v10, v8, v10 ; RVD-NEXT: vsetvli zero, zero, e16, m4, ta, ma @@ -442,7 +445,6 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vnsrl.wi v12, v16, 23 ; RVD-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVD-NEXT: vnsrl.wi v10, v12, 0 -; RVD-NEXT: li a1, 127 ; RVD-NEXT: vmseq.vi v0, v8, 0 ; RVD-NEXT: vsub.vx v8, v10, a1 ; RVD-NEXT: vmerge.vim v8, v8, 8, v0 @@ -472,23 +474,23 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle16.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v10, v8, a1 +; RVI-NEXT: lui a1, 5 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: lui a1, 5 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: lui a1, 3 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 1 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v10, v8 ; RVI-NEXT: vsrl.vi v10, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v10 -; RVI-NEXT: lui a1, 1 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: li a1, 257 ; RVI-NEXT: vmul.vx v8, v8, a1 @@ -500,15 +502,15 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVF-NEXT: vle16.v v8, (a0) -; RVF-NEXT: vrsub.vi v10, 
v8, 0 -; RVF-NEXT: vand.vv v10, v8, v10 -; RVF-NEXT: vfwcvt.f.xu.v v12, v10 -; RVF-NEXT: vnsrl.wi v10, v12, 23 ; RVF-NEXT: li a1, 127 -; RVF-NEXT: vsub.vx v10, v10, a1 +; RVF-NEXT: vrsub.vi v10, v8, 0 ; RVF-NEXT: vmseq.vi v0, v8, 0 +; RVF-NEXT: vand.vv v8, v8, v10 +; RVF-NEXT: vfwcvt.f.xu.v v12, v8 +; RVF-NEXT: vnsrl.wi v8, v12, 23 +; RVF-NEXT: vsub.vx v8, v8, a1 ; RVF-NEXT: li a1, 16 -; RVF-NEXT: vmerge.vxm v8, v10, a1, v0 +; RVF-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVF-NEXT: vse16.v v8, (a0) ; RVF-NEXT: ret ; @@ -516,15 +518,15 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVD-NEXT: vle16.v v8, (a0) -; RVD-NEXT: vrsub.vi v10, v8, 0 -; RVD-NEXT: vand.vv v10, v8, v10 -; RVD-NEXT: vfwcvt.f.xu.v v12, v10 -; RVD-NEXT: vnsrl.wi v10, v12, 23 ; RVD-NEXT: li a1, 127 -; RVD-NEXT: vsub.vx v10, v10, a1 +; RVD-NEXT: vrsub.vi v10, v8, 0 ; RVD-NEXT: vmseq.vi v0, v8, 0 +; RVD-NEXT: vand.vv v8, v8, v10 +; RVD-NEXT: vfwcvt.f.xu.v v12, v8 +; RVD-NEXT: vnsrl.wi v8, v12, 23 +; RVD-NEXT: vsub.vx v8, v8, a1 ; RVD-NEXT: li a1, 16 -; RVD-NEXT: vmerge.vxm v8, v10, a1, v0 +; RVD-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVD-NEXT: vse16.v v8, (a0) ; RVD-NEXT: ret ; @@ -550,23 +552,23 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle32.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v10, v8, a1 +; RVI-NEXT: lui a1, 349525 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: lui a1, 349525 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: lui a1, 209715 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 61681 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v10, v8 ; RVI-NEXT: vsrl.vi v10, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v10 -; RVI-NEXT: lui a1, 61681 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: lui a1, 4112 ; RVI-NEXT: addi a1, a1, 257 @@ -579,17 +581,17 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVF-NEXT: vle32.v v8, (a0) +; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vrsub.vi v10, v8, 0 ; RVF-NEXT: vand.vv v10, v8, v10 -; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vfcvt.f.xu.v v10, v10 ; RVF-NEXT: fsrm a1 -; RVF-NEXT: vsrl.vi v10, v10, 23 ; RVF-NEXT: li a1, 127 -; RVF-NEXT: vsub.vx v10, v10, a1 ; RVF-NEXT: vmseq.vi v0, v8, 0 +; RVF-NEXT: vsrl.vi v8, v10, 23 +; RVF-NEXT: vsub.vx v8, v8, a1 ; RVF-NEXT: li a1, 32 -; RVF-NEXT: vmerge.vxm v8, v10, a1, v0 +; RVF-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVF-NEXT: vse32.v v8, (a0) ; RVF-NEXT: ret ; @@ -597,16 +599,16 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVD-NEXT: vle32.v v8, (a0) +; RVD-NEXT: li a1, 52 ; RVD-NEXT: vrsub.vi v10, v8, 0 ; RVD-NEXT: vand.vv v10, v8, v10 ; RVD-NEXT: vfwcvt.f.xu.v v12, v10 -; RVD-NEXT: li a1, 52 ; RVD-NEXT: vnsrl.wx v10, v12, a1 ; RVD-NEXT: li a1, 1023 -; RVD-NEXT: vsub.vx v10, v10, a1 ; RVD-NEXT: vmseq.vi v0, v8, 0 +; RVD-NEXT: vsub.vx v8, v10, a1 ; RVD-NEXT: li a1, 32 -; RVD-NEXT: vmerge.vxm v8, v10, a1, v0 +; RVD-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVD-NEXT: vse32.v v8, (a0) ; RVD-NEXT: ret ; @@ -630,40 +632,40 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; 
RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: li a1, 1 -; RV32I-NEXT: vsub.vx v10, v8, a1 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 1 ; RV32I-NEXT: lui a1, 349525 ; RV32I-NEXT: addi a1, a1, 1365 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v12, a1 +; RV32I-NEXT: vmv.v.x v10, a1 +; RV32I-NEXT: li a1, 1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vsub.vx v12, v8, a1 ; RV32I-NEXT: lui a1, 209715 ; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vand.vv v10, v12, v10 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vmv.v.x v12, a1 ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32I-NEXT: vmv.v.x v12, a1 ; RV32I-NEXT: lui a1, 4112 ; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -674,37 +676,37 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) -; RV64I-NEXT: li a1, 1 -; RV64I-NEXT: vsub.vx v10, v8, a1 +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: addiw a4, a4, 257 +; RV64I-NEXT: slli a5, a1, 32 +; RV64I-NEXT: add a1, a1, a5 +; RV64I-NEXT: slli a5, a2, 32 +; RV64I-NEXT: add a2, a2, a5 +; RV64I-NEXT: slli a5, a3, 32 +; RV64I-NEXT: add a3, a3, a5 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: li a5, 1 +; RV64I-NEXT: vsub.vx v10, v8, a5 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v10, v10, a1 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v8, a1 +; RV64I-NEXT: vand.vx v10, v8, a2 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a2 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli 
a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a3 +; RV64I-NEXT: vmul.vx v8, v8, a4 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -714,19 +716,21 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vrsub.vi v10, v8, 0 ; RVF-NEXT: vand.vv v10, v8, v10 -; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RVF-NEXT: vfncvt.f.xu.w v12, v10 ; RVF-NEXT: fsrm a1 -; RVF-NEXT: vsrl.vi v10, v12, 23 ; RVF-NEXT: li a1, 127 -; RVF-NEXT: vwsubu.vx v12, v10, a1 ; RVF-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; RVF-NEXT: vmseq.vi v0, v8, 0 +; RVF-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RVF-NEXT: vsrl.vi v8, v12, 23 +; RVF-NEXT: vwsubu.vx v10, v8, a1 ; RVF-NEXT: li a1, 64 -; RVF-NEXT: vmerge.vxm v8, v12, a1, v0 +; RVF-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RVF-NEXT: vmerge.vxm v8, v10, a1, v0 ; RVF-NEXT: vse64.v v8, (a0) ; RVF-NEXT: ret ; @@ -734,18 +738,18 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RVD-NEXT: vle64.v v8, (a0) +; RVD-NEXT: fsrmi a1, 1 ; RVD-NEXT: vrsub.vi v10, v8, 0 ; RVD-NEXT: vand.vv v10, v8, v10 -; RVD-NEXT: fsrmi a1, 1 ; RVD-NEXT: vfcvt.f.xu.v v10, v10 ; RVD-NEXT: fsrm a1 ; RVD-NEXT: li a1, 52 ; RVD-NEXT: vsrl.vx v10, v10, a1 ; RVD-NEXT: li a1, 1023 -; RVD-NEXT: vsub.vx v10, v10, a1 ; RVD-NEXT: vmseq.vi v0, v8, 0 +; RVD-NEXT: vsub.vx v8, v10, a1 ; RVD-NEXT: li a1, 64 -; RVD-NEXT: vmerge.vxm v8, v10, a1, v0 +; RVD-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVD-NEXT: vse64.v v8, (a0) ; RVD-NEXT: ret ; @@ -771,13 +775,13 @@ define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle8.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v9, v8, a1 +; RVI-NEXT: li a1, 85 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, v8, 1 -; RVI-NEXT: li a1, 85 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: li a1, 51 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 @@ -843,23 +847,23 @@ define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle16.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v9, v8, a1 +; RVI-NEXT: lui a1, 5 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, v8, 1 -; RVI-NEXT: lui a1, 5 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: lui a1, 3 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 1 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v9, v8 ; RVI-NEXT: vsrl.vi v9, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v9 -; RVI-NEXT: lui a1, 1 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: li a1, 257 ; RVI-NEXT: vmul.vx v8, v8, a1 @@ -914,23 +918,23 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle32.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v9, v8, a1 +; RVI-NEXT: lui a1, 349525 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vnot.v v8, v8 ; 
RVI-NEXT: vand.vv v8, v8, v9 ; RVI-NEXT: vsrl.vi v9, v8, 1 -; RVI-NEXT: lui a1, 349525 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v9, v9, a1 -; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: lui a1, 209715 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v9 ; RVI-NEXT: vand.vx v9, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 61681 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v9, v8 ; RVI-NEXT: vsrl.vi v9, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v9 -; RVI-NEXT: lui a1, 61681 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: lui a1, 4112 ; RVI-NEXT: addi a1, a1, 257 @@ -943,9 +947,9 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVF-NEXT: vle32.v v8, (a0) +; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vrsub.vi v9, v8, 0 ; RVF-NEXT: vand.vv v8, v8, v9 -; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vfcvt.f.xu.v v8, v8 ; RVF-NEXT: fsrm a1 ; RVF-NEXT: vsrl.vi v8, v8, 23 @@ -958,10 +962,10 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVD-NEXT: vle32.v v8, (a0) +; RVD-NEXT: li a1, 52 ; RVD-NEXT: vrsub.vi v9, v8, 0 ; RVD-NEXT: vand.vv v8, v8, v9 ; RVD-NEXT: vfwcvt.f.xu.v v10, v8 -; RVD-NEXT: li a1, 52 ; RVD-NEXT: vnsrl.wx v8, v10, a1 ; RVD-NEXT: li a1, 1023 ; RVD-NEXT: vsub.vx v8, v8, a1 @@ -987,40 +991,40 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: li a1, 1 -; RV32I-NEXT: vsub.vx v9, v8, a1 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 1 ; RV32I-NEXT: lui a1, 349525 ; RV32I-NEXT: addi a1, a1, 1365 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 +; RV32I-NEXT: vmv.v.x v9, a1 +; RV32I-NEXT: li a1, 1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vsub.vx v10, v8, a1 ; RV32I-NEXT: lui a1, 209715 ; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vand.vv v9, v10, v9 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: lui a1, 4112 ; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, 
v8, a1 @@ -1031,37 +1035,37 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) -; RV64I-NEXT: li a1, 1 -; RV64I-NEXT: vsub.vx v9, v8, a1 +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: addiw a4, a4, 257 +; RV64I-NEXT: slli a5, a1, 32 +; RV64I-NEXT: add a1, a1, a5 +; RV64I-NEXT: slli a5, a2, 32 +; RV64I-NEXT: add a2, a2, a5 +; RV64I-NEXT: slli a5, a3, 32 +; RV64I-NEXT: add a3, a3, a5 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: li a5, 1 +; RV64I-NEXT: vsub.vx v9, v8, a5 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v9 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v9, v9, a1 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v8, a1 +; RV64I-NEXT: vand.vx v9, v8, a2 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a2 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a3 +; RV64I-NEXT: vmul.vx v8, v8, a4 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -1071,9 +1075,9 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vrsub.vi v9, v8, 0 ; RVF-NEXT: vand.vv v8, v8, v9 -; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RVF-NEXT: vfncvt.f.xu.w v9, v8 ; RVF-NEXT: fsrm a1 @@ -1087,9 +1091,9 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVD-NEXT: vle64.v v8, (a0) +; RVD-NEXT: fsrmi a1, 1 ; RVD-NEXT: vrsub.vi v9, v8, 0 ; RVD-NEXT: vand.vv v8, v8, v9 -; RVD-NEXT: fsrmi a1, 1 ; RVD-NEXT: vfcvt.f.xu.v v8, v8 ; RVD-NEXT: fsrm a1 ; RVD-NEXT: li a1, 52 @@ -1121,13 +1125,13 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle8.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v10, v8, a1 +; RVI-NEXT: li a1, 85 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: li a1, 85 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: li a1, 51 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 @@ -1196,23 +1200,23 @@ define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle16.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v10, v8, a1 +; RVI-NEXT: lui a1, 5 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: lui a1, 5 -; 
RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: lui a1, 3 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 1 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v10, v8 ; RVI-NEXT: vsrl.vi v10, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v10 -; RVI-NEXT: lui a1, 1 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: li a1, 257 ; RVI-NEXT: vmul.vx v8, v8, a1 @@ -1267,23 +1271,23 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVI-NEXT: vle32.v v8, (a0) ; RVI-NEXT: li a1, 1 ; RVI-NEXT: vsub.vx v10, v8, a1 +; RVI-NEXT: lui a1, 349525 +; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v10 ; RVI-NEXT: vsrl.vi v10, v8, 1 -; RVI-NEXT: lui a1, 349525 -; RVI-NEXT: addi a1, a1, 1365 ; RVI-NEXT: vand.vx v10, v10, a1 -; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: lui a1, 209715 ; RVI-NEXT: addi a1, a1, 819 +; RVI-NEXT: vsub.vv v8, v8, v10 ; RVI-NEXT: vand.vx v10, v8, a1 ; RVI-NEXT: vsrl.vi v8, v8, 2 ; RVI-NEXT: vand.vx v8, v8, a1 +; RVI-NEXT: lui a1, 61681 +; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vadd.vv v8, v10, v8 ; RVI-NEXT: vsrl.vi v10, v8, 4 ; RVI-NEXT: vadd.vv v8, v8, v10 -; RVI-NEXT: lui a1, 61681 -; RVI-NEXT: addi a1, a1, -241 ; RVI-NEXT: vand.vx v8, v8, a1 ; RVI-NEXT: lui a1, 4112 ; RVI-NEXT: addi a1, a1, 257 @@ -1296,9 +1300,9 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVF-NEXT: vle32.v v8, (a0) +; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vrsub.vi v10, v8, 0 ; RVF-NEXT: vand.vv v8, v8, v10 -; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vfcvt.f.xu.v v8, v8 ; RVF-NEXT: fsrm a1 ; RVF-NEXT: vsrl.vi v8, v8, 23 @@ -1311,10 +1315,10 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RVD-NEXT: vle32.v v8, (a0) +; RVD-NEXT: li a1, 52 ; RVD-NEXT: vrsub.vi v10, v8, 0 ; RVD-NEXT: vand.vv v8, v8, v10 ; RVD-NEXT: vfwcvt.f.xu.v v12, v8 -; RVD-NEXT: li a1, 52 ; RVD-NEXT: vnsrl.wx v8, v12, a1 ; RVD-NEXT: li a1, 1023 ; RVD-NEXT: vsub.vx v8, v8, a1 @@ -1340,40 +1344,40 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: li a1, 1 -; RV32I-NEXT: vsub.vx v10, v8, a1 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 1 ; RV32I-NEXT: lui a1, 349525 ; RV32I-NEXT: addi a1, a1, 1365 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v12, a1 +; RV32I-NEXT: vmv.v.x v10, a1 +; RV32I-NEXT: li a1, 1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vsub.vx v12, v8, a1 ; RV32I-NEXT: lui a1, 209715 ; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vand.vv v10, v12, v10 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vmv.v.x v12, a1 ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: 
addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32I-NEXT: vmv.v.x v12, a1 ; RV32I-NEXT: lui a1, 4112 ; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -1384,37 +1388,37 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) -; RV64I-NEXT: li a1, 1 -; RV64I-NEXT: vsub.vx v10, v8, a1 +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: addiw a4, a4, 257 +; RV64I-NEXT: slli a5, a1, 32 +; RV64I-NEXT: add a1, a1, a5 +; RV64I-NEXT: slli a5, a2, 32 +; RV64I-NEXT: add a2, a2, a5 +; RV64I-NEXT: slli a5, a3, 32 +; RV64I-NEXT: add a3, a3, a5 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: li a5, 1 +; RV64I-NEXT: vsub.vx v10, v8, a5 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vand.vv v8, v8, v10 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v10, v10, a1 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v8, a1 +; RV64I-NEXT: vand.vx v10, v8, a2 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a2 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vand.vx v8, v8, a3 +; RV64I-NEXT: vmul.vx v8, v8, a4 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -1424,9 +1428,9 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RVF-NEXT: vle64.v v8, (a0) +; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vrsub.vi v10, v8, 0 ; RVF-NEXT: vand.vv v8, v8, v10 -; RVF-NEXT: fsrmi a1, 1 ; RVF-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RVF-NEXT: vfncvt.f.xu.w v10, v8 ; RVF-NEXT: fsrm a1 @@ -1440,9 +1444,9 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RVD: # %bb.0: ; RVD-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RVD-NEXT: vle64.v v8, (a0) +; RVD-NEXT: fsrmi a1, 1 ; RVD-NEXT: vrsub.vi v10, v8, 0 ; RVD-NEXT: vand.vv v8, v8, v10 -; RVD-NEXT: fsrmi a1, 1 ; RVD-NEXT: vfcvt.f.xu.v v8, v8 ; RVD-NEXT: fsrm a1 ; 
RVD-NEXT: li a1, 52 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index 060a5c4224fe1..ae5dbfa4bf30b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -10,30 +10,33 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_v16i1_v32i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; CHECK-NEXT: vlm.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v8, 2 +; CHECK-NEXT: li a0, -256 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: vmerge.vim v10, v9, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v12, v9, 1, v0 -; CHECK-NEXT: vnsrl.wi v8, v12, 0 -; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: vadd.vv v11, v9, v9 -; CHECK-NEXT: li a0, -256 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vadd.vi v9, v11, -16 -; CHECK-NEXT: vrgather.vv v8, v10, v9, v0.t -; CHECK-NEXT: vmsne.vi v9, v8, 0 -; CHECK-NEXT: vnsrl.wi v8, v12, 8 +; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vadd.vi v12, v11, -16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v8, 2 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vadd.vi v11, v11, -15 -; CHECK-NEXT: vrgather.vv v8, v10, v11, v0.t -; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: vmv.v.v v0, v9 +; CHECK-NEXT: vmerge.vim v13, v10, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 +; CHECK-NEXT: vnsrl.wi v8, v14, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrgather.vv v8, v13, v12, v0.t +; CHECK-NEXT: vnsrl.wi v12, v14, 8 +; CHECK-NEXT: vmsne.vi v10, v8, 0 +; CHECK-NEXT: vrgather.vv v12, v13, v11, v0.t +; CHECK-NEXT: vmsne.vi v8, v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %vec = load <32 x i1>, ptr %p %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll index 04ebc7ca6b2b8..e13f4f4b50b0f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll @@ -32,20 +32,20 @@ define void @add_v2i64(ptr %x, ptr %y) { ; RV32-NEXT: lw a5, 4(a0) ; RV32-NEXT: lw a6, 8(a0) ; RV32-NEXT: lw a7, 12(a0) -; RV32-NEXT: lw t0, 8(a1) -; RV32-NEXT: lw a1, 12(a1) +; RV32-NEXT: lw t0, 12(a1) +; RV32-NEXT: lw a1, 8(a1) ; RV32-NEXT: add a3, a5, a3 ; RV32-NEXT: add a2, a4, a2 +; RV32-NEXT: add a7, a7, t0 +; RV32-NEXT: add a1, a6, a1 ; RV32-NEXT: sltu a4, a2, a4 +; RV32-NEXT: sltu a5, a1, a6 ; RV32-NEXT: add a3, a3, a4 -; RV32-NEXT: add a1, a7, a1 -; RV32-NEXT: add t0, a6, t0 -; RV32-NEXT: sltu a4, t0, a6 -; RV32-NEXT: add a1, a1, a4 +; RV32-NEXT: add a5, a7, a5 ; RV32-NEXT: sw a2, 0(a0) ; RV32-NEXT: sw a3, 4(a0) -; RV32-NEXT: sw t0, 8(a0) -; RV32-NEXT: sw a1, 12(a0) +; RV32-NEXT: sw a1, 8(a0) +; RV32-NEXT: sw a5, 12(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: add_v2i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll index 493481ad129d2..e53876d69b59b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll @@ -325,20 +325,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV32-NEXT: addi s0, sp, 384 ; RV32-NEXT: andi sp, sp, -128 ; RV32-NEXT: andi a1, a1, 255 -; RV32-NEXT: li a2, 128 -; RV32-NEXT: addi a3, a0, 128 -; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; RV32-NEXT: vle8.v v16, (a3) -; RV32-NEXT: vle8.v v24, (a0) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: mv a2, sp +; RV32-NEXT: li a3, 128 +; RV32-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV32-NEXT: vle8.v v8, (a0) +; RV32-NEXT: addi a0, a0, 128 +; RV32-NEXT: vle8.v v16, (a0) +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: vmseq.vi v0, v8, 0 +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vmseq.vi v8, v16, 0 -; RV32-NEXT: vmseq.vi v0, v24, 0 -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: vmerge.vim v24, v16, 1, v0 -; RV32-NEXT: vse8.v v24, (a0) +; RV32-NEXT: vmerge.vim v16, v24, 1, v0 +; RV32-NEXT: vse8.v v16, (a2) ; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vmerge.vim v8, v16, 1, v0 +; RV32-NEXT: vmerge.vim v8, v24, 1, v0 ; RV32-NEXT: addi a0, sp, 128 ; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: lbu a0, 0(a1) @@ -356,20 +356,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV64-NEXT: addi s0, sp, 384 ; RV64-NEXT: andi sp, sp, -128 ; RV64-NEXT: andi a1, a1, 255 -; RV64-NEXT: li a2, 128 -; RV64-NEXT: addi a3, a0, 128 -; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; RV64-NEXT: vle8.v v16, (a3) -; RV64-NEXT: vle8.v v24, (a0) -; RV64-NEXT: mv a0, sp -; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: mv a2, sp +; RV64-NEXT: li a3, 128 +; RV64-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV64-NEXT: vle8.v v8, (a0) +; RV64-NEXT: addi a0, a0, 128 +; RV64-NEXT: vle8.v v16, (a0) +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: vmseq.vi v0, v8, 0 +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vmseq.vi v8, v16, 0 -; RV64-NEXT: vmseq.vi v0, v24, 0 -; RV64-NEXT: vmv.v.i v16, 0 -; RV64-NEXT: vmerge.vim v24, v16, 1, v0 -; RV64-NEXT: vse8.v v24, (a0) +; RV64-NEXT: vmerge.vim v16, v24, 1, v0 +; RV64-NEXT: vse8.v v16, (a2) ; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vmerge.vim v8, v16, 1, v0 +; RV64-NEXT: vmerge.vim v8, v24, 1, v0 ; RV64-NEXT: addi a0, sp, 128 ; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: lbu a0, 0(a1) @@ -387,20 +387,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV32ZBS-NEXT: addi s0, sp, 384 ; RV32ZBS-NEXT: andi sp, sp, -128 ; RV32ZBS-NEXT: andi a1, a1, 255 -; RV32ZBS-NEXT: li a2, 128 -; RV32ZBS-NEXT: addi a3, a0, 128 -; RV32ZBS-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; RV32ZBS-NEXT: vle8.v v16, (a3) -; RV32ZBS-NEXT: vle8.v v24, (a0) -; RV32ZBS-NEXT: mv a0, sp -; RV32ZBS-NEXT: add a1, a0, a1 +; RV32ZBS-NEXT: mv a2, sp +; RV32ZBS-NEXT: li a3, 128 +; RV32ZBS-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV32ZBS-NEXT: vle8.v v8, (a0) +; RV32ZBS-NEXT: addi a0, a0, 128 +; RV32ZBS-NEXT: vle8.v v16, (a0) +; RV32ZBS-NEXT: add a1, a2, a1 +; RV32ZBS-NEXT: vmseq.vi v0, v8, 0 +; RV32ZBS-NEXT: vmv.v.i v24, 0 ; RV32ZBS-NEXT: vmseq.vi v8, v16, 0 -; RV32ZBS-NEXT: vmseq.vi v0, v24, 0 -; RV32ZBS-NEXT: vmv.v.i v16, 0 -; RV32ZBS-NEXT: vmerge.vim v24, v16, 1, v0 -; RV32ZBS-NEXT: vse8.v v24, (a0) +; RV32ZBS-NEXT: vmerge.vim v16, v24, 1, v0 +; RV32ZBS-NEXT: vse8.v v16, (a2) ; RV32ZBS-NEXT: vmv1r.v v0, v8 -; RV32ZBS-NEXT: vmerge.vim v8, v16, 1, v0 +; RV32ZBS-NEXT: vmerge.vim v8, v24, 1, v0 ; RV32ZBS-NEXT: addi 
a0, sp, 128 ; RV32ZBS-NEXT: vse8.v v8, (a0) ; RV32ZBS-NEXT: lbu a0, 0(a1) @@ -418,20 +418,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV64ZBS-NEXT: addi s0, sp, 384 ; RV64ZBS-NEXT: andi sp, sp, -128 ; RV64ZBS-NEXT: andi a1, a1, 255 -; RV64ZBS-NEXT: li a2, 128 -; RV64ZBS-NEXT: addi a3, a0, 128 -; RV64ZBS-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; RV64ZBS-NEXT: vle8.v v16, (a3) -; RV64ZBS-NEXT: vle8.v v24, (a0) -; RV64ZBS-NEXT: mv a0, sp -; RV64ZBS-NEXT: add a1, a0, a1 +; RV64ZBS-NEXT: mv a2, sp +; RV64ZBS-NEXT: li a3, 128 +; RV64ZBS-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV64ZBS-NEXT: vle8.v v8, (a0) +; RV64ZBS-NEXT: addi a0, a0, 128 +; RV64ZBS-NEXT: vle8.v v16, (a0) +; RV64ZBS-NEXT: add a1, a2, a1 +; RV64ZBS-NEXT: vmseq.vi v0, v8, 0 +; RV64ZBS-NEXT: vmv.v.i v24, 0 ; RV64ZBS-NEXT: vmseq.vi v8, v16, 0 -; RV64ZBS-NEXT: vmseq.vi v0, v24, 0 -; RV64ZBS-NEXT: vmv.v.i v16, 0 -; RV64ZBS-NEXT: vmerge.vim v24, v16, 1, v0 -; RV64ZBS-NEXT: vse8.v v24, (a0) +; RV64ZBS-NEXT: vmerge.vim v16, v24, 1, v0 +; RV64ZBS-NEXT: vse8.v v16, (a2) ; RV64ZBS-NEXT: vmv1r.v v0, v8 -; RV64ZBS-NEXT: vmerge.vim v8, v16, 1, v0 +; RV64ZBS-NEXT: vmerge.vim v8, v24, 1, v0 ; RV64ZBS-NEXT: addi a0, sp, 128 ; RV64ZBS-NEXT: vse8.v v8, (a0) ; RV64ZBS-NEXT: lbu a0, 0(a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll index fdee80fb95627..e9dca2c42e835 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll @@ -607,9 +607,9 @@ define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) { ; VLA-NEXT: li a2, 64 ; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma ; VLA-NEXT: vlm.v v0, (a0) +; VLA-NEXT: li a0, 42 ; VLA-NEXT: vmv.v.i v8, 0 ; VLA-NEXT: vmerge.vim v8, v8, 1, v0 -; VLA-NEXT: li a0, 42 ; VLA-NEXT: vsetivli zero, 2, e8, m4, ta, ma ; VLA-NEXT: vslidedown.vx v8, v8, a0 ; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma @@ -764,8 +764,8 @@ define void @extract_v2i1_nxv64i1_42(<vscale x 64 x i1> %x, ptr %y) { ; VLA: # %bb.0: ; VLA-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; VLA-NEXT: vmv.v.i v8, 0 -; VLA-NEXT: vmerge.vim v8, v8, 1, v0 ; VLA-NEXT: li a1, 42 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 ; VLA-NEXT: vsetivli zero, 2, e8, m4, ta, ma ; VLA-NEXT: vslidedown.vx v8, v8, a1 ; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll index cb830d668d2e8..a193d4e4e689f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll @@ -177,8 +177,8 @@ define i64 @extractelt_v4i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV32-NEXT: vsrl.vx v10, v8, a0 ; RV32-NEXT: vmv.x.s a1, v10 @@ -273,8 +273,8 @@ define i64 @extractelt_v3i64(ptr %x) nounwind { ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32-NEXT: vslidedown.vi v10, v8, 4 -; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: vslidedown.vi v8, v8, 5 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -426,12 +426,12 @@ define i64 @extractelt_v2i64_idx(ptr %x, i32 zeroext %idx) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +;
RV32-NEXT: li a2, 32 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vslidedown.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -575,12 +575,12 @@ define i64 @extractelt_v4i64_idx(ptr %x, i32 zeroext %idx) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a2, 32 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vslidedown.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -686,14 +686,14 @@ define i64 @extractelt_v3i64_idx(ptr %x, i32 zeroext %idx) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 3, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: add a1, a1, a1 +; RV32-NEXT: addi a0, a1, 1 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: add a1, a1, a1 ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32-NEXT: vslidedown.vx v10, v8, a1 +; RV32-NEXT: vslidedown.vx v8, v8, a0 ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: addi a1, a1, 1 -; RV32-NEXT: vslidedown.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -827,19 +827,19 @@ define i32 @extractelt_v64i32_idx(ptr %x, i32 zeroext %idx) nounwind { ; RV32-NEXT: addi s0, sp, 384 ; RV32-NEXT: andi sp, sp, -128 ; RV32-NEXT: andi a1, a1, 63 -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: addi a3, a0, 128 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a3) +; RV32-NEXT: mv a2, sp +; RV32-NEXT: li a3, 32 +; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle32.v v16, (a0) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: add a1, a0, a1 -; RV32-NEXT: vadd.vv v8, v8, v8 +; RV32-NEXT: addi a0, sp, 128 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: vadd.vv v16, v16, v16 +; RV32-NEXT: vadd.vv v8, v8, v8 +; RV32-NEXT: vse32.v v8, (a2) ; RV32-NEXT: vse32.v v16, (a0) -; RV32-NEXT: addi a0, sp, 128 -; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: lw a0, 0(a1) ; RV32-NEXT: addi sp, s0, -384 ; RV32-NEXT: lw ra, 380(sp) # 4-byte Folded Reload @@ -855,19 +855,19 @@ define i32 @extractelt_v64i32_idx(ptr %x, i32 zeroext %idx) nounwind { ; RV64-NEXT: addi s0, sp, 384 ; RV64-NEXT: andi sp, sp, -128 ; RV64-NEXT: andi a1, a1, 63 -; RV64-NEXT: slli a1, a1, 2 -; RV64-NEXT: li a2, 32 -; RV64-NEXT: addi a3, a0, 128 -; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a3) +; RV64-NEXT: mv a2, sp +; RV64-NEXT: li a3, 32 +; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle32.v v16, (a0) -; RV64-NEXT: mv a0, sp -; RV64-NEXT: add a1, a0, a1 -; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: addi a0, sp, 128 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: vadd.vv v16, v16, v16 +; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: vse32.v v8, (a2) ; RV64-NEXT: vse32.v v16, (a0) -; RV64-NEXT: addi a0, sp, 128 -; RV64-NEXT: vse32.v v8, (a0) ; RV64-NEXT: lw a0, 0(a1) ; RV64-NEXT: addi sp, s0, -384 ; RV64-NEXT: ld ra, 376(sp) # 8-byte Folded Reload @@ -931,14 +931,14 @@ define void @store_extractelt_v2i64(ptr %x, ptr %p) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: 
vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vslidedown.vi v8, v8, 1 ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vslidedown.vi v8, v8, 1 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: vmv.x.s a2, v8 -; RV32-NEXT: sw a2, 0(a1) -; RV32-NEXT: sw a0, 4(a1) +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a2, v9 +; RV32-NEXT: sw a0, 0(a1) +; RV32-NEXT: sw a2, 4(a1) ; RV32-NEXT: ret ; ; RV64-LABEL: store_extractelt_v2i64: @@ -1062,17 +1062,17 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) { ; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI46_0) ; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32NOM-NEXT: vle32.v v9, (a0) -; RV32NOM-NEXT: vmulh.vv v9, v8, v9 ; RV32NOM-NEXT: lui a0, 1044480 ; RV32NOM-NEXT: vmv.s.x v10, a0 -; RV32NOM-NEXT: vsext.vf4 v11, v10 -; RV32NOM-NEXT: vand.vv v8, v8, v11 -; RV32NOM-NEXT: vadd.vv v8, v9, v8 ; RV32NOM-NEXT: lui a0, 12320 ; RV32NOM-NEXT: addi a0, a0, 257 +; RV32NOM-NEXT: vsext.vf4 v11, v10 +; RV32NOM-NEXT: vand.vv v10, v8, v11 +; RV32NOM-NEXT: vmulh.vv v8, v8, v9 ; RV32NOM-NEXT: vmv.s.x v9, a0 -; RV32NOM-NEXT: vsext.vf4 v10, v9 -; RV32NOM-NEXT: vsra.vv v9, v8, v10 +; RV32NOM-NEXT: vsext.vf4 v11, v9 +; RV32NOM-NEXT: vadd.vv v8, v8, v10 +; RV32NOM-NEXT: vsra.vv v9, v8, v11 ; RV32NOM-NEXT: vsrl.vi v8, v8, 31 ; RV32NOM-NEXT: vadd.vv v8, v9, v8 ; RV32NOM-NEXT: vslidedown.vi v8, v8, 2 @@ -1083,10 +1083,10 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) { ; RV32M: # %bb.0: ; RV32M-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32M-NEXT: vslidedown.vi v8, v8, 2 -; RV32M-NEXT: vmv.x.s a0, v8 -; RV32M-NEXT: lui a1, 322639 -; RV32M-NEXT: addi a1, a1, -945 -; RV32M-NEXT: mulh a0, a0, a1 +; RV32M-NEXT: lui a0, 322639 +; RV32M-NEXT: vmv.x.s a1, v8 +; RV32M-NEXT: addi a0, a0, -945 +; RV32M-NEXT: mulh a0, a1, a0 ; RV32M-NEXT: srli a1, a0, 31 ; RV32M-NEXT: srai a0, a0, 2 ; RV32M-NEXT: add a0, a0, a1 @@ -1098,15 +1098,15 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) { ; RV64NOM-NEXT: addi a0, a0, %lo(.LCPI46_0) ; RV64NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64NOM-NEXT: vle32.v v9, (a0) -; RV64NOM-NEXT: vmulh.vv v9, v8, v9 ; RV64NOM-NEXT: lui a0, 1044480 ; RV64NOM-NEXT: vmv.s.x v10, a0 -; RV64NOM-NEXT: vsext.vf4 v11, v10 -; RV64NOM-NEXT: vand.vv v8, v8, v11 -; RV64NOM-NEXT: vadd.vv v8, v9, v8 ; RV64NOM-NEXT: lui a0, 12320 ; RV64NOM-NEXT: addi a0, a0, 257 +; RV64NOM-NEXT: vsext.vf4 v11, v10 +; RV64NOM-NEXT: vand.vv v10, v8, v11 +; RV64NOM-NEXT: vmulh.vv v8, v8, v9 ; RV64NOM-NEXT: vmv.s.x v9, a0 +; RV64NOM-NEXT: vadd.vv v8, v8, v10 ; RV64NOM-NEXT: vsext.vf4 v10, v9 ; RV64NOM-NEXT: vsra.vv v8, v8, v10 ; RV64NOM-NEXT: vsrl.vi v9, v8, 31 @@ -1119,10 +1119,10 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) { ; RV64M: # %bb.0: ; RV64M-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64M-NEXT: vslidedown.vi v8, v8, 2 -; RV64M-NEXT: vmv.x.s a0, v8 -; RV64M-NEXT: lui a1, 322639 -; RV64M-NEXT: addiw a1, a1, -945 -; RV64M-NEXT: mul a0, a0, a1 +; RV64M-NEXT: lui a0, 322639 +; RV64M-NEXT: vmv.x.s a1, v8 +; RV64M-NEXT: addiw a0, a0, -945 +; RV64M-NEXT: mul a0, a1, a0 ; RV64M-NEXT: srli a1, a0, 63 ; RV64M-NEXT: srai a0, a0, 34 ; RV64M-NEXT: add a0, a0, a1 @@ -1149,10 +1149,10 @@ define i32 @extractelt_udiv_v4i32(<4 x i32> %x) { ; RV32M: # %bb.0: ; RV32M-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32M-NEXT: vslidedown.vi v8, v8, 2 -; RV32M-NEXT: vmv.x.s a0, v8 -; RV32M-NEXT: lui a1, 322639 -; RV32M-NEXT: addi a1, a1, -945 -; RV32M-NEXT: mulhu a0, a0, a1 +; RV32M-NEXT: lui a0, 322639 +; 
RV32M-NEXT: vmv.x.s a1, v8 +; RV32M-NEXT: addi a0, a0, -945 +; RV32M-NEXT: mulhu a0, a1, a0 ; RV32M-NEXT: srli a0, a0, 2 ; RV32M-NEXT: ret ; @@ -1172,11 +1172,11 @@ define i32 @extractelt_udiv_v4i32(<4 x i32> %x) { ; RV64M-LABEL: extractelt_udiv_v4i32: ; RV64M: # %bb.0: ; RV64M-NEXT: lui a0, 322639 -; RV64M-NEXT: addi a0, a0, -945 -; RV64M-NEXT: slli a0, a0, 32 ; RV64M-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64M-NEXT: vslidedown.vi v8, v8, 2 +; RV64M-NEXT: addi a0, a0, -945 ; RV64M-NEXT: vmv.x.s a1, v8 +; RV64M-NEXT: slli a0, a0, 32 ; RV64M-NEXT: slli a1, a1, 32 ; RV64M-NEXT: mulhu a0, a1, a0 ; RV64M-NEXT: srli a0, a0, 34 @@ -1191,8 +1191,8 @@ define float @extractelt_fadd_v4f32(<4 x float> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: lui a0, 267520 +; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: fmv.w.x fa4, a0 ; CHECK-NEXT: fadd.s fa0, fa5, fa4 ; CHECK-NEXT: ret @@ -1206,8 +1206,8 @@ define float @extractelt_fsub_v4f32(<4 x float> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: lui a0, 267520 +; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: fmv.w.x fa4, a0 ; CHECK-NEXT: fsub.s fa0, fa4, fa5 ; CHECK-NEXT: ret @@ -1221,8 +1221,8 @@ define float @extractelt_fmul_v4f32(<4 x float> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: lui a0, 267520 +; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: fmv.w.x fa4, a0 ; CHECK-NEXT: fmul.s fa0, fa5, fa4 ; CHECK-NEXT: ret @@ -1236,8 +1236,8 @@ define float @extractelt_fdiv_v4f32(<4 x float> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: lui a0, 267520 +; CHECK-NEXT: vfmv.f.s fa5, v8 ; CHECK-NEXT: fmv.w.x fa4, a0 ; CHECK-NEXT: fdiv.s fa0, fa5, fa4 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll index 84895715e814f..ab2d00b9b9137 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll @@ -123,10 +123,10 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: ceil_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -148,10 +148,10 @@ define <1 x float> @ceil_v1f32(<1 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -171,10 +171,10 @@ define <2 x float> @ceil_v2f32(<2 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; 
CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -194,10 +194,10 @@ define <4 x float> @ceil_v4f32(<4 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -217,10 +217,10 @@ define <8 x float> @ceil_v8f32(<8 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -240,10 +240,10 @@ define <16 x float> @ceil_v16f32(<16 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll index 3c99870dba950..c6ce7c1bbe8b4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll @@ -123,10 +123,10 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: floor_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -148,10 +148,10 @@ define <1 x float> @floor_v1f32(<1 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -171,10 +171,10 @@ define <2 x float> @floor_v2f32(<2 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v 
v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -194,10 +194,10 @@ define <4 x float> @floor_v4f32(<4 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -217,10 +217,10 @@ define <8 x float> @floor_v8f32(<8 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -240,10 +240,10 @@ define <16 x float> @floor_v16f32(<16 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll index f86fc50998353..02e99ea513e69 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll @@ -32,10 +32,10 @@ define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 2 @@ -72,10 +72,10 @@ define <2 x half> @vp_floor_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -114,10 +114,10 @@ define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 2 @@ -154,10 +154,10 @@ define <4 x 
half> @vp_floor_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -197,10 +197,10 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vmv1r.v v9, v0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v9, v12, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 2 @@ -238,10 +238,10 @@ define <8 x half> @vp_floor_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -261,9 +261,9 @@ declare <16 x half> @llvm.vp.floor.v16f16(<16 x half>, <16 x i1>, i32) define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -283,10 +283,10 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vmv1r.v v10, v0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v10, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 2 @@ -324,10 +324,10 @@ define <16 x half> @vp_floor_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -561,9 +561,9 @@ declare <4 x double> @llvm.vp.floor.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_floor_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: 
vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -605,9 +605,9 @@ declare <8 x double> @llvm.vp.floor.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_floor_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -649,9 +649,9 @@ declare <15 x double> @llvm.vp.floor.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_floor_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v15f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -693,9 +693,9 @@ declare <16 x double> @llvm.vp.floor.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_floor_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v16f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.floor.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v6, v0 +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: lui a1, %hi(.LCPI26_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, 
fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a1, 2 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -808,27 +818,30 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: lui a2, %hi(.LCPI27_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; CHECK-NEXT: addi a2, a0, -16 +; CHECK-NEXT: sltu a0, a0, a2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: fsrmi a2, 2 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a1, 2 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v7, v24, fa5 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: fsrm a2 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a0 
+; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll index cefb246f3821a..9a3838d57a0b0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll @@ -63,11 +63,9 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -135,11 +133,9 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 ; ZVFHMIN-NEXT: vmv.v.v v0, v8 @@ -209,11 +205,9 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -285,11 +279,9 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 ; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -610,56 +602,72 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: li a3, 24 ; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, 
a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v6, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v16, (a0) ; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a1, .LBB24_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: .LBB24_2: -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v26 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vmfeq.vv v26, v16, v16, v0.t +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v26 -; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmax.vv v8, v8, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill @@ -669,7 +677,7 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: and a0, a1, a0 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -677,8 +685,7 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: vmfeq.vv v25, v16, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; 
CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -688,12 +695,13 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v16, v16, v8, v0.t +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v8, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -718,18 +726,21 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vle64.v v24, (a0) ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a1, .LBB25_2 ; CHECK-NEXT: # %bb.1: @@ -738,36 +749,25 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmv8r.v v16, v24 -; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v8, v8, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv 
v0, v16, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -776,10 +776,7 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 ; CHECK-NEXT: vfmax.vv v16, v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll index ae592119cf881..900e02876cbe1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll @@ -315,21 +315,19 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9 -; ZVFHMIN-NEXT: vmerge.vvm v10, v11, v9, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 +; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v10, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0 -; ZVFHMIN-NEXT: vfmax.vv v9, v10, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v11, v0 +; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -356,14 +354,12 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v9, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll index 35bd8b7c17e6d..4a7f888fbced4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll @@ -63,11 +63,9 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; 
ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -135,11 +133,9 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 ; ZVFHMIN-NEXT: vmv.v.v v0, v8 @@ -209,11 +205,9 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -285,11 +279,9 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 ; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -610,56 +602,72 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: li a3, 24 ; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v6, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: csrr a1, 
vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v16, (a0) ; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a1, .LBB24_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: .LBB24_2: -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v26 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vmfeq.vv v26, v16, v16, v0.t +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v26 -; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmin.vv v8, v8, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill @@ -669,7 +677,7 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: and a0, a1, a0 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -677,8 +685,7 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: vmfeq.vv v25, v16, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -688,12 +695,13 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v16, v16, v8, v0.t +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v8, v16, v0.t ; CHECK-NEXT: csrr a0, 
vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -718,18 +726,21 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vle64.v v24, (a0) ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a1, .LBB25_2 ; CHECK-NEXT: # %bb.1: @@ -738,36 +749,25 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmv8r.v v16, v24 -; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v8, v8, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -776,10 +776,7 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 ; CHECK-NEXT: vfmin.vv v16, v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll index 8e042fc0785e1..db970c89d935c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll @@ -315,21 +315,19 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9 -; ZVFHMIN-NEXT: vmerge.vvm v10, v11, v9, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 +; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v10, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0 -; ZVFHMIN-NEXT: vfmin.vv v9, v10, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v11, v0 +; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -356,14 +354,12 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v9, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll index 0b9fabb832e29..3a7ded1537ef6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll @@ -106,10 +106,10 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI4_0) ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -132,10 +132,10 @@ define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; 
CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -156,10 +156,10 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -180,10 +180,10 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -204,10 +204,10 @@ define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 10de74824548c..b0f8bc9dcc6bd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -42,11 +42,11 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: li a0, 7 +; CHECK-NEXT: vmv.v.i v0, 12 ; CHECK-NEXT: vmul.vx v14, v12, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vrgatherei16.vv v12, v8, v14 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vadd.vi v8, v14, -14 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t @@ -252,8 +252,8 @@ define dso_local void @splat_load_licm(ptr %0) { ; RV32-LABEL: splat_load_licm: ; RV32: # %bb.0: ; RV32-NEXT: lui a1, 1 -; RV32-NEXT: add a1, a0, a1 ; RV32-NEXT: lui a2, 263168 +; RV32-NEXT: add a1, a0, a1 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a2 ; RV32-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 @@ -266,8 +266,8 @@ define dso_local void @splat_load_licm(ptr %0) { ; RV64V-LABEL: splat_load_licm: ; RV64V: # %bb.0: ; RV64V-NEXT: lui a1, 1 -; RV64V-NEXT: add a1, a0, a1 ; RV64V-NEXT: lui a2, 263168 +; RV64V-NEXT: add a1, a0, a1 ; RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64V-NEXT: vmv.v.x v8, a2 ; RV64V-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 @@ -280,8 +280,8 @@ define dso_local void @splat_load_licm(ptr %0) { ; RVA22U64-LABEL: splat_load_licm: ; RVA22U64: # %bb.0: ; RVA22U64-NEXT: lui a1, 1 -; RVA22U64-NEXT: add a1, a1, a0 ; RVA22U64-NEXT: lui a2, 263168 +; RVA22U64-NEXT: add a1, a1, a0 ; RVA22U64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; 
RVA22U64-NEXT: vmv.v.x v8, a2 ; RVA22U64-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 @@ -294,8 +294,8 @@ define dso_local void @splat_load_licm(ptr %0) { ; RV64ZVFHMIN-LABEL: splat_load_licm: ; RV64ZVFHMIN: # %bb.0: ; RV64ZVFHMIN-NEXT: lui a1, 1 -; RV64ZVFHMIN-NEXT: add a1, a0, a1 ; RV64ZVFHMIN-NEXT: lui a2, 263168 +; RV64ZVFHMIN-NEXT: add a1, a0, a1 ; RV64ZVFHMIN-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64ZVFHMIN-NEXT: vmv.v.x v8, a2 ; RV64ZVFHMIN-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 @@ -593,22 +593,6 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3, ; RV32-NEXT: flw fs1, 52(s0) ; RV32-NEXT: flw fs2, 56(s0) ; RV32-NEXT: flw fs3, 60(s0) -; RV32-NEXT: fsw fs0, 112(sp) -; RV32-NEXT: fsw fs1, 116(sp) -; RV32-NEXT: fsw fs2, 120(sp) -; RV32-NEXT: fsw fs3, 124(sp) -; RV32-NEXT: fsw ft8, 96(sp) -; RV32-NEXT: fsw ft9, 100(sp) -; RV32-NEXT: fsw ft10, 104(sp) -; RV32-NEXT: fsw ft11, 108(sp) -; RV32-NEXT: fsw ft4, 80(sp) -; RV32-NEXT: fsw ft5, 84(sp) -; RV32-NEXT: fsw ft6, 88(sp) -; RV32-NEXT: fsw ft7, 92(sp) -; RV32-NEXT: fsw ft0, 64(sp) -; RV32-NEXT: fsw ft1, 68(sp) -; RV32-NEXT: fsw ft2, 72(sp) -; RV32-NEXT: fsw ft3, 76(sp) ; RV32-NEXT: sw a4, 48(sp) ; RV32-NEXT: sw a5, 52(sp) ; RV32-NEXT: sw a6, 56(sp) @@ -626,6 +610,22 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3, ; RV32-NEXT: fsw fa2, 8(sp) ; RV32-NEXT: fsw fa3, 12(sp) ; RV32-NEXT: li a0, 32 +; RV32-NEXT: fsw fs0, 112(sp) +; RV32-NEXT: fsw fs1, 116(sp) +; RV32-NEXT: fsw fs2, 120(sp) +; RV32-NEXT: fsw fs3, 124(sp) +; RV32-NEXT: fsw ft8, 96(sp) +; RV32-NEXT: fsw ft9, 100(sp) +; RV32-NEXT: fsw ft10, 104(sp) +; RV32-NEXT: fsw ft11, 108(sp) +; RV32-NEXT: fsw ft4, 80(sp) +; RV32-NEXT: fsw ft5, 84(sp) +; RV32-NEXT: fsw ft6, 88(sp) +; RV32-NEXT: fsw ft7, 92(sp) +; RV32-NEXT: fsw ft0, 64(sp) +; RV32-NEXT: fsw ft1, 68(sp) +; RV32-NEXT: fsw ft2, 72(sp) +; RV32-NEXT: fsw ft3, 76(sp) ; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV32-NEXT: vle32.v v8, (a1) @@ -682,22 +682,22 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3, ; RV64-NEXT: addi s0, sp, 256 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -128 -; RV64-NEXT: fmv.w.x ft0, a0 -; RV64-NEXT: fmv.w.x ft1, a1 -; RV64-NEXT: fmv.w.x ft2, a2 -; RV64-NEXT: fmv.w.x ft3, a3 -; RV64-NEXT: fmv.w.x ft4, a4 -; RV64-NEXT: fmv.w.x ft5, a5 -; RV64-NEXT: fmv.w.x ft6, a6 -; RV64-NEXT: fmv.w.x ft7, a7 -; RV64-NEXT: flw ft8, 0(s0) -; RV64-NEXT: flw ft9, 8(s0) -; RV64-NEXT: flw ft10, 16(s0) -; RV64-NEXT: flw ft11, 24(s0) -; RV64-NEXT: flw fs0, 32(s0) -; RV64-NEXT: flw fs1, 40(s0) -; RV64-NEXT: flw fs2, 48(s0) -; RV64-NEXT: flw fs3, 56(s0) +; RV64-NEXT: fmv.w.x ft4, a0 +; RV64-NEXT: fmv.w.x ft5, a1 +; RV64-NEXT: fmv.w.x ft6, a2 +; RV64-NEXT: fmv.w.x ft7, a3 +; RV64-NEXT: fmv.w.x fs0, a4 +; RV64-NEXT: fmv.w.x fs1, a5 +; RV64-NEXT: fmv.w.x fs2, a6 +; RV64-NEXT: fmv.w.x fs3, a7 +; RV64-NEXT: flw ft0, 0(s0) +; RV64-NEXT: flw ft1, 8(s0) +; RV64-NEXT: flw ft2, 16(s0) +; RV64-NEXT: flw ft3, 24(s0) +; RV64-NEXT: flw ft8, 32(s0) +; RV64-NEXT: flw ft9, 40(s0) +; RV64-NEXT: flw ft10, 48(s0) +; RV64-NEXT: flw ft11, 56(s0) ; RV64-NEXT: flw fs4, 64(s0) ; RV64-NEXT: flw fs5, 72(s0) ; RV64-NEXT: flw fs6, 80(s0) @@ -706,22 +706,6 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3, ; RV64-NEXT: flw fs9, 104(s0) ; RV64-NEXT: flw fs10, 112(s0) ; RV64-NEXT: flw fs11, 120(s0) -; RV64-NEXT: fsw fs8, 112(sp) -; RV64-NEXT: fsw fs9, 116(sp) -; 
RV64-NEXT: fsw fs10, 120(sp) -; RV64-NEXT: fsw fs11, 124(sp) -; RV64-NEXT: fsw fs4, 96(sp) -; RV64-NEXT: fsw fs5, 100(sp) -; RV64-NEXT: fsw fs6, 104(sp) -; RV64-NEXT: fsw fs7, 108(sp) -; RV64-NEXT: fsw fs0, 80(sp) -; RV64-NEXT: fsw fs1, 84(sp) -; RV64-NEXT: fsw fs2, 88(sp) -; RV64-NEXT: fsw fs3, 92(sp) -; RV64-NEXT: fsw ft8, 64(sp) -; RV64-NEXT: fsw ft9, 68(sp) -; RV64-NEXT: fsw ft10, 72(sp) -; RV64-NEXT: fsw ft11, 76(sp) ; RV64-NEXT: fsw fa4, 16(sp) ; RV64-NEXT: fsw fa5, 20(sp) ; RV64-NEXT: fsw fa6, 24(sp) @@ -730,15 +714,31 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3, ; RV64-NEXT: fsw fa1, 4(sp) ; RV64-NEXT: fsw fa2, 8(sp) ; RV64-NEXT: fsw fa3, 12(sp) -; RV64-NEXT: fsw ft4, 48(sp) -; RV64-NEXT: fsw ft5, 52(sp) -; RV64-NEXT: fsw ft6, 56(sp) -; RV64-NEXT: fsw ft7, 60(sp) -; RV64-NEXT: fsw ft0, 32(sp) -; RV64-NEXT: fsw ft1, 36(sp) -; RV64-NEXT: fsw ft2, 40(sp) -; RV64-NEXT: fsw ft3, 44(sp) ; RV64-NEXT: li a0, 32 +; RV64-NEXT: fsw fs0, 48(sp) +; RV64-NEXT: fsw fs1, 52(sp) +; RV64-NEXT: fsw fs2, 56(sp) +; RV64-NEXT: fsw fs3, 60(sp) +; RV64-NEXT: fsw ft4, 32(sp) +; RV64-NEXT: fsw ft5, 36(sp) +; RV64-NEXT: fsw ft6, 40(sp) +; RV64-NEXT: fsw ft7, 44(sp) +; RV64-NEXT: fsw fs8, 112(sp) +; RV64-NEXT: fsw fs9, 116(sp) +; RV64-NEXT: fsw fs10, 120(sp) +; RV64-NEXT: fsw fs11, 124(sp) +; RV64-NEXT: fsw fs4, 96(sp) +; RV64-NEXT: fsw fs5, 100(sp) +; RV64-NEXT: fsw fs6, 104(sp) +; RV64-NEXT: fsw fs7, 108(sp) +; RV64-NEXT: fsw ft8, 80(sp) +; RV64-NEXT: fsw ft9, 84(sp) +; RV64-NEXT: fsw ft10, 88(sp) +; RV64-NEXT: fsw ft11, 92(sp) +; RV64-NEXT: fsw ft0, 64(sp) +; RV64-NEXT: fsw ft1, 68(sp) +; RV64-NEXT: fsw ft2, 72(sp) +; RV64-NEXT: fsw ft3, 76(sp) ; RV64-NEXT: mv a1, sp ; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV64-NEXT: vle32.v v8, (a1) @@ -899,24 +899,24 @@ define <16 x double> @buildvec_v16f64(double %e0, double %e1, double %e2, double ; RV32-NEXT: andi sp, sp, -128 ; RV32-NEXT: sw a0, 120(sp) ; RV32-NEXT: sw a1, 124(sp) -; RV32-NEXT: fld ft0, 120(sp) +; RV32-NEXT: fld ft0, 0(s0) +; RV32-NEXT: fld ft1, 8(s0) +; RV32-NEXT: fld ft2, 16(s0) +; RV32-NEXT: fld ft3, 24(s0) +; RV32-NEXT: fld ft4, 120(sp) ; RV32-NEXT: sw a2, 120(sp) ; RV32-NEXT: sw a3, 124(sp) -; RV32-NEXT: fld ft1, 120(sp) +; RV32-NEXT: fld ft5, 120(sp) ; RV32-NEXT: sw a4, 120(sp) ; RV32-NEXT: sw a5, 124(sp) -; RV32-NEXT: fld ft2, 120(sp) +; RV32-NEXT: fld ft6, 120(sp) ; RV32-NEXT: sw a6, 120(sp) ; RV32-NEXT: sw a7, 124(sp) -; RV32-NEXT: fld ft3, 120(sp) -; RV32-NEXT: fld ft4, 0(s0) -; RV32-NEXT: fld ft5, 8(s0) -; RV32-NEXT: fld ft6, 16(s0) -; RV32-NEXT: fld ft7, 24(s0) -; RV32-NEXT: fsd ft4, 224(sp) -; RV32-NEXT: fsd ft5, 232(sp) -; RV32-NEXT: fsd ft6, 240(sp) -; RV32-NEXT: fsd ft7, 248(sp) +; RV32-NEXT: fld ft7, 120(sp) +; RV32-NEXT: fsd ft0, 224(sp) +; RV32-NEXT: fsd ft1, 232(sp) +; RV32-NEXT: fsd ft2, 240(sp) +; RV32-NEXT: fsd ft3, 248(sp) ; RV32-NEXT: fsd fa4, 160(sp) ; RV32-NEXT: fsd fa5, 168(sp) ; RV32-NEXT: fsd fa6, 176(sp) @@ -925,10 +925,10 @@ define <16 x double> @buildvec_v16f64(double %e0, double %e1, double %e2, double ; RV32-NEXT: fsd fa1, 136(sp) ; RV32-NEXT: fsd fa2, 144(sp) ; RV32-NEXT: fsd fa3, 152(sp) -; RV32-NEXT: fsd ft0, 192(sp) -; RV32-NEXT: fsd ft1, 200(sp) -; RV32-NEXT: fsd ft2, 208(sp) -; RV32-NEXT: fsd ft3, 216(sp) +; RV32-NEXT: fsd ft4, 192(sp) +; RV32-NEXT: fsd ft5, 200(sp) +; RV32-NEXT: fsd ft6, 208(sp) +; RV32-NEXT: fsd ft7, 216(sp) ; RV32-NEXT: addi a0, sp, 128 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) @@ -1038,56 +1038,58 @@ define 
<32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double ; RV32-NEXT: andi sp, sp, -128 ; RV32-NEXT: sw a0, 120(sp) ; RV32-NEXT: sw a1, 124(sp) -; RV32-NEXT: fld ft0, 120(sp) +; RV32-NEXT: fld ft0, 0(s0) +; RV32-NEXT: fld ft1, 8(s0) +; RV32-NEXT: fld ft2, 16(s0) +; RV32-NEXT: fld ft3, 24(s0) +; RV32-NEXT: fld ft4, 32(s0) +; RV32-NEXT: fld ft5, 40(s0) +; RV32-NEXT: fld ft6, 48(s0) +; RV32-NEXT: fld ft7, 56(s0) +; RV32-NEXT: fld ft8, 64(s0) +; RV32-NEXT: fld ft9, 72(s0) +; RV32-NEXT: fld ft10, 80(s0) +; RV32-NEXT: fld ft11, 88(s0) +; RV32-NEXT: fld fs0, 96(s0) +; RV32-NEXT: fld fs1, 104(s0) +; RV32-NEXT: fld fs2, 112(s0) +; RV32-NEXT: fld fs3, 120(s0) +; RV32-NEXT: fld fs4, 128(s0) +; RV32-NEXT: fld fs5, 136(s0) +; RV32-NEXT: fld fs6, 144(s0) +; RV32-NEXT: fld fs7, 152(s0) +; RV32-NEXT: addi a0, sp, 128 +; RV32-NEXT: addi a1, sp, 256 +; RV32-NEXT: fld fs8, 120(sp) ; RV32-NEXT: sw a2, 120(sp) ; RV32-NEXT: sw a3, 124(sp) -; RV32-NEXT: fld ft1, 120(sp) +; RV32-NEXT: fld fs9, 120(sp) ; RV32-NEXT: sw a4, 120(sp) ; RV32-NEXT: sw a5, 124(sp) -; RV32-NEXT: fld ft2, 120(sp) +; RV32-NEXT: fld fs10, 120(sp) ; RV32-NEXT: sw a6, 120(sp) ; RV32-NEXT: sw a7, 124(sp) -; RV32-NEXT: fld ft3, 120(sp) -; RV32-NEXT: fld ft4, 0(s0) -; RV32-NEXT: fld ft5, 8(s0) -; RV32-NEXT: fld ft6, 16(s0) -; RV32-NEXT: fld ft7, 24(s0) -; RV32-NEXT: fld ft8, 32(s0) -; RV32-NEXT: fld ft9, 40(s0) -; RV32-NEXT: fld ft10, 48(s0) -; RV32-NEXT: fld ft11, 56(s0) -; RV32-NEXT: fld fs0, 64(s0) -; RV32-NEXT: fld fs1, 72(s0) -; RV32-NEXT: fld fs2, 80(s0) -; RV32-NEXT: fld fs3, 88(s0) -; RV32-NEXT: fld fs4, 96(s0) -; RV32-NEXT: fld fs5, 104(s0) -; RV32-NEXT: fld fs6, 112(s0) -; RV32-NEXT: fld fs7, 120(s0) -; RV32-NEXT: fld fs8, 128(s0) -; RV32-NEXT: fld fs9, 136(s0) -; RV32-NEXT: fld fs10, 144(s0) -; RV32-NEXT: fld fs11, 152(s0) -; RV32-NEXT: fsd fs8, 224(sp) -; RV32-NEXT: fsd fs9, 232(sp) -; RV32-NEXT: fsd fs10, 240(sp) -; RV32-NEXT: fsd fs11, 248(sp) -; RV32-NEXT: fsd fs4, 192(sp) -; RV32-NEXT: fsd fs5, 200(sp) -; RV32-NEXT: fsd fs6, 208(sp) -; RV32-NEXT: fsd fs7, 216(sp) -; RV32-NEXT: fsd fs0, 160(sp) -; RV32-NEXT: fsd fs1, 168(sp) -; RV32-NEXT: fsd fs2, 176(sp) -; RV32-NEXT: fsd fs3, 184(sp) -; RV32-NEXT: fsd ft8, 128(sp) -; RV32-NEXT: fsd ft9, 136(sp) -; RV32-NEXT: fsd ft10, 144(sp) -; RV32-NEXT: fsd ft11, 152(sp) -; RV32-NEXT: fsd ft4, 352(sp) -; RV32-NEXT: fsd ft5, 360(sp) -; RV32-NEXT: fsd ft6, 368(sp) -; RV32-NEXT: fsd ft7, 376(sp) +; RV32-NEXT: fld fs11, 120(sp) +; RV32-NEXT: fsd fs4, 224(sp) +; RV32-NEXT: fsd fs5, 232(sp) +; RV32-NEXT: fsd fs6, 240(sp) +; RV32-NEXT: fsd fs7, 248(sp) +; RV32-NEXT: fsd fs0, 192(sp) +; RV32-NEXT: fsd fs1, 200(sp) +; RV32-NEXT: fsd fs2, 208(sp) +; RV32-NEXT: fsd fs3, 216(sp) +; RV32-NEXT: fsd ft8, 160(sp) +; RV32-NEXT: fsd ft9, 168(sp) +; RV32-NEXT: fsd ft10, 176(sp) +; RV32-NEXT: fsd ft11, 184(sp) +; RV32-NEXT: fsd ft4, 128(sp) +; RV32-NEXT: fsd ft5, 136(sp) +; RV32-NEXT: fsd ft6, 144(sp) +; RV32-NEXT: fsd ft7, 152(sp) +; RV32-NEXT: fsd ft0, 352(sp) +; RV32-NEXT: fsd ft1, 360(sp) +; RV32-NEXT: fsd ft2, 368(sp) +; RV32-NEXT: fsd ft3, 376(sp) ; RV32-NEXT: fsd fa4, 288(sp) ; RV32-NEXT: fsd fa5, 296(sp) ; RV32-NEXT: fsd fa6, 304(sp) @@ -1096,15 +1098,13 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double ; RV32-NEXT: fsd fa1, 264(sp) ; RV32-NEXT: fsd fa2, 272(sp) ; RV32-NEXT: fsd fa3, 280(sp) -; RV32-NEXT: fsd ft0, 320(sp) -; RV32-NEXT: fsd ft1, 328(sp) -; RV32-NEXT: fsd ft2, 336(sp) -; RV32-NEXT: fsd ft3, 344(sp) -; RV32-NEXT: addi a0, sp, 128 +; 
RV32-NEXT: fsd fs8, 320(sp) +; RV32-NEXT: fsd fs9, 328(sp) +; RV32-NEXT: fsd fs10, 336(sp) +; RV32-NEXT: fsd fs11, 344(sp) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v16, (a0) -; RV32-NEXT: addi a0, sp, 256 -; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vle64.v v8, (a1) ; RV32-NEXT: addi sp, s0, -512 ; RV32-NEXT: .cfi_def_cfa sp, 512 ; RV32-NEXT: lw ra, 508(sp) # 4-byte Folded Reload @@ -1190,6 +1190,8 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double ; RV64-NEXT: fsd fa1, 136(sp) ; RV64-NEXT: fsd fa2, 144(sp) ; RV64-NEXT: fsd fa3, 152(sp) +; RV64-NEXT: addi a0, sp, 128 +; RV64-NEXT: mv a1, sp ; RV64-NEXT: fsd fs0, 96(sp) ; RV64-NEXT: fsd fs1, 104(sp) ; RV64-NEXT: fsd fs2, 112(sp) @@ -1206,11 +1208,9 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double ; RV64-NEXT: fsd ft1, 8(sp) ; RV64-NEXT: fsd ft2, 16(sp) ; RV64-NEXT: fsd ft3, 24(sp) -; RV64-NEXT: addi a0, sp, 128 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: mv a0, sp -; RV64-NEXT: vle64.v v16, (a0) +; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi sp, s0, -384 ; RV64-NEXT: .cfi_def_cfa sp, 384 ; RV64-NEXT: ld ra, 376(sp) # 8-byte Folded Reload @@ -1266,20 +1266,19 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7, double %e8, double %e9, double %e10, double %e11, double %e12, double %e13, double %e14, double %e15, double %e16, double %e17, double %e18, double %e19, double %e20, double %e21, double %e22, double %e23, double %e24, double %e25, double %e26, double %e27, double %e28, double %e29, double %e30, double %e31) vscale_range(2,2) { ; RV32-LABEL: buildvec_v32f64_exact_vlen: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -112 -; RV32-NEXT: .cfi_def_cfa_offset 112 -; RV32-NEXT: fsd fs0, 104(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs1, 96(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs2, 88(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs3, 80(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs4, 72(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs5, 64(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs6, 56(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs7, 48(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs8, 40(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs9, 32(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs10, 24(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs11, 16(sp) # 8-byte Folded Spill +; RV32-NEXT: addi sp, sp, -96 +; RV32-NEXT: .cfi_def_cfa_offset 96 +; RV32-NEXT: fsd fs0, 88(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs1, 80(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs2, 72(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs3, 64(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs4, 56(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs5, 48(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs6, 40(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs7, 32(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs8, 24(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs9, 16(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs10, 8(sp) # 8-byte Folded Spill ; RV32-NEXT: .cfi_offset fs0, -8 ; RV32-NEXT: .cfi_offset fs1, -16 ; RV32-NEXT: .cfi_offset fs2, -24 @@ -1291,84 +1290,82 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV32-NEXT: .cfi_offset fs8, -72 ; RV32-NEXT: .cfi_offset fs9, -80 ; RV32-NEXT: .cfi_offset fs10, -88 -; RV32-NEXT: 
.cfi_offset fs11, -96 -; RV32-NEXT: sw a6, 8(sp) -; RV32-NEXT: sw a7, 12(sp) -; RV32-NEXT: fld ft6, 8(sp) -; RV32-NEXT: sw a4, 8(sp) -; RV32-NEXT: sw a5, 12(sp) -; RV32-NEXT: fld ft7, 8(sp) -; RV32-NEXT: sw a2, 8(sp) -; RV32-NEXT: sw a3, 12(sp) -; RV32-NEXT: fld ft8, 8(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: fld ft9, 8(sp) -; RV32-NEXT: fld ft0, 264(sp) -; RV32-NEXT: fld ft1, 256(sp) -; RV32-NEXT: fld ft2, 248(sp) -; RV32-NEXT: fld ft3, 240(sp) -; RV32-NEXT: fld ft4, 232(sp) -; RV32-NEXT: fld ft5, 224(sp) -; RV32-NEXT: fld ft10, 216(sp) -; RV32-NEXT: fld ft11, 208(sp) -; RV32-NEXT: fld fs0, 200(sp) +; RV32-NEXT: sw a6, 0(sp) +; RV32-NEXT: sw a7, 4(sp) +; RV32-NEXT: fld ft0, 248(sp) +; RV32-NEXT: fld ft1, 240(sp) +; RV32-NEXT: fld ft2, 232(sp) +; RV32-NEXT: fld ft3, 224(sp) +; RV32-NEXT: fld ft6, 216(sp) +; RV32-NEXT: fld ft8, 208(sp) +; RV32-NEXT: fld ft10, 200(sp) ; RV32-NEXT: fld fs1, 192(sp) -; RV32-NEXT: fld fs2, 184(sp) -; RV32-NEXT: fld fs3, 176(sp) -; RV32-NEXT: fld fs4, 152(sp) -; RV32-NEXT: fld fs5, 144(sp) -; RV32-NEXT: fld fs6, 168(sp) -; RV32-NEXT: fld fs7, 160(sp) -; RV32-NEXT: fld fs8, 136(sp) -; RV32-NEXT: fld fs9, 128(sp) -; RV32-NEXT: fld fs10, 120(sp) -; RV32-NEXT: fld fs11, 112(sp) +; RV32-NEXT: fld ft11, 184(sp) +; RV32-NEXT: fld fs4, 176(sp) +; RV32-NEXT: fld fs2, 168(sp) +; RV32-NEXT: fld fs5, 160(sp) +; RV32-NEXT: fld fs3, 136(sp) +; RV32-NEXT: fld fs6, 128(sp) +; RV32-NEXT: fld fs7, 152(sp) +; RV32-NEXT: fld fs8, 144(sp) +; RV32-NEXT: fld ft4, 120(sp) +; RV32-NEXT: fld ft5, 112(sp) +; RV32-NEXT: fld ft7, 104(sp) +; RV32-NEXT: fld ft9, 96(sp) ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vfmv.v.f v8, ft9 -; RV32-NEXT: vfslide1down.vf v12, v8, ft8 ; RV32-NEXT: vfmv.v.f v8, fa2 +; RV32-NEXT: fld fa2, 0(sp) +; RV32-NEXT: sw a4, 0(sp) +; RV32-NEXT: sw a5, 4(sp) +; RV32-NEXT: fld fs0, 0(sp) +; RV32-NEXT: sw a2, 0(sp) +; RV32-NEXT: sw a3, 4(sp) +; RV32-NEXT: fld fs9, 0(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: fld fs10, 0(sp) +; RV32-NEXT: vfmv.v.f v9, fs8 +; RV32-NEXT: vfmv.v.f v10, fs6 +; RV32-NEXT: vfmv.v.f v11, fs5 +; RV32-NEXT: vfmv.v.f v12, fs4 +; RV32-NEXT: vfmv.v.f v13, fs1 +; RV32-NEXT: vfslide1down.vf v17, v9, fs7 +; RV32-NEXT: vfslide1down.vf v16, v10, fs3 +; RV32-NEXT: vfslide1down.vf v18, v11, fs2 +; RV32-NEXT: vfmv.v.f v9, fs10 +; RV32-NEXT: vfslide1down.vf v19, v12, ft11 +; RV32-NEXT: vfslide1down.vf v20, v13, ft10 +; RV32-NEXT: vfslide1down.vf v12, v9, fs9 ; RV32-NEXT: vfslide1down.vf v9, v8, fa3 +; RV32-NEXT: vfmv.v.f v8, ft8 +; RV32-NEXT: vfslide1down.vf v21, v8, ft6 ; RV32-NEXT: vfmv.v.f v8, fa0 ; RV32-NEXT: vfslide1down.vf v8, v8, fa1 +; RV32-NEXT: vfmv.v.f v10, ft3 +; RV32-NEXT: vfslide1down.vf v22, v10, ft2 ; RV32-NEXT: vfmv.v.f v10, fa4 ; RV32-NEXT: vfslide1down.vf v10, v10, fa5 ; RV32-NEXT: vfmv.v.f v11, fa6 ; RV32-NEXT: vfslide1down.vf v11, v11, fa7 -; RV32-NEXT: vfmv.v.f v13, ft7 -; RV32-NEXT: vfslide1down.vf v13, v13, ft6 -; RV32-NEXT: vfmv.v.f v14, fs11 -; RV32-NEXT: vfslide1down.vf v14, v14, fs10 -; RV32-NEXT: vfmv.v.f v15, fs9 -; RV32-NEXT: vfslide1down.vf v15, v15, fs8 -; RV32-NEXT: vfmv.v.f v16, fs7 -; RV32-NEXT: vfslide1down.vf v17, v16, fs6 -; RV32-NEXT: vfmv.v.f v16, fs5 -; RV32-NEXT: vfslide1down.vf v16, v16, fs4 -; RV32-NEXT: vfmv.v.f v18, fs3 -; RV32-NEXT: vfslide1down.vf v18, v18, fs2 -; RV32-NEXT: vfmv.v.f v19, fs1 -; RV32-NEXT: vfslide1down.vf v19, v19, fs0 -; RV32-NEXT: vfmv.v.f v20, ft11 -; RV32-NEXT: vfslide1down.vf v20, v20, ft10 -; RV32-NEXT: vfmv.v.f 
v21, ft5 -; RV32-NEXT: vfslide1down.vf v21, v21, ft4 -; RV32-NEXT: vfmv.v.f v22, ft3 -; RV32-NEXT: vfslide1down.vf v22, v22, ft2 +; RV32-NEXT: vfmv.v.f v13, fs0 +; RV32-NEXT: vfslide1down.vf v13, v13, fa2 +; RV32-NEXT: vfmv.v.f v14, ft9 +; RV32-NEXT: vfslide1down.vf v14, v14, ft7 +; RV32-NEXT: vfmv.v.f v15, ft5 +; RV32-NEXT: vfslide1down.vf v15, v15, ft4 ; RV32-NEXT: vfmv.v.f v23, ft1 ; RV32-NEXT: vfslide1down.vf v23, v23, ft0 -; RV32-NEXT: fld fs0, 104(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs1, 96(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs2, 88(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs3, 80(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs4, 72(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs5, 64(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs6, 56(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs7, 48(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs8, 40(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs9, 32(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs10, 24(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs11, 16(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs0, 88(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs1, 80(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs2, 72(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs3, 64(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs4, 56(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs5, 48(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs6, 40(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs7, 32(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs8, 24(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs9, 16(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs10, 8(sp) # 8-byte Folded Reload ; RV32-NEXT: .cfi_restore fs0 ; RV32-NEXT: .cfi_restore fs1 ; RV32-NEXT: .cfi_restore fs2 @@ -1380,8 +1377,7 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV32-NEXT: .cfi_restore fs8 ; RV32-NEXT: .cfi_restore fs9 ; RV32-NEXT: .cfi_restore fs10 -; RV32-NEXT: .cfi_restore fs11 -; RV32-NEXT: addi sp, sp, 112 +; RV32-NEXT: addi sp, sp, 96 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1405,18 +1401,18 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV64-NEXT: .cfi_offset fs5, -48 ; RV64-NEXT: .cfi_offset fs6, -56 ; RV64-NEXT: .cfi_offset fs7, -64 -; RV64-NEXT: fmv.d.x ft4, a7 -; RV64-NEXT: fmv.d.x ft5, a5 -; RV64-NEXT: fmv.d.x ft6, a3 -; RV64-NEXT: fmv.d.x ft7, a1 +; RV64-NEXT: fmv.d.x ft6, a7 +; RV64-NEXT: fmv.d.x ft9, a5 +; RV64-NEXT: fmv.d.x ft10, a3 +; RV64-NEXT: fmv.d.x ft11, a1 ; RV64-NEXT: fld ft0, 184(sp) ; RV64-NEXT: fld ft1, 176(sp) ; RV64-NEXT: fld ft2, 168(sp) ; RV64-NEXT: fld ft3, 160(sp) -; RV64-NEXT: fld ft8, 152(sp) -; RV64-NEXT: fld ft9, 144(sp) -; RV64-NEXT: fld ft10, 136(sp) -; RV64-NEXT: fld ft11, 128(sp) +; RV64-NEXT: fld ft4, 152(sp) +; RV64-NEXT: fld ft5, 144(sp) +; RV64-NEXT: fld ft7, 136(sp) +; RV64-NEXT: fld ft8, 128(sp) ; RV64-NEXT: fld fs0, 120(sp) ; RV64-NEXT: fld fs1, 112(sp) ; RV64-NEXT: fld fs2, 104(sp) @@ -1427,33 +1423,33 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV64-NEXT: fld fs7, 80(sp) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vfmv.v.f v8, fa2 +; RV64-NEXT: vfmv.v.f v10, fa0 +; RV64-NEXT: vfmv.v.f v11, fa4 +; RV64-NEXT: vfmv.v.f v12, fa6 +; RV64-NEXT: vmv.v.x v13, a0 +; RV64-NEXT: vmv.v.x v14, a2 ; RV64-NEXT: vfslide1down.vf v9, v8, fa3 -; RV64-NEXT: vfmv.v.f v8, fa0 -; RV64-NEXT: vfslide1down.vf v8, v8, fa1 -; RV64-NEXT: vfmv.v.f v10, fa4 -; RV64-NEXT: vfslide1down.vf v10, v10, fa5 -; 
RV64-NEXT: vfmv.v.f v11, fa6 -; RV64-NEXT: vfslide1down.vf v11, v11, fa7 -; RV64-NEXT: vmv.v.x v12, a0 -; RV64-NEXT: vfslide1down.vf v12, v12, ft7 -; RV64-NEXT: vmv.v.x v13, a2 -; RV64-NEXT: vfslide1down.vf v13, v13, ft6 -; RV64-NEXT: vmv.v.x v14, a4 -; RV64-NEXT: vfslide1down.vf v14, v14, ft5 -; RV64-NEXT: vmv.v.x v15, a6 -; RV64-NEXT: vfslide1down.vf v15, v15, ft4 -; RV64-NEXT: vfmv.v.f v16, fs7 -; RV64-NEXT: vfslide1down.vf v17, v16, fs6 +; RV64-NEXT: vfslide1down.vf v8, v10, fa1 +; RV64-NEXT: vfslide1down.vf v10, v11, fa5 +; RV64-NEXT: vfslide1down.vf v11, v12, fa7 +; RV64-NEXT: vfmv.v.f v15, fs7 ; RV64-NEXT: vfmv.v.f v16, fs5 +; RV64-NEXT: vfslide1down.vf v12, v13, ft11 +; RV64-NEXT: vfslide1down.vf v13, v14, ft10 +; RV64-NEXT: vfslide1down.vf v17, v15, fs6 ; RV64-NEXT: vfslide1down.vf v16, v16, fs4 -; RV64-NEXT: vfmv.v.f v18, fs3 -; RV64-NEXT: vfslide1down.vf v18, v18, fs2 +; RV64-NEXT: vmv.v.x v14, a4 +; RV64-NEXT: vfslide1down.vf v14, v14, ft9 +; RV64-NEXT: vfmv.v.f v15, fs3 +; RV64-NEXT: vfslide1down.vf v18, v15, fs2 +; RV64-NEXT: vmv.v.x v15, a6 +; RV64-NEXT: vfslide1down.vf v15, v15, ft6 ; RV64-NEXT: vfmv.v.f v19, fs1 ; RV64-NEXT: vfslide1down.vf v19, v19, fs0 -; RV64-NEXT: vfmv.v.f v20, ft11 -; RV64-NEXT: vfslide1down.vf v20, v20, ft10 -; RV64-NEXT: vfmv.v.f v21, ft9 -; RV64-NEXT: vfslide1down.vf v21, v21, ft8 +; RV64-NEXT: vfmv.v.f v20, ft8 +; RV64-NEXT: vfslide1down.vf v20, v20, ft7 +; RV64-NEXT: vfmv.v.f v21, ft5 +; RV64-NEXT: vfslide1down.vf v21, v21, ft4 ; RV64-NEXT: vfmv.v.f v22, ft3 ; RV64-NEXT: vfslide1down.vf v22, v22, ft2 ; RV64-NEXT: vfmv.v.f v23, ft1 @@ -1752,15 +1748,15 @@ define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: vfmv.v.f v9, fa4 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 +; CHECK-NEXT: vfslide1down.vf v9, v9, fa5 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 -; CHECK-NEXT: vfslide1down.vf v9, v8, fa3 -; CHECK-NEXT: vfmv.v.f v8, fa4 -; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 -; CHECK-NEXT: vfslide1down.vf v8, v8, fa6 -; CHECK-NEXT: vmv.v.i v0, 15 -; CHECK-NEXT: vfslide1down.vf v8, v8, fa7 -; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t +; CHECK-NEXT: vfslide1down.vf v9, v9, fa6 +; CHECK-NEXT: vfslide1down.vf v10, v8, fa3 +; CHECK-NEXT: vfslide1down.vf v8, v9, fa7 +; CHECK-NEXT: vslidedown.vi v8, v10, 4, v0.t ; CHECK-NEXT: ret %v0 = insertelement <8 x float> poison, float %e0, i64 0 %v1 = insertelement <8 x float> %v0, float %e1, i64 1 @@ -1803,15 +1799,15 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: vfmv.v.f v9, fa4 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 +; CHECK-NEXT: vfslide1down.vf v9, v9, fa5 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 -; CHECK-NEXT: vfslide1down.vf v9, v8, fa3 -; CHECK-NEXT: vfmv.v.f v8, fa4 -; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 -; CHECK-NEXT: vfslide1down.vf v8, v8, fa6 -; CHECK-NEXT: vmv.v.i v0, 15 -; CHECK-NEXT: vfslide1down.vf v8, v8, fa7 -; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t +; CHECK-NEXT: vfslide1down.vf v9, v9, fa6 +; CHECK-NEXT: vfslide1down.vf v10, v8, fa3 +; CHECK-NEXT: vfslide1down.vf v8, v9, fa7 +; CHECK-NEXT: vslidedown.vi v8, v10, 4, v0.t ; CHECK-NEXT: ret %v0 = insertelement <8 x double> poison, double %e0, i64 0 %v1 = insertelement <8 x double> %v0, double %e1, i64 1 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll index 5ae47a01a37e1..e4b8e9debad27 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -41,12 +41,10 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) { ; V128-NEXT: vmv1r.v v12, v9 ; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; V128-NEXT: vid.v v9 +; V128-NEXT: vmv.v.i v0, 10 ; V128-NEXT: vsrl.vi v14, v9, 1 -; V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; V128-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; V128-NEXT: vrgatherei16.vv v10, v8, v14 -; V128-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; V128-NEXT: vmv.v.i v0, 10 -; V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t ; V128-NEXT: vmv.v.v v8, v10 ; V128-NEXT: ret @@ -244,26 +242,27 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { ; V128-NEXT: slli a0, a0, 3 ; V128-NEXT: sub sp, sp, a0 ; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; V128-NEXT: vmv8r.v v0, v16 -; V128-NEXT: addi a0, sp, 16 -; V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; V128-NEXT: vmv8r.v v24, v16 ; V128-NEXT: vmv8r.v v16, v8 +; V128-NEXT: vmv8r.v v8, v24 +; V128-NEXT: addi a0, sp, 16 +; V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; V128-NEXT: vslidedown.vi v8, v0, 16 -; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; V128-NEXT: vwaddu.vv v24, v0, v8 +; V128-NEXT: vslidedown.vi v0, v24, 16 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v24, a0, v8 +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; V128-NEXT: vwaddu.vv v24, v8, v0 +; V128-NEXT: vwmaccu.vx v24, a0, v0 ; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; V128-NEXT: vslidedown.vi v0, v16, 16 +; V128-NEXT: lui a1, 699051 +; V128-NEXT: li a2, 32 ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v8, v0, v16 -; V128-NEXT: vwmaccu.vx v8, a0, v16 -; V128-NEXT: lui a1, 699051 ; V128-NEXT: addi a1, a1, -1366 ; V128-NEXT: vmv.s.x v0, a1 -; V128-NEXT: li a1, 32 -; V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; V128-NEXT: vwmaccu.vx v8, a0, v16 +; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; V128-NEXT: vmerge.vvm v24, v8, v24, v0 ; V128-NEXT: addi a1, sp, 16 ; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll index 31e2d75e514b4..c14eae0b1de61 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll @@ -558,13 +558,11 @@ define void @fcmp_ord_vv_v4f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-NEXT: vle16.v v8, (a1) ; ZVFHMIN-NEXT: vle16.v v9, (a0) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10 -; ZVFHMIN-NEXT: vmand.mm v0, v9, v8 +; ZVFHMIN-NEXT: vmfeq.vv v8, v8, v8 +; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 ; ZVFHMIN-NEXT: vmerge.vim v8, v8, 1, v0 @@ 
-610,13 +608,11 @@ define void @fcmp_uno_vv_v4f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-NEXT: vle16.v v8, (a1) ; ZVFHMIN-NEXT: vle16.v v9, (a0) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10 -; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; ZVFHMIN-NEXT: vmfne.vv v8, v8, v8 +; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 ; ZVFHMIN-NEXT: vmerge.vim v8, v8, 1, v0 @@ -1195,13 +1191,13 @@ define void @fcmp_ord_vf_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vmv.v.x v9, a0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 @@ -1249,13 +1245,13 @@ define void @fcmp_uno_vf_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vmv.v.x v9, a0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 @@ -1836,13 +1832,13 @@ define void @fcmp_ord_fv_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vmv.v.x v9, a0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmand.mm v0, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 @@ -1890,13 +1886,13 @@ define void @fcmp_uno_fv_v4f16(ptr %x, half %y, ptr %z) { ; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vmv.v.x v9, a0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; 
ZVFHMIN-NEXT: vmfne.vv v9, v10, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index c24ade1e6d8ef..36bbec12e9b06 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -901,11 +901,11 @@ define void @copysign_vf_v8bf16(ptr %x, bfloat %y) { ; CHECK-NEXT: fmv.x.w a1, fa0 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a2, 8 ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: lui a1, 8 -; CHECK-NEXT: addi a2, a1, -1 -; CHECK-NEXT: vand.vx v8, v8, a2 -; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: addi a1, a2, -1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret @@ -923,12 +923,12 @@ define void @copysign_vf_v6bf16(ptr %x, bfloat %y) { ; CHECK-NEXT: fmv.x.w a1, fa0 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a2, 8 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: lui a1, 8 -; CHECK-NEXT: addi a2, a1, -1 -; CHECK-NEXT: vand.vx v8, v8, a2 -; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: addi a1, a2, -1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vse16.v v8, (a0) @@ -955,11 +955,11 @@ define void @copysign_vf_v8f16(ptr %x, half %y) { ; ZVFHMIN-NEXT: fmv.x.w a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: addi a2, a1, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a2 -; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a2, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a2 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret @@ -985,12 +985,12 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-NEXT: fmv.x.w a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: addi a2, a1, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a2 -; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a2, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a2 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: vse16.v v8, (a0) @@ -1042,8 +1042,8 @@ define void @copysign_neg_v8bf16(ptr %x, ptr %y) { ; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: lui a1, 8 -; CHECK-NEXT: vxor.vx v8, v8, a1 ; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vxor.vx v8, v8, a1 ; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vor.vv v8, v9, v8 @@ -1064,9 +1064,9 @@ define void @copysign_neg_v6bf16(ptr %x, ptr %y) { ; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a2, a1, -1 ; CHECK-NEXT: vsetivli zero, 
8, e16, m1, ta, ma ; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: addi a2, a1, -1 ; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma @@ -1097,8 +1097,8 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFHMIN-NEXT: vle16.v v8, (a1) ; ZVFHMIN-NEXT: vle16.v v9, (a0) ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: addi a2, a1, -1 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: vand.vx v9, v9, a2 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: vor.vv v8, v9, v8 @@ -1128,9 +1128,9 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-NEXT: vle16.v v8, (a1) ; ZVFHMIN-NEXT: vle16.v v9, (a0) ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a2, a1, -1 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: addi a2, a1, -1 ; ZVFHMIN-NEXT: vand.vx v9, v9, a2 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma @@ -1602,11 +1602,11 @@ define void @fmsub_v8bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-NEXT: vle16.v v10, (a1) ; CHECK-NEXT: lui a1, 8 ; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vfmadd.vv v8, v12, v14 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) @@ -1630,11 +1630,11 @@ define void @fmsub_v6bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-NEXT: lui a1, 8 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vfmadd.vv v8, v12, v14 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) @@ -1667,11 +1667,11 @@ define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-NEXT: vle16.v v10, (a1) ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v14, v12 +; ZVFHMIN-NEXT: vfmadd.vv v8, v12, v14 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vse16.v v10, (a0) @@ -1705,11 +1705,11 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v14, v12 +; ZVFHMIN-NEXT: vfmadd.vv v8, v12, v14 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vse16.v v10, (a0) @@ -3717,14 +3717,14 @@ define void @fmsub_vf_v8bf16(ptr %x, ptr %y, 
bfloat %z) { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a2 ; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vmv.v.x v10, a2 ; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vfmadd.vv v8, v12, v14 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) @@ -3746,15 +3746,15 @@ define void @fmsub_vf_v6bf16(ptr %x, ptr %y, bfloat %z) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lui a1, 8 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v10, a2 -; CHECK-NEXT: lui a1, 8 ; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vfmadd.vv v8, v12, v14 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) @@ -3785,14 +3785,14 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a1) ; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vmv.v.x v10, a2 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vmv.v.x v10, a2 ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v14, v12 +; ZVFHMIN-NEXT: vfmadd.vv v8, v12, v14 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vse16.v v10, (a0) @@ -3823,15 +3823,15 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a1) ; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v14, v12 +; ZVFHMIN-NEXT: vfmadd.vv v8, v12, v14 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vse16.v v10, (a0) @@ -3929,11 +3929,11 @@ define void @trunc_v8bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t ; CHECK-NEXT: 
vfcvt.f.x.v v8, v8, v0.t @@ -3954,12 +3954,12 @@ define void @trunc_v6bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -3995,11 +3995,11 @@ define void @trunc_v8f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -4036,12 +4036,12 @@ define void @trunc_v6f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -4062,9 +4062,9 @@ define void @trunc_v4f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -4104,11 +4104,11 @@ define void @ceil_v8bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4131,12 +4131,12 @@ define void @ceil_v6bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4176,11 +4176,11 @@ define void @ceil_v8f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: fmv.w.x 
fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a1, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4221,12 +4221,12 @@ define void @ceil_v6f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a1, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4249,9 +4249,9 @@ define void @ceil_v4f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -4295,11 +4295,11 @@ define void @floor_v8bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4322,12 +4322,12 @@ define void @floor_v6bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4367,11 +4367,11 @@ define void @floor_v8f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a1, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4412,12 +4412,12 @@ define void @floor_v6f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a1, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4440,9 +4440,9 @@ define void @floor_v4f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vfcvt.x.f.v 
v9, v8, v0.t @@ -4486,11 +4486,11 @@ define void @round_v8bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4513,12 +4513,12 @@ define void @round_v6bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4558,11 +4558,11 @@ define void @round_v8f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a1, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4603,12 +4603,12 @@ define void @round_v6f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a1, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4631,9 +4631,9 @@ define void @round_v4f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -4677,11 +4677,11 @@ define void @rint_v8bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -4717,11 +4717,11 @@ define void @rint_v8f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v 
v8, v8, v0.t @@ -4742,9 +4742,9 @@ define void @rint_v4f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -4784,11 +4784,11 @@ define void @nearbyint_v8bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a1, 307200 -; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: frflags a1 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4828,11 +4828,11 @@ define void @nearbyint_v8f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a1, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -4855,9 +4855,9 @@ define void @nearbyint_v4f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a1 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll index da0bc5983a125..7f4483a8f77d9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -124,21 +124,21 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) { ; ZVFH32: # %bb.0: ; ZVFH32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH32-NEXT: vfncvt.rtz.x.f.w v9, v8 +; ZVFH32-NEXT: lui a1, 8 ; ZVFH32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFH32-NEXT: vmv.x.s a1, v8 ; ZVFH32-NEXT: vmv.x.s a2, v9 -; ZVFH32-NEXT: lui a3, 8 -; ZVFH32-NEXT: addi a3, a3, -1 -; ZVFH32-NEXT: and a2, a2, a3 -; ZVFH32-NEXT: vslidedown.vi v8, v9, 1 -; ZVFH32-NEXT: vmv.x.s a4, v8 -; ZVFH32-NEXT: and a3, a4, a3 -; ZVFH32-NEXT: slli a4, a1, 17 +; ZVFH32-NEXT: addi a1, a1, -1 +; ZVFH32-NEXT: vslidedown.vi v9, v9, 1 +; ZVFH32-NEXT: vmv.x.s a3, v8 +; ZVFH32-NEXT: and a2, a2, a1 +; ZVFH32-NEXT: vmv.x.s a4, v9 +; ZVFH32-NEXT: and a1, a4, a1 +; ZVFH32-NEXT: slli a4, a3, 17 +; ZVFH32-NEXT: slli a3, a3, 30 ; ZVFH32-NEXT: srli a4, a4, 19 -; ZVFH32-NEXT: slli a3, a3, 15 -; ZVFH32-NEXT: slli a1, a1, 30 +; ZVFH32-NEXT: slli a1, a1, 15 +; ZVFH32-NEXT: or a2, a2, a3 ; ZVFH32-NEXT: or a1, a2, a1 -; ZVFH32-NEXT: or a1, a1, a3 ; ZVFH32-NEXT: sw a1, 0(a0) ; ZVFH32-NEXT: sh a4, 4(a0) ; ZVFH32-NEXT: ret @@ -147,19 +147,19 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) { ; ZVFH64: # %bb.0: ; ZVFH64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFH64-NEXT: vmv.x.s a1, v9 -; ZVFH64-NEXT: lui a2, 8 -; ZVFH64-NEXT: addiw a2, a2, -1 -; ZVFH64-NEXT: and a1, a1, a2 +; ZVFH64-NEXT: lui a1, 8 +; ZVFH64-NEXT: vmv.x.s a2, v9 +; ZVFH64-NEXT: addiw a1, a1, -1 ; 
ZVFH64-NEXT: vslidedown.vi v8, v9, 1 +; ZVFH64-NEXT: vslidedown.vi v9, v9, 2 +; ZVFH64-NEXT: and a2, a2, a1 ; ZVFH64-NEXT: vmv.x.s a3, v8 -; ZVFH64-NEXT: and a2, a3, a2 -; ZVFH64-NEXT: slli a2, a2, 15 -; ZVFH64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFH64-NEXT: vmv.x.s a3, v8 +; ZVFH64-NEXT: and a1, a3, a1 +; ZVFH64-NEXT: vmv.x.s a3, v9 ; ZVFH64-NEXT: slli a3, a3, 30 -; ZVFH64-NEXT: or a1, a1, a3 -; ZVFH64-NEXT: or a1, a1, a2 +; ZVFH64-NEXT: slli a1, a1, 15 +; ZVFH64-NEXT: or a2, a2, a3 +; ZVFH64-NEXT: or a1, a2, a1 ; ZVFH64-NEXT: slli a2, a1, 19 ; ZVFH64-NEXT: srli a2, a2, 51 ; ZVFH64-NEXT: sw a1, 0(a0) @@ -170,21 +170,21 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) { ; ZVFHMIN32: # %bb.0: ; ZVFHMIN32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN32-NEXT: vfncvt.rtz.x.f.w v9, v8 +; ZVFHMIN32-NEXT: lui a1, 8 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN32-NEXT: vmv.x.s a1, v8 ; ZVFHMIN32-NEXT: vmv.x.s a2, v9 -; ZVFHMIN32-NEXT: lui a3, 8 -; ZVFHMIN32-NEXT: addi a3, a3, -1 -; ZVFHMIN32-NEXT: and a2, a2, a3 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v9, 1 -; ZVFHMIN32-NEXT: vmv.x.s a4, v8 -; ZVFHMIN32-NEXT: and a3, a4, a3 -; ZVFHMIN32-NEXT: slli a4, a1, 17 +; ZVFHMIN32-NEXT: addi a1, a1, -1 +; ZVFHMIN32-NEXT: vslidedown.vi v9, v9, 1 +; ZVFHMIN32-NEXT: vmv.x.s a3, v8 +; ZVFHMIN32-NEXT: and a2, a2, a1 +; ZVFHMIN32-NEXT: vmv.x.s a4, v9 +; ZVFHMIN32-NEXT: and a1, a4, a1 +; ZVFHMIN32-NEXT: slli a4, a3, 17 +; ZVFHMIN32-NEXT: slli a3, a3, 30 ; ZVFHMIN32-NEXT: srli a4, a4, 19 -; ZVFHMIN32-NEXT: slli a3, a3, 15 -; ZVFHMIN32-NEXT: slli a1, a1, 30 +; ZVFHMIN32-NEXT: slli a1, a1, 15 +; ZVFHMIN32-NEXT: or a2, a2, a3 ; ZVFHMIN32-NEXT: or a1, a2, a1 -; ZVFHMIN32-NEXT: or a1, a1, a3 ; ZVFHMIN32-NEXT: sw a1, 0(a0) ; ZVFHMIN32-NEXT: sh a4, 4(a0) ; ZVFHMIN32-NEXT: ret @@ -193,19 +193,19 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) { ; ZVFHMIN64: # %bb.0: ; ZVFHMIN64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFHMIN64-NEXT: vmv.x.s a1, v9 -; ZVFHMIN64-NEXT: lui a2, 8 -; ZVFHMIN64-NEXT: addiw a2, a2, -1 -; ZVFHMIN64-NEXT: and a1, a1, a2 +; ZVFHMIN64-NEXT: lui a1, 8 +; ZVFHMIN64-NEXT: vmv.x.s a2, v9 +; ZVFHMIN64-NEXT: addiw a1, a1, -1 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v9, 1 +; ZVFHMIN64-NEXT: vslidedown.vi v9, v9, 2 +; ZVFHMIN64-NEXT: and a2, a2, a1 ; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: and a2, a3, a2 -; ZVFHMIN64-NEXT: slli a2, a2, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 +; ZVFHMIN64-NEXT: and a1, a3, a1 +; ZVFHMIN64-NEXT: vmv.x.s a3, v9 ; ZVFHMIN64-NEXT: slli a3, a3, 30 -; ZVFHMIN64-NEXT: or a1, a1, a3 -; ZVFHMIN64-NEXT: or a1, a1, a2 +; ZVFHMIN64-NEXT: slli a1, a1, 15 +; ZVFHMIN64-NEXT: or a2, a2, a3 +; ZVFHMIN64-NEXT: or a1, a2, a1 ; ZVFHMIN64-NEXT: slli a2, a1, 19 ; ZVFHMIN64-NEXT: srli a2, a2, 51 ; ZVFHMIN64-NEXT: sw a1, 0(a0) @@ -221,21 +221,21 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) { ; ZVFH32: # %bb.0: ; ZVFH32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH32-NEXT: vfncvt.rtz.x.f.w v9, v8 +; ZVFH32-NEXT: lui a1, 16 ; ZVFH32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFH32-NEXT: vmv.x.s a1, v8 ; ZVFH32-NEXT: vmv.x.s a2, v9 -; ZVFH32-NEXT: lui a3, 16 -; ZVFH32-NEXT: addi a3, a3, -1 -; ZVFH32-NEXT: and a2, a2, a3 -; ZVFH32-NEXT: vslidedown.vi v8, v9, 1 -; ZVFH32-NEXT: vmv.x.s a4, v8 -; ZVFH32-NEXT: and a3, a4, a3 -; ZVFH32-NEXT: slli a4, a1, 17 +; ZVFH32-NEXT: addi a1, a1, -1 +; ZVFH32-NEXT: vslidedown.vi v9, v9, 1 +; ZVFH32-NEXT: vmv.x.s a3, v8 +; ZVFH32-NEXT: and a2, a2, a1 +; 
ZVFH32-NEXT: vmv.x.s a4, v9 +; ZVFH32-NEXT: and a1, a4, a1 +; ZVFH32-NEXT: slli a4, a3, 17 +; ZVFH32-NEXT: slli a3, a3, 30 ; ZVFH32-NEXT: srli a4, a4, 19 -; ZVFH32-NEXT: slli a3, a3, 15 -; ZVFH32-NEXT: slli a1, a1, 30 +; ZVFH32-NEXT: slli a1, a1, 15 +; ZVFH32-NEXT: or a2, a2, a3 ; ZVFH32-NEXT: or a1, a2, a1 -; ZVFH32-NEXT: or a1, a1, a3 ; ZVFH32-NEXT: sw a1, 0(a0) ; ZVFH32-NEXT: sh a4, 4(a0) ; ZVFH32-NEXT: ret @@ -244,19 +244,19 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) { ; ZVFH64: # %bb.0: ; ZVFH64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFH64-NEXT: vmv.x.s a1, v9 -; ZVFH64-NEXT: lui a2, 16 -; ZVFH64-NEXT: addiw a2, a2, -1 -; ZVFH64-NEXT: and a1, a1, a2 +; ZVFH64-NEXT: lui a1, 16 +; ZVFH64-NEXT: vmv.x.s a2, v9 +; ZVFH64-NEXT: addiw a1, a1, -1 ; ZVFH64-NEXT: vslidedown.vi v8, v9, 1 +; ZVFH64-NEXT: vslidedown.vi v9, v9, 2 +; ZVFH64-NEXT: and a2, a2, a1 ; ZVFH64-NEXT: vmv.x.s a3, v8 -; ZVFH64-NEXT: and a2, a3, a2 -; ZVFH64-NEXT: slli a2, a2, 15 -; ZVFH64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFH64-NEXT: vmv.x.s a3, v8 +; ZVFH64-NEXT: and a1, a3, a1 +; ZVFH64-NEXT: vmv.x.s a3, v9 ; ZVFH64-NEXT: slli a3, a3, 30 -; ZVFH64-NEXT: or a1, a1, a3 -; ZVFH64-NEXT: or a1, a1, a2 +; ZVFH64-NEXT: slli a1, a1, 15 +; ZVFH64-NEXT: or a2, a2, a3 +; ZVFH64-NEXT: or a1, a2, a1 ; ZVFH64-NEXT: slli a2, a1, 19 ; ZVFH64-NEXT: srli a2, a2, 51 ; ZVFH64-NEXT: sw a1, 0(a0) @@ -267,21 +267,21 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) { ; ZVFHMIN32: # %bb.0: ; ZVFHMIN32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN32-NEXT: vfncvt.rtz.x.f.w v9, v8 +; ZVFHMIN32-NEXT: lui a1, 16 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN32-NEXT: vmv.x.s a1, v8 ; ZVFHMIN32-NEXT: vmv.x.s a2, v9 -; ZVFHMIN32-NEXT: lui a3, 16 -; ZVFHMIN32-NEXT: addi a3, a3, -1 -; ZVFHMIN32-NEXT: and a2, a2, a3 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v9, 1 -; ZVFHMIN32-NEXT: vmv.x.s a4, v8 -; ZVFHMIN32-NEXT: and a3, a4, a3 -; ZVFHMIN32-NEXT: slli a4, a1, 17 +; ZVFHMIN32-NEXT: addi a1, a1, -1 +; ZVFHMIN32-NEXT: vslidedown.vi v9, v9, 1 +; ZVFHMIN32-NEXT: vmv.x.s a3, v8 +; ZVFHMIN32-NEXT: and a2, a2, a1 +; ZVFHMIN32-NEXT: vmv.x.s a4, v9 +; ZVFHMIN32-NEXT: and a1, a4, a1 +; ZVFHMIN32-NEXT: slli a4, a3, 17 +; ZVFHMIN32-NEXT: slli a3, a3, 30 ; ZVFHMIN32-NEXT: srli a4, a4, 19 -; ZVFHMIN32-NEXT: slli a3, a3, 15 -; ZVFHMIN32-NEXT: slli a1, a1, 30 +; ZVFHMIN32-NEXT: slli a1, a1, 15 +; ZVFHMIN32-NEXT: or a2, a2, a3 ; ZVFHMIN32-NEXT: or a1, a2, a1 -; ZVFHMIN32-NEXT: or a1, a1, a3 ; ZVFHMIN32-NEXT: sw a1, 0(a0) ; ZVFHMIN32-NEXT: sh a4, 4(a0) ; ZVFHMIN32-NEXT: ret @@ -290,19 +290,19 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) { ; ZVFHMIN64: # %bb.0: ; ZVFHMIN64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; ZVFHMIN64-NEXT: vmv.x.s a1, v9 -; ZVFHMIN64-NEXT: lui a2, 16 -; ZVFHMIN64-NEXT: addiw a2, a2, -1 -; ZVFHMIN64-NEXT: and a1, a1, a2 +; ZVFHMIN64-NEXT: lui a1, 16 +; ZVFHMIN64-NEXT: vmv.x.s a2, v9 +; ZVFHMIN64-NEXT: addiw a1, a1, -1 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v9, 1 +; ZVFHMIN64-NEXT: vslidedown.vi v9, v9, 2 +; ZVFHMIN64-NEXT: and a2, a2, a1 ; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: and a2, a3, a2 -; ZVFHMIN64-NEXT: slli a2, a2, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v9, 2 -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 +; ZVFHMIN64-NEXT: and a1, a3, a1 +; ZVFHMIN64-NEXT: vmv.x.s a3, v9 ; ZVFHMIN64-NEXT: slli a3, a3, 30 -; ZVFHMIN64-NEXT: or a1, a1, a3 -; ZVFHMIN64-NEXT: or a1, a1, a2 +; ZVFHMIN64-NEXT: slli a1, a1, 15 +; ZVFHMIN64-NEXT: or 
a2, a2, a3 +; ZVFHMIN64-NEXT: or a1, a2, a1 ; ZVFHMIN64-NEXT: slli a2, a1, 19 ; ZVFHMIN64-NEXT: srli a2, a2, 51 ; ZVFHMIN64-NEXT: sw a1, 0(a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll index 131fa53b35999..be32c033fe373 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll @@ -125,10 +125,10 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: round_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -150,10 +150,10 @@ define <1 x float> @round_v1f32(<1 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -173,10 +173,10 @@ define <2 x float> @round_v2f32(<2 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -196,10 +196,10 @@ define <4 x float> @round_v4f32(<4 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -219,10 +219,10 @@ define <8 x float> @round_v8f32(<8 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -242,10 +242,10 @@ define <16 x float> @round_v16f32(<16 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll index b21be367f8ef5..774ce5c7859c9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll @@ -30,9 +30,9 @@ define <1 x half> @round_v1f16(<1 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -69,9 +69,9 @@ define <2 x half> @round_v2f16(<2 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -108,9 +108,9 @@ define <4 x half> @round_v4f16(<4 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -147,9 +147,9 @@ define <8 x half> @round_v8f16(<8 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -186,9 +186,9 @@ define <16 x half> @round_v16f16(<16 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -225,12 +225,12 @@ define <32 x half> @round_v32f16(<32 x half> %x) { ; ZVFHMIN-LABEL: round_v32f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: li a0, 32 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll index 37f2e59ad7516..5c0279e133dfa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll @@ -125,10 +125,10 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) 
-; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -150,10 +150,10 @@ define <1 x float> @roundeven_v1f32(<1 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -173,10 +173,10 @@ define <2 x float> @roundeven_v2f32(<2 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -196,10 +196,10 @@ define <4 x float> @roundeven_v4f32(<4 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -219,10 +219,10 @@ define <8 x float> @roundeven_v8f32(<8 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -242,10 +242,10 @@ define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll index 13d62bb24441c..0b6baad127643 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll @@ -30,9 +30,9 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -69,9 +69,9 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -108,9 +108,9 @@ define <4 x half> @roundeven_v4f16(<4 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -147,9 +147,9 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -186,9 +186,9 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -225,12 +225,12 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) { ; ZVFHMIN-LABEL: roundeven_v32f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: li a0, 32 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll index 86abfb771162f..a68dc11f3d21e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll @@ -674,10 +674,10 @@ define <16 x i64> @fshr_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vand.vx v8, v24, a0, v0.t @@ -710,11 +710,11 @@ define <16 x i64> @fshl_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, 
ta, ma ; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vand.vx v8, v24, a0, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll index b911722368ce3..2173887e85417 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll @@ -113,10 +113,10 @@ define <32 x half> @trunc_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: trunc_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -136,10 +136,10 @@ define <1 x float> @trunc_v1f32(<1 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -157,10 +157,10 @@ define <2 x float> @trunc_v2f32(<2 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -178,10 +178,10 @@ define <4 x float> @trunc_v4f32(<4 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -199,10 +199,10 @@ define <8 x float> @trunc_v8f32(<8 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t @@ -220,10 +220,10 @@ define <16 x float> @trunc_v16f32(<16 x float> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v 
v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll index f7737784d4ca5..986636d974aca 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll @@ -19,8 +19,8 @@ define <1 x i1> @insertelt_idx_v1i1(<1 x i1> %x, i1 %elt, i32 zeroext %idx) noun ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: addi a2, a1, 1 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a2, e8, mf8, tu, ma ; CHECK-NEXT: vslideup.vx v8, v9, a1 @@ -52,8 +52,8 @@ define <2 x i1> @insertelt_idx_v2i1(<2 x i1> %x, i1 %elt, i32 zeroext %idx) noun ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: addi a2, a1, 1 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a2, e8, mf8, tu, ma ; CHECK-NEXT: vslideup.vx v8, v9, a1 @@ -87,8 +87,8 @@ define <8 x i1> @insertelt_idx_v8i1(<8 x i1> %x, i1 %elt, i32 zeroext %idx) noun ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: addi a2, a1, 1 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a2, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vx v8, v9, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll index 5581754b0721a..1fbc8dfd688c4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll @@ -756,15 +756,15 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; RV32VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32VLA-NEXT: vle64.v v8, (a0) ; RV32VLA-NEXT: addi a0, sp, 128 +; RV32VLA-NEXT: csrr a2, vlenb +; RV32VLA-NEXT: addi a3, sp, 64 +; RV32VLA-NEXT: slli a2, a2, 3 ; RV32VLA-NEXT: vse64.v v8, (a0) -; RV32VLA-NEXT: csrr a0, vlenb -; RV32VLA-NEXT: slli a0, a0, 3 -; RV32VLA-NEXT: addi a2, sp, 64 -; RV32VLA-NEXT: add a3, a2, a0 -; RV32VLA-NEXT: vl8re64.v v8, (a3) -; RV32VLA-NEXT: vl8re64.v v16, (a2) -; RV32VLA-NEXT: add a0, a1, a0 -; RV32VLA-NEXT: vs8r.v v8, (a0) +; RV32VLA-NEXT: add a0, a3, a2 +; RV32VLA-NEXT: vl8re64.v v8, (a0) +; RV32VLA-NEXT: vl8re64.v v16, (a3) +; RV32VLA-NEXT: add a2, a1, a2 +; RV32VLA-NEXT: vs8r.v v8, (a2) ; RV32VLA-NEXT: vs8r.v v16, (a1) ; RV32VLA-NEXT: addi sp, s0, -80 ; RV32VLA-NEXT: .cfi_def_cfa sp, 80 @@ -793,15 +793,15 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; RV64VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64VLA-NEXT: vle64.v v8, (a0) ; RV64VLA-NEXT: addi a0, sp, 128 +; RV64VLA-NEXT: csrr a2, vlenb +; RV64VLA-NEXT: addi a3, sp, 64 +; RV64VLA-NEXT: slli a2, a2, 3 ; RV64VLA-NEXT: vse64.v v8, (a0) -; RV64VLA-NEXT: csrr a0, vlenb -; RV64VLA-NEXT: slli a0, a0, 3 -; RV64VLA-NEXT: addi a2, sp, 64 -; RV64VLA-NEXT: add a3, a2, a0 -; RV64VLA-NEXT: vl8re64.v v8, (a3) -; RV64VLA-NEXT: vl8re64.v v16, (a2) -; RV64VLA-NEXT: add a0, a1, a0 -; RV64VLA-NEXT: vs8r.v v8, (a0) +; RV64VLA-NEXT: add a0, a3, a2 +; RV64VLA-NEXT: vl8re64.v v8, (a0) +; RV64VLA-NEXT: vl8re64.v 
v16, (a3) +; RV64VLA-NEXT: add a2, a1, a2 +; RV64VLA-NEXT: vs8r.v v8, (a2) ; RV64VLA-NEXT: vs8r.v v16, (a1) ; RV64VLA-NEXT: addi sp, s0, -80 ; RV64VLA-NEXT: .cfi_def_cfa sp, 80 @@ -828,9 +828,9 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; RV32VLS-NEXT: vl1re64.v v8, (a0) ; RV32VLS-NEXT: addi a0, sp, 128 ; RV32VLS-NEXT: vs1r.v v8, (a0) +; RV32VLS-NEXT: addi a0, sp, 192 +; RV32VLS-NEXT: vl8re64.v v8, (a0) ; RV32VLS-NEXT: addi a0, sp, 64 -; RV32VLS-NEXT: addi a2, sp, 192 -; RV32VLS-NEXT: vl8re64.v v8, (a2) ; RV32VLS-NEXT: vl8re64.v v16, (a0) ; RV32VLS-NEXT: addi a0, a1, 128 ; RV32VLS-NEXT: vs8r.v v8, (a0) @@ -860,9 +860,9 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; RV64VLS-NEXT: vl1re64.v v8, (a0) ; RV64VLS-NEXT: addi a0, sp, 128 ; RV64VLS-NEXT: vs1r.v v8, (a0) +; RV64VLS-NEXT: addi a0, sp, 192 +; RV64VLS-NEXT: vl8re64.v v8, (a0) ; RV64VLS-NEXT: addi a0, sp, 64 -; RV64VLS-NEXT: addi a2, sp, 192 -; RV64VLS-NEXT: vl8re64.v v8, (a2) ; RV64VLS-NEXT: vl8re64.v v16, (a0) ; RV64VLS-NEXT: addi a0, a1, 128 ; RV64VLS-NEXT: vs8r.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll index 0ff3641483ddb..6782b2003ba94 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -122,14 +122,14 @@ define <64 x i32> @insertelt_v64i32_idx(<64 x i32> %a, i32 %y, i32 zeroext %idx) ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -128 ; RV32-NEXT: andi a1, a1, 63 -; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: mv a2, sp -; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: addi a3, sp, 128 ; RV32-NEXT: li a4, 32 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma ; RV32-NEXT: vse32.v v16, (a3) ; RV32-NEXT: vse32.v v8, (a2) +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: sw a0, 0(a1) ; RV32-NEXT: vle32.v v8, (a2) ; RV32-NEXT: vle32.v v16, (a3) @@ -155,14 +155,14 @@ define <64 x i32> @insertelt_v64i32_idx(<64 x i32> %a, i32 %y, i32 zeroext %idx) ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -128 ; RV64-NEXT: andi a1, a1, 63 -; RV64-NEXT: slli a1, a1, 2 ; RV64-NEXT: mv a2, sp -; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: addi a3, sp, 128 ; RV64-NEXT: li a4, 32 +; RV64-NEXT: slli a1, a1, 2 ; RV64-NEXT: vsetvli zero, a4, e32, m8, ta, ma ; RV64-NEXT: vse32.v v16, (a3) ; RV64-NEXT: vse32.v v8, (a2) +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: sw a0, 0(a1) ; RV64-NEXT: vle32.v v8, (a2) ; RV64-NEXT: vle32.v v16, (a3) @@ -228,17 +228,17 @@ define <3 x i64> @insertelt_v3i64(<3 x i64> %a, i64 %y) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32-NEXT: vslidedown.vi v9, v8, 3 -; RV32-NEXT: vmv.x.s a2, v9 -; RV32-NEXT: vslidedown.vi v9, v8, 2 +; RV32-NEXT: vslidedown.vi v10, v8, 2 +; RV32-NEXT: vmv.x.s a2, v8 +; RV32-NEXT: vslidedown.vi v8, v8, 1 ; RV32-NEXT: vmv.x.s a3, v9 -; RV32-NEXT: vslidedown.vi v9, v8, 1 -; RV32-NEXT: vmv.x.s a4, v9 +; RV32-NEXT: vmv.x.s a4, v10 ; RV32-NEXT: vmv.x.s a5, v8 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v8, a5 +; RV32-NEXT: vmv.v.x v8, a2 +; RV32-NEXT: vslide1down.vx v8, v8, a5 ; RV32-NEXT: vslide1down.vx v8, v8, a4 ; RV32-NEXT: vslide1down.vx v8, v8, a3 -; RV32-NEXT: vslide1down.vx v8, v8, a2 ; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: vslidedown.vi v8, v8, 2 @@ -248,11 +248,11 @@ define <3 x i64> @insertelt_v3i64(<3 x i64> %a, i64 %y) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; 
RV64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-NEXT: vmv.x.s a1, v9 -; RV64-NEXT: vmv.x.s a2, v8 +; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: vmv.x.s a2, v9 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a1 +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vslide1down.vx v8, v8, a2 ; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: vslidedown.vi v8, v8, 1 ; RV64-NEXT: ret @@ -313,9 +313,9 @@ define <32 x i16> @insertelt_v32i16(<32 x i16> %a, i16 %y, i32 %idx) { ; RV64-LABEL: insertelt_v32i16: ; RV64: # %bb.0: ; RV64-NEXT: li a2, 32 +; RV64-NEXT: slli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a2, e16, m1, ta, ma ; RV64-NEXT: vmv.s.x v12, a0 -; RV64-NEXT: slli a1, a1, 32 ; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: addi a0, a1, 1 ; RV64-NEXT: vsetvli zero, a0, e16, m4, tu, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index 49644fa4a9fb8..e9fd0a19e3eb6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -697,12 +697,11 @@ define void @buildvec_seq_v9i8(ptr %x) { ; CHECK-LABEL: buildvec_seq_v9i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 73 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v9, 3 -; CHECK-NEXT: li a1, 146 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: li a1, 146 ; CHECK-NEXT: vmv.s.x v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 @@ -728,49 +727,27 @@ define void @buildvec_seq_v4i16_v2i32(ptr %x) { } define void @buildvec_vid_step1o2_v4i32(ptr %z0, ptr %z1, ptr %z2, ptr %z3, ptr %z4, ptr %z5, ptr %z6) { -; RV32-LABEL: buildvec_vid_step1o2_v4i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vid.v v8 -; RV32-NEXT: vsrl.vi v8, v8, 1 -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: vse32.v v8, (a1) -; RV32-NEXT: vmv.v.i v9, 1 -; RV32-NEXT: vse32.v v8, (a2) -; RV32-NEXT: vse32.v v8, (a3) -; RV32-NEXT: vse32.v v8, (a4) -; RV32-NEXT: vmv.s.x v8, zero -; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV32-NEXT: vslideup.vi v9, v8, 1 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vse32.v v9, (a5) -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vslide1down.vx v8, v8, a0 -; RV32-NEXT: vse32.v v8, (a6) -; RV32-NEXT: ret -; -; RV64-LABEL: buildvec_vid_step1o2_v4i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vid.v v8 -; RV64-NEXT: vsrl.vi v8, v8, 1 -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: vmv.v.i v9, 1 -; RV64-NEXT: vse32.v v8, (a1) -; RV64-NEXT: vse32.v v8, (a2) -; RV64-NEXT: vse32.v v8, (a3) -; RV64-NEXT: vse32.v v8, (a4) -; RV64-NEXT: vmv.s.x v8, zero -; RV64-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV64-NEXT: vslideup.vi v9, v8, 1 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vse32.v v9, (a5) -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vslide1down.vx v8, v8, a0 -; RV64-NEXT: vse32.v v8, (a6) -; RV64-NEXT: ret +; CHECK-LABEL: buildvec_vid_step1o2_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 1 +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vid.v v9 +; 
CHECK-NEXT: vsrl.vi v9, v9, 1 +; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: vse32.v v9, (a1) +; CHECK-NEXT: vse32.v v9, (a2) +; CHECK-NEXT: vse32.v v9, (a3) +; CHECK-NEXT: vse32.v v9, (a4) +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: vslide1down.vx v9, v9, a0 +; CHECK-NEXT: vse32.v v8, (a5) +; CHECK-NEXT: vse32.v v9, (a6) +; CHECK-NEXT: ret store <4 x i32> , ptr %z0 store <4 x i32> , ptr %z1 store <4 x i32> , ptr %z2 @@ -788,22 +765,22 @@ define void @buildvec_vid_step1o2_add3_v4i16(ptr %z0, ptr %z1, ptr %z2, ptr %z3, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vmv.v.i v9, 3 ; CHECK-NEXT: vsrl.vi v8, v8, 1 ; CHECK-NEXT: vadd.vi v8, v8, 3 ; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: vmv.v.i v9, 3 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: vse16.v v8, (a2) ; CHECK-NEXT: vse16.v v8, (a3) ; CHECK-NEXT: vse16.v v8, (a4) ; CHECK-NEXT: vmv.v.i v8, 4 +; CHECK-NEXT: li a0, 4 ; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslide1down.vx v9, v9, a0 ; CHECK-NEXT: vse16.v v8, (a5) -; CHECK-NEXT: li a0, 4 -; CHECK-NEXT: vslide1down.vx v8, v9, a0 -; CHECK-NEXT: vse16.v v8, (a6) +; CHECK-NEXT: vse16.v v9, (a6) ; CHECK-NEXT: ret store <4 x i16> , ptr %z0 store <4 x i16> , ptr %z1 @@ -997,21 +974,19 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16, ; RV32-NEXT: vsetivli zero, 16, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.i v0, 15 ; RV32-NEXT: vmv.v.i v9, 0 -; RV32-NEXT: vmerge.vim v10, v9, -1, v0 ; RV32-NEXT: li a0, 512 -; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; RV32-NEXT: vmv.v.i v12, 3 ; RV32-NEXT: li a1, 240 -; RV32-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: li a1, 15 +; RV32-NEXT: vmerge.vim v10, v9, -1, v0 +; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV32-NEXT: vmv.v.i v12, 3 +; RV32-NEXT: slli a1, a1, 8 ; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vsetvli zero, zero, e8, m4, ta, ma ; RV32-NEXT: vmerge.vim v12, v12, 0, v0 ; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: vsetivli zero, 16, e32, mf2, ta, ma ; RV32-NEXT: vmerge.vim v10, v9, -1, v0 -; RV32-NEXT: li a1, 15 -; RV32-NEXT: slli a1, a1, 8 ; RV32-NEXT: vmv.s.x v8, a1 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma @@ -1029,19 +1004,17 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16, ; RV64V-NEXT: vsetivli zero, 8, e64, m1, ta, ma ; RV64V-NEXT: vmv.v.i v0, 3 ; RV64V-NEXT: vmv.v.i v9, 0 -; RV64V-NEXT: vmerge.vim v10, v9, -1, v0 ; RV64V-NEXT: li a0, 512 +; RV64V-NEXT: vmv.v.i v8, 12 +; RV64V-NEXT: li a1, 48 +; RV64V-NEXT: vmerge.vim v10, v9, -1, v0 ; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; RV64V-NEXT: vmv.v.i v12, 3 -; RV64V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV64V-NEXT: vmv.v.i v8, 12 ; RV64V-NEXT: vmv1r.v v0, v10 -; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; RV64V-NEXT: vmerge.vim v12, v12, 0, v0 ; RV64V-NEXT: vmv1r.v v0, v8 ; RV64V-NEXT: vsetivli zero, 8, e64, m1, ta, ma ; RV64V-NEXT: vmerge.vim v10, v9, -1, v0 -; RV64V-NEXT: li a1, 48 ; RV64V-NEXT: vmv.s.x v8, a1 ; RV64V-NEXT: vmv.v.v v0, v10 ; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma @@ -1059,21 +1032,19 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16, ; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m1, ta, ma ; RV64ZVE32-NEXT: vmv.v.i v0, 15 ; RV64ZVE32-NEXT: vmv.v.i v9, 0 -; RV64ZVE32-NEXT: vmerge.vim v10, v9, -1, v0 ; 
RV64ZVE32-NEXT: li a0, 512 -; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; RV64ZVE32-NEXT: vmv.v.i v12, 3 ; RV64ZVE32-NEXT: li a1, 240 -; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; RV64ZVE32-NEXT: vmv.s.x v8, a1 +; RV64ZVE32-NEXT: li a1, 15 +; RV64ZVE32-NEXT: vmerge.vim v10, v9, -1, v0 +; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV64ZVE32-NEXT: vmv.v.i v12, 3 +; RV64ZVE32-NEXT: slli a1, a1, 8 ; RV64ZVE32-NEXT: vmv1r.v v0, v10 -; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m4, ta, ma ; RV64ZVE32-NEXT: vmerge.vim v12, v12, 0, v0 ; RV64ZVE32-NEXT: vmv1r.v v0, v8 ; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m1, ta, ma ; RV64ZVE32-NEXT: vmerge.vim v10, v9, -1, v0 -; RV64ZVE32-NEXT: li a1, 15 -; RV64ZVE32-NEXT: slli a1, a1, 8 ; RV64ZVE32-NEXT: vmv.s.x v8, a1 ; RV64ZVE32-NEXT: vmv.v.v v0, v10 ; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma @@ -1091,9 +1062,10 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16, define <8 x i32> @prefix_overwrite(<8 x i32> %vin, i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: prefix_overwrite: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetivli zero, 8, e32, m1, tu, ma ; CHECK-NEXT: vmv.s.x v10, a1 +; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; CHECK-NEXT: vslideup.vi v8, v10, 1 ; CHECK-NEXT: vmv.s.x v10, a2 ; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma @@ -1137,12 +1109,12 @@ define <4 x i64> @v4xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d) vscale_range(2,2) ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a5 +; RV32-NEXT: vslide1down.vx v10, v9, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a6 ; RV32-NEXT: vslide1down.vx v9, v8, a7 -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: vslide1down.vx v8, v10, a2 ; RV32-NEXT: vslide1down.vx v8, v8, a3 ; RV32-NEXT: ret ; @@ -1186,13 +1158,13 @@ define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i ; RV32-NEXT: lw s0, 16(sp) ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a5 +; RV32-NEXT: vslide1down.vx v9, v9, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a6 +; RV32-NEXT: vslide1down.vx v10, v9, a2 ; RV32-NEXT: vslide1down.vx v9, v8, a7 -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v8, v10, a3 ; RV32-NEXT: vmv.v.x v10, s0 ; RV32-NEXT: vslide1down.vx v10, v10, t6 ; RV32-NEXT: vslide1down.vx v10, v10, t5 @@ -1211,9 +1183,9 @@ define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i ; RV64V: # %bb.0: ; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64V-NEXT: vmv.v.x v8, a2 +; RV64V-NEXT: vmv.v.x v10, a0 ; RV64V-NEXT: vslide1down.vx v9, v8, a3 -; RV64V-NEXT: vmv.v.x v8, a0 -; RV64V-NEXT: vslide1down.vx v8, v8, a1 +; RV64V-NEXT: vslide1down.vx v8, v10, a1 ; RV64V-NEXT: vmv.v.x v10, a4 ; RV64V-NEXT: vslide1down.vx v10, v10, a5 ; RV64V-NEXT: vmv.v.x v11, a6 @@ -1248,12 +1220,12 @@ define <8 x i64> @v8xi64_exact_equal_halves(i64 %a, i64 %b, i64 %c, i64 %d) vsca ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a5 +; RV32-NEXT: 
vslide1down.vx v10, v9, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a6 ; RV32-NEXT: vslide1down.vx v9, v8, a7 -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: vslide1down.vx v8, v10, a2 ; RV32-NEXT: vslide1down.vx v8, v8, a3 ; RV32-NEXT: vmv.v.v v10, v8 ; RV32-NEXT: vmv.v.v v11, v9 @@ -1297,12 +1269,12 @@ define <8 x i64> @v8xi64_exact_undef_suffix(i64 %a, i64 %b, i64 %c, i64 %d) vsca ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a5 +; RV32-NEXT: vslide1down.vx v10, v9, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a6 ; RV32-NEXT: vslide1down.vx v9, v8, a7 -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: vslide1down.vx v8, v10, a2 ; RV32-NEXT: vslide1down.vx v8, v8, a3 ; RV32-NEXT: ret ; @@ -1334,12 +1306,12 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a5 +; RV32-NEXT: vslide1down.vx v9, v9, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a6 ; RV32-NEXT: vslide1down.vx v11, v8, a7 -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: vslide1down.vx v8, v9, a2 ; RV32-NEXT: vslide1down.vx v10, v8, a3 ; RV32-NEXT: ret ; @@ -1386,32 +1358,31 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV32-ONLY-NEXT: lbu t2, 9(a0) ; RV32-ONLY-NEXT: lbu t3, 10(a0) ; RV32-ONLY-NEXT: lbu t4, 11(a0) +; RV32-ONLY-NEXT: li t5, 255 +; RV32-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-ONLY-NEXT: vmv.s.x v0, t5 ; RV32-ONLY-NEXT: lbu t5, 12(a0) ; RV32-ONLY-NEXT: lbu t6, 13(a0) ; RV32-ONLY-NEXT: lbu s0, 14(a0) ; RV32-ONLY-NEXT: lbu a0, 15(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV32-ONLY-NEXT: vmv.v.x v8, a1 +; RV32-ONLY-NEXT: vmv.v.x v9, t1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t2 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t3 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t4 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a5 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t5 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a6 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t6 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV32-ONLY-NEXT: vslide1down.vx v9, v8, t0 -; RV32-ONLY-NEXT: vmv.v.x v8, t1 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t2 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t3 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t4 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t5 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t6 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, s0 -; RV32-ONLY-NEXT: li a1, 255 -; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-ONLY-NEXT: vmv.s.x v0, a1 -; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, s0 +; RV32-ONLY-NEXT: vslide1down.vx v10, v8, t0 +; RV32-ONLY-NEXT: vslide1down.vx v8, v9, a0 +; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV32-ONLY-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-ONLY-NEXT: .cfi_restore s0 ; RV32-ONLY-NEXT: addi sp, sp, 16 
@@ -1420,45 +1391,45 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; ; RV32VB-LABEL: buildvec_v16i8_loads_contigous: ; RV32VB: # %bb.0: -; RV32VB-NEXT: lbu a1, 1(a0) -; RV32VB-NEXT: lbu a2, 0(a0) +; RV32VB-NEXT: lbu a1, 0(a0) +; RV32VB-NEXT: lbu a2, 1(a0) ; RV32VB-NEXT: lbu a3, 2(a0) ; RV32VB-NEXT: lbu a4, 3(a0) -; RV32VB-NEXT: slli a1, a1, 8 -; RV32VB-NEXT: or a1, a2, a1 +; RV32VB-NEXT: lbu a5, 4(a0) +; RV32VB-NEXT: lbu a6, 5(a0) +; RV32VB-NEXT: lbu a7, 6(a0) +; RV32VB-NEXT: lbu t0, 7(a0) +; RV32VB-NEXT: slli a2, a2, 8 ; RV32VB-NEXT: slli a3, a3, 16 ; RV32VB-NEXT: slli a4, a4, 24 +; RV32VB-NEXT: slli a6, a6, 8 +; RV32VB-NEXT: or a1, a1, a2 ; RV32VB-NEXT: or a3, a4, a3 -; RV32VB-NEXT: lbu a2, 4(a0) -; RV32VB-NEXT: lbu a4, 5(a0) -; RV32VB-NEXT: or a1, a1, a3 -; RV32VB-NEXT: lbu a3, 6(a0) -; RV32VB-NEXT: lbu a5, 7(a0) -; RV32VB-NEXT: slli a4, a4, 8 -; RV32VB-NEXT: or a2, a2, a4 -; RV32VB-NEXT: slli a3, a3, 16 -; RV32VB-NEXT: slli a5, a5, 24 -; RV32VB-NEXT: or a3, a5, a3 +; RV32VB-NEXT: or a2, a5, a6 ; RV32VB-NEXT: lbu a4, 8(a0) ; RV32VB-NEXT: lbu a5, 9(a0) -; RV32VB-NEXT: or a2, a2, a3 -; RV32VB-NEXT: lbu a3, 10(a0) -; RV32VB-NEXT: lbu a6, 11(a0) +; RV32VB-NEXT: lbu a6, 10(a0) +; RV32VB-NEXT: lbu t1, 11(a0) +; RV32VB-NEXT: slli a7, a7, 16 +; RV32VB-NEXT: slli t0, t0, 24 ; RV32VB-NEXT: slli a5, a5, 8 +; RV32VB-NEXT: slli a6, a6, 16 +; RV32VB-NEXT: slli t1, t1, 24 +; RV32VB-NEXT: or a7, t0, a7 ; RV32VB-NEXT: or a4, a4, a5 -; RV32VB-NEXT: slli a3, a3, 16 -; RV32VB-NEXT: slli a6, a6, 24 -; RV32VB-NEXT: or a3, a6, a3 ; RV32VB-NEXT: lbu a5, 12(a0) -; RV32VB-NEXT: lbu a6, 13(a0) -; RV32VB-NEXT: or a3, a4, a3 -; RV32VB-NEXT: lbu a4, 14(a0) +; RV32VB-NEXT: lbu t0, 13(a0) +; RV32VB-NEXT: or a6, t1, a6 +; RV32VB-NEXT: lbu t1, 14(a0) ; RV32VB-NEXT: lbu a0, 15(a0) -; RV32VB-NEXT: slli a6, a6, 8 -; RV32VB-NEXT: or a5, a5, a6 -; RV32VB-NEXT: slli a4, a4, 16 +; RV32VB-NEXT: slli t0, t0, 8 +; RV32VB-NEXT: or a5, a5, t0 +; RV32VB-NEXT: slli t1, t1, 16 ; RV32VB-NEXT: slli a0, a0, 24 -; RV32VB-NEXT: or a0, a0, a4 +; RV32VB-NEXT: or a0, a0, t1 +; RV32VB-NEXT: or a1, a1, a3 +; RV32VB-NEXT: or a2, a2, a7 +; RV32VB-NEXT: or a3, a4, a6 ; RV32VB-NEXT: or a0, a5, a0 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a1 @@ -1473,34 +1444,34 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV32VB-PACK-NEXT: lbu a2, 1(a0) ; RV32VB-PACK-NEXT: lbu a3, 2(a0) ; RV32VB-PACK-NEXT: lbu a4, 3(a0) +; RV32VB-PACK-NEXT: lbu a5, 4(a0) +; RV32VB-PACK-NEXT: lbu a6, 5(a0) +; RV32VB-PACK-NEXT: lbu a7, 6(a0) +; RV32VB-PACK-NEXT: lbu t0, 7(a0) ; RV32VB-PACK-NEXT: packh a1, a1, a2 -; RV32VB-PACK-NEXT: lbu a2, 4(a0) -; RV32VB-PACK-NEXT: lbu a5, 5(a0) -; RV32VB-PACK-NEXT: lbu a6, 6(a0) -; RV32VB-PACK-NEXT: lbu a7, 7(a0) +; RV32VB-PACK-NEXT: lbu a2, 8(a0) +; RV32VB-PACK-NEXT: lbu t1, 9(a0) +; RV32VB-PACK-NEXT: lbu t2, 10(a0) +; RV32VB-PACK-NEXT: lbu t3, 11(a0) ; RV32VB-PACK-NEXT: packh a3, a3, a4 -; RV32VB-PACK-NEXT: pack a1, a1, a3 -; RV32VB-PACK-NEXT: packh a2, a2, a5 -; RV32VB-PACK-NEXT: packh a3, a6, a7 -; RV32VB-PACK-NEXT: lbu a4, 8(a0) -; RV32VB-PACK-NEXT: lbu a5, 9(a0) -; RV32VB-PACK-NEXT: pack a2, a2, a3 -; RV32VB-PACK-NEXT: lbu a3, 10(a0) -; RV32VB-PACK-NEXT: lbu a6, 11(a0) -; RV32VB-PACK-NEXT: packh a4, a4, a5 -; RV32VB-PACK-NEXT: lbu a5, 12(a0) +; RV32VB-PACK-NEXT: packh a4, a5, a6 +; RV32VB-PACK-NEXT: packh a5, a7, t0 +; RV32VB-PACK-NEXT: lbu a6, 12(a0) ; RV32VB-PACK-NEXT: lbu a7, 13(a0) ; RV32VB-PACK-NEXT: lbu t0, 14(a0) ; RV32VB-PACK-NEXT: lbu a0, 15(a0) -; 
RV32VB-PACK-NEXT: packh a3, a3, a6 -; RV32VB-PACK-NEXT: pack a3, a4, a3 -; RV32VB-PACK-NEXT: packh a4, a5, a7 +; RV32VB-PACK-NEXT: packh a2, a2, t1 +; RV32VB-PACK-NEXT: packh t1, t2, t3 +; RV32VB-PACK-NEXT: packh a6, a6, a7 ; RV32VB-PACK-NEXT: packh a0, t0, a0 -; RV32VB-PACK-NEXT: pack a0, a4, a0 +; RV32VB-PACK-NEXT: pack a1, a1, a3 +; RV32VB-PACK-NEXT: pack a3, a4, a5 +; RV32VB-PACK-NEXT: pack a2, a2, t1 +; RV32VB-PACK-NEXT: pack a0, a6, a0 ; RV32VB-PACK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-PACK-NEXT: vmv.v.x v8, a1 -; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a2 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a3 +; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a2 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-PACK-NEXT: ret ; @@ -1522,32 +1493,31 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV64V-ONLY-NEXT: lbu t2, 9(a0) ; RV64V-ONLY-NEXT: lbu t3, 10(a0) ; RV64V-ONLY-NEXT: lbu t4, 11(a0) +; RV64V-ONLY-NEXT: li t5, 255 +; RV64V-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.s.x v0, t5 ; RV64V-ONLY-NEXT: lbu t5, 12(a0) ; RV64V-ONLY-NEXT: lbu t6, 13(a0) ; RV64V-ONLY-NEXT: lbu s0, 14(a0) ; RV64V-ONLY-NEXT: lbu a0, 15(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vmv.v.x v8, a1 +; RV64V-ONLY-NEXT: vmv.v.x v9, t1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t2 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t3 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t4 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a5 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t5 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a6 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t6 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v8, t0 -; RV64V-ONLY-NEXT: vmv.v.x v8, t1 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t2 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t3 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t4 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t5 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t6 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, s0 -; RV64V-ONLY-NEXT: li a1, 255 -; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64V-ONLY-NEXT: vmv.s.x v0, a1 -; RV64V-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, s0 +; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, t0 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a0 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64V-ONLY-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64V-ONLY-NEXT: .cfi_restore s0 ; RV64V-ONLY-NEXT: addi sp, sp, 16 @@ -1556,52 +1526,52 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; ; RVA22U64-LABEL: buildvec_v16i8_loads_contigous: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: lbu a1, 1(a0) -; RVA22U64-NEXT: lbu a2, 0(a0) +; RVA22U64-NEXT: lbu a6, 0(a0) +; RVA22U64-NEXT: lbu a2, 1(a0) ; RVA22U64-NEXT: lbu a3, 2(a0) ; RVA22U64-NEXT: lbu a4, 3(a0) -; RVA22U64-NEXT: slli a1, a1, 8 -; RVA22U64-NEXT: or a1, a1, a2 +; RVA22U64-NEXT: lbu a5, 4(a0) +; RVA22U64-NEXT: lbu a1, 5(a0) +; RVA22U64-NEXT: lbu a7, 6(a0) +; RVA22U64-NEXT: lbu t0, 7(a0) +; RVA22U64-NEXT: slli a2, a2, 8 ; RVA22U64-NEXT: slli a3, a3, 16 ; RVA22U64-NEXT: slli a4, a4, 24 -; RVA22U64-NEXT: or a3, a3, a4 -; RVA22U64-NEXT: or a1, a1, a3 -; RVA22U64-NEXT: lbu a2, 4(a0) -; 
RVA22U64-NEXT: lbu a3, 5(a0) -; RVA22U64-NEXT: lbu a4, 6(a0) -; RVA22U64-NEXT: lbu a5, 7(a0) -; RVA22U64-NEXT: slli a2, a2, 32 -; RVA22U64-NEXT: slli a3, a3, 40 -; RVA22U64-NEXT: or a2, a2, a3 -; RVA22U64-NEXT: slli a4, a4, 48 -; RVA22U64-NEXT: slli a5, a5, 56 -; RVA22U64-NEXT: or a4, a4, a5 -; RVA22U64-NEXT: or a2, a2, a4 -; RVA22U64-NEXT: lbu a3, 8(a0) -; RVA22U64-NEXT: lbu a4, 9(a0) -; RVA22U64-NEXT: or a1, a1, a2 +; RVA22U64-NEXT: slli a5, a5, 32 +; RVA22U64-NEXT: slli a1, a1, 40 +; RVA22U64-NEXT: or a6, a6, a2 +; RVA22U64-NEXT: or t2, a4, a3 +; RVA22U64-NEXT: or t1, a1, a5 +; RVA22U64-NEXT: lbu a4, 8(a0) +; RVA22U64-NEXT: lbu a5, 9(a0) ; RVA22U64-NEXT: lbu a2, 10(a0) -; RVA22U64-NEXT: lbu a5, 11(a0) -; RVA22U64-NEXT: slli a4, a4, 8 -; RVA22U64-NEXT: or a3, a3, a4 +; RVA22U64-NEXT: lbu a1, 11(a0) +; RVA22U64-NEXT: slli a7, a7, 48 +; RVA22U64-NEXT: slli t0, t0, 56 +; RVA22U64-NEXT: slli a5, a5, 8 ; RVA22U64-NEXT: slli a2, a2, 16 -; RVA22U64-NEXT: slli a5, a5, 24 -; RVA22U64-NEXT: or a2, a2, a5 -; RVA22U64-NEXT: or a2, a2, a3 -; RVA22U64-NEXT: lbu a3, 12(a0) -; RVA22U64-NEXT: lbu a4, 13(a0) -; RVA22U64-NEXT: lbu a5, 14(a0) +; RVA22U64-NEXT: slli a1, a1, 24 +; RVA22U64-NEXT: or a7, t0, a7 +; RVA22U64-NEXT: or a4, a4, a5 +; RVA22U64-NEXT: or a1, a1, a2 +; RVA22U64-NEXT: lbu a2, 12(a0) +; RVA22U64-NEXT: lbu a5, 13(a0) +; RVA22U64-NEXT: lbu a3, 14(a0) ; RVA22U64-NEXT: lbu a0, 15(a0) -; RVA22U64-NEXT: slli a3, a3, 32 -; RVA22U64-NEXT: slli a4, a4, 40 -; RVA22U64-NEXT: or a3, a3, a4 -; RVA22U64-NEXT: slli a5, a5, 48 +; RVA22U64-NEXT: slli a2, a2, 32 +; RVA22U64-NEXT: slli a5, a5, 40 +; RVA22U64-NEXT: or a2, a2, a5 +; RVA22U64-NEXT: slli a3, a3, 48 ; RVA22U64-NEXT: slli a0, a0, 56 -; RVA22U64-NEXT: or a0, a0, a5 ; RVA22U64-NEXT: or a0, a0, a3 +; RVA22U64-NEXT: or a3, a6, t2 +; RVA22U64-NEXT: or a5, a7, t1 +; RVA22U64-NEXT: or a1, a1, a4 ; RVA22U64-NEXT: or a0, a0, a2 +; RVA22U64-NEXT: or a3, a3, a5 +; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-NEXT: vmv.v.x v8, a1 +; RVA22U64-NEXT: vmv.v.x v8, a3 ; RVA22U64-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-NEXT: ret ; @@ -1610,35 +1580,35 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RVA22U64-PACK-NEXT: lbu a1, 0(a0) ; RVA22U64-PACK-NEXT: lbu a2, 1(a0) ; RVA22U64-PACK-NEXT: lbu a6, 2(a0) -; RVA22U64-PACK-NEXT: lbu a4, 3(a0) -; RVA22U64-PACK-NEXT: packh a7, a1, a2 -; RVA22U64-PACK-NEXT: lbu a2, 4(a0) -; RVA22U64-PACK-NEXT: lbu a5, 5(a0) -; RVA22U64-PACK-NEXT: lbu a3, 6(a0) -; RVA22U64-PACK-NEXT: lbu a1, 7(a0) -; RVA22U64-PACK-NEXT: packh a4, a6, a4 -; RVA22U64-PACK-NEXT: packw a4, a7, a4 -; RVA22U64-PACK-NEXT: packh a2, a2, a5 -; RVA22U64-PACK-NEXT: packh a1, a3, a1 -; RVA22U64-PACK-NEXT: packw a1, a2, a1 -; RVA22U64-PACK-NEXT: lbu a2, 8(a0) -; RVA22U64-PACK-NEXT: lbu a3, 9(a0) -; RVA22U64-PACK-NEXT: pack a6, a4, a1 -; RVA22U64-PACK-NEXT: lbu a7, 10(a0) -; RVA22U64-PACK-NEXT: lbu a5, 11(a0) -; RVA22U64-PACK-NEXT: packh a2, a2, a3 -; RVA22U64-PACK-NEXT: lbu a3, 12(a0) -; RVA22U64-PACK-NEXT: lbu a1, 13(a0) -; RVA22U64-PACK-NEXT: lbu a4, 14(a0) +; RVA22U64-PACK-NEXT: lbu a7, 3(a0) +; RVA22U64-PACK-NEXT: lbu t0, 4(a0) +; RVA22U64-PACK-NEXT: lbu a3, 5(a0) +; RVA22U64-PACK-NEXT: lbu a4, 6(a0) +; RVA22U64-PACK-NEXT: lbu a5, 7(a0) +; RVA22U64-PACK-NEXT: packh t1, a1, a2 +; RVA22U64-PACK-NEXT: lbu t2, 8(a0) +; RVA22U64-PACK-NEXT: lbu t3, 9(a0) +; RVA22U64-PACK-NEXT: lbu t4, 10(a0) +; RVA22U64-PACK-NEXT: lbu a1, 11(a0) +; RVA22U64-PACK-NEXT: packh a6, a6, a7 +; RVA22U64-PACK-NEXT: packh a7, t0, 
a3 +; RVA22U64-PACK-NEXT: packh t0, a4, a5 +; RVA22U64-PACK-NEXT: lbu a5, 12(a0) +; RVA22U64-PACK-NEXT: lbu a3, 13(a0) +; RVA22U64-PACK-NEXT: lbu a2, 14(a0) ; RVA22U64-PACK-NEXT: lbu a0, 15(a0) -; RVA22U64-PACK-NEXT: packh a5, a7, a5 -; RVA22U64-PACK-NEXT: packw a2, a2, a5 -; RVA22U64-PACK-NEXT: packh a1, a3, a1 -; RVA22U64-PACK-NEXT: packh a0, a4, a0 -; RVA22U64-PACK-NEXT: packw a0, a1, a0 -; RVA22U64-PACK-NEXT: pack a0, a2, a0 +; RVA22U64-PACK-NEXT: packh a4, t2, t3 +; RVA22U64-PACK-NEXT: packh a1, t4, a1 +; RVA22U64-PACK-NEXT: packh a3, a5, a3 +; RVA22U64-PACK-NEXT: packh a0, a2, a0 +; RVA22U64-PACK-NEXT: packw a2, t1, a6 +; RVA22U64-PACK-NEXT: packw a5, a7, t0 +; RVA22U64-PACK-NEXT: packw a1, a4, a1 +; RVA22U64-PACK-NEXT: packw a0, a3, a0 +; RVA22U64-PACK-NEXT: pack a2, a2, a5 +; RVA22U64-PACK-NEXT: pack a0, a1, a0 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-PACK-NEXT: vmv.v.x v8, a6 +; RVA22U64-PACK-NEXT: vmv.v.x v8, a2 ; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-PACK-NEXT: ret ; @@ -1660,32 +1630,31 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV64ZVE32-NEXT: lbu t2, 9(a0) ; RV64ZVE32-NEXT: lbu t3, 10(a0) ; RV64ZVE32-NEXT: lbu t4, 11(a0) +; RV64ZVE32-NEXT: li t5, 255 +; RV64ZVE32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32-NEXT: vmv.s.x v0, t5 ; RV64ZVE32-NEXT: lbu t5, 12(a0) ; RV64ZVE32-NEXT: lbu t6, 13(a0) ; RV64ZVE32-NEXT: lbu s0, 14(a0) ; RV64ZVE32-NEXT: lbu a0, 15(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64ZVE32-NEXT: vmv.v.x v8, a1 +; RV64ZVE32-NEXT: vmv.v.x v9, t1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t2 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t3 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t4 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t5 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t6 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32-NEXT: vslide1down.vx v9, v8, t0 -; RV64ZVE32-NEXT: vmv.v.x v8, t1 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t2 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t3 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t4 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t5 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t6 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, s0 -; RV64ZVE32-NEXT: li a1, 255 -; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32-NEXT: vmv.s.x v0, a1 -; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, s0 +; RV64ZVE32-NEXT: vslide1down.vx v10, v8, t0 +; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64ZVE32-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64ZVE32-NEXT: .cfi_restore s0 ; RV64ZVE32-NEXT: addi sp, sp, 16 @@ -1763,32 +1732,31 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV32-ONLY-NEXT: lbu t2, 154(a0) ; RV32-ONLY-NEXT: lbu t3, 161(a0) ; RV32-ONLY-NEXT: lbu t4, 163(a0) +; RV32-ONLY-NEXT: li t5, 255 +; RV32-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-ONLY-NEXT: vmv.s.x v0, t5 ; RV32-ONLY-NEXT: lbu t5, 93(a0) ; RV32-ONLY-NEXT: lbu t6, 105(a0) ; RV32-ONLY-NEXT: lbu s0, 124(a0) ; RV32-ONLY-NEXT: lbu a0, 144(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; 
RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV32-ONLY-NEXT: vmv.v.x v8, a1 +; RV32-ONLY-NEXT: vmv.v.x v9, t1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t5 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t6 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t3 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a6 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, s0 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t4 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV32-ONLY-NEXT: vslide1down.vx v9, v8, t0 -; RV32-ONLY-NEXT: vmv.v.x v8, t1 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t5 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t6 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t3 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, s0 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t4 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV32-ONLY-NEXT: li a0, 255 -; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-ONLY-NEXT: vmv.s.x v0, a0 -; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t2 -; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a0 +; RV32-ONLY-NEXT: vslide1down.vx v10, v8, t0 +; RV32-ONLY-NEXT: vslide1down.vx v8, v9, t2 +; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV32-ONLY-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-ONLY-NEXT: .cfi_restore s0 ; RV32-ONLY-NEXT: addi sp, sp, 16 @@ -1797,50 +1765,50 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; ; RV32VB-LABEL: buildvec_v16i8_loads_gather: ; RV32VB: # %bb.0: -; RV32VB-NEXT: lbu a1, 1(a0) -; RV32VB-NEXT: lbu a2, 0(a0) +; RV32VB-NEXT: lbu a1, 0(a0) +; RV32VB-NEXT: lbu a2, 1(a0) ; RV32VB-NEXT: lbu a3, 22(a0) ; RV32VB-NEXT: lbu a4, 31(a0) -; RV32VB-NEXT: slli a1, a1, 8 -; RV32VB-NEXT: or a1, a2, a1 +; RV32VB-NEXT: lbu a5, 623(a0) +; RV32VB-NEXT: lbu a6, 44(a0) +; RV32VB-NEXT: lbu a7, 55(a0) +; RV32VB-NEXT: lbu t0, 75(a0) +; RV32VB-NEXT: lbu t1, 82(a0) +; RV32VB-NEXT: slli a2, a2, 8 ; RV32VB-NEXT: slli a3, a3, 16 ; RV32VB-NEXT: slli a4, a4, 24 +; RV32VB-NEXT: or a1, a1, a2 ; RV32VB-NEXT: or a3, a4, a3 -; RV32VB-NEXT: or a1, a1, a3 -; RV32VB-NEXT: lbu a2, 44(a0) -; RV32VB-NEXT: lbu a3, 55(a0) -; RV32VB-NEXT: lbu a4, 623(a0) -; RV32VB-NEXT: lbu a5, 75(a0) -; RV32VB-NEXT: lbu a6, 82(a0) -; RV32VB-NEXT: slli a3, a3, 8 -; RV32VB-NEXT: or a2, a2, a3 -; RV32VB-NEXT: slli a4, a4, 16 -; RV32VB-NEXT: slli a5, a5, 24 -; RV32VB-NEXT: or a4, a5, a4 -; RV32VB-NEXT: or a2, a2, a4 -; RV32VB-NEXT: lbu a3, 93(a0) +; RV32VB-NEXT: lbu a2, 93(a0) ; RV32VB-NEXT: lbu a4, 105(a0) -; RV32VB-NEXT: lbu a5, 124(a0) -; RV32VB-NEXT: lbu a7, 144(a0) -; RV32VB-NEXT: slli a3, a3, 8 -; RV32VB-NEXT: lbu t0, 154(a0) -; RV32VB-NEXT: lbu t1, 161(a0) -; RV32VB-NEXT: or a3, a6, a3 -; RV32VB-NEXT: slli a4, a4, 16 +; RV32VB-NEXT: lbu t2, 124(a0) +; RV32VB-NEXT: lbu t3, 144(a0) +; RV32VB-NEXT: slli a7, a7, 8 +; RV32VB-NEXT: slli a5, a5, 16 +; RV32VB-NEXT: slli t0, t0, 24 +; RV32VB-NEXT: slli a2, a2, 8 +; RV32VB-NEXT: or a6, a6, a7 +; RV32VB-NEXT: or a5, t0, a5 +; RV32VB-NEXT: lbu a7, 154(a0) +; RV32VB-NEXT: lbu t0, 161(a0) +; RV32VB-NEXT: or a2, t1, a2 ; RV32VB-NEXT: lbu a0, 163(a0) -; RV32VB-NEXT: slli t1, t1, 24 -; RV32VB-NEXT: or a4, t1, a4 -; RV32VB-NEXT: or a3, a3, a4 -; RV32VB-NEXT: slli a0, a0, 8 -; RV32VB-NEXT: or a0, a5, a0 -; RV32VB-NEXT: slli a7, a7, 16 +; RV32VB-NEXT: slli a4, a4, 16 ; RV32VB-NEXT: slli t0, t0, 24 -; RV32VB-NEXT: or 
a4, t0, a7 -; RV32VB-NEXT: or a0, a0, a4 +; RV32VB-NEXT: or a4, t0, a4 +; RV32VB-NEXT: slli a0, a0, 8 +; RV32VB-NEXT: or a0, t2, a0 +; RV32VB-NEXT: slli t3, t3, 16 +; RV32VB-NEXT: slli a7, a7, 24 +; RV32VB-NEXT: or a7, a7, t3 +; RV32VB-NEXT: or a1, a1, a3 +; RV32VB-NEXT: or a3, a6, a5 +; RV32VB-NEXT: or a2, a2, a4 +; RV32VB-NEXT: or a0, a0, a7 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a1 -; RV32VB-NEXT: vslide1down.vx v8, v8, a2 ; RV32VB-NEXT: vslide1down.vx v8, v8, a3 +; RV32VB-NEXT: vslide1down.vx v8, v8, a2 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-NEXT: ret ; @@ -1850,34 +1818,34 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV32VB-PACK-NEXT: lbu a2, 1(a0) ; RV32VB-PACK-NEXT: lbu a3, 22(a0) ; RV32VB-PACK-NEXT: lbu a4, 31(a0) +; RV32VB-PACK-NEXT: lbu a5, 623(a0) +; RV32VB-PACK-NEXT: lbu a6, 44(a0) +; RV32VB-PACK-NEXT: lbu a7, 55(a0) +; RV32VB-PACK-NEXT: lbu t0, 75(a0) +; RV32VB-PACK-NEXT: lbu t1, 82(a0) ; RV32VB-PACK-NEXT: packh a1, a1, a2 -; RV32VB-PACK-NEXT: packh a2, a3, a4 -; RV32VB-PACK-NEXT: lbu a3, 623(a0) -; RV32VB-PACK-NEXT: lbu a4, 44(a0) -; RV32VB-PACK-NEXT: lbu a5, 55(a0) -; RV32VB-PACK-NEXT: lbu a6, 75(a0) -; RV32VB-PACK-NEXT: pack a1, a1, a2 -; RV32VB-PACK-NEXT: lbu a2, 82(a0) -; RV32VB-PACK-NEXT: packh a4, a4, a5 -; RV32VB-PACK-NEXT: packh a3, a3, a6 -; RV32VB-PACK-NEXT: pack a3, a4, a3 -; RV32VB-PACK-NEXT: lbu a4, 154(a0) -; RV32VB-PACK-NEXT: lbu a5, 161(a0) -; RV32VB-PACK-NEXT: lbu a6, 163(a0) -; RV32VB-PACK-NEXT: lbu a7, 93(a0) -; RV32VB-PACK-NEXT: lbu t0, 105(a0) -; RV32VB-PACK-NEXT: lbu t1, 124(a0) +; RV32VB-PACK-NEXT: lbu a2, 154(a0) +; RV32VB-PACK-NEXT: lbu t2, 161(a0) +; RV32VB-PACK-NEXT: lbu t3, 163(a0) +; RV32VB-PACK-NEXT: packh a3, a3, a4 +; RV32VB-PACK-NEXT: packh a4, a6, a7 +; RV32VB-PACK-NEXT: packh a5, a5, t0 +; RV32VB-PACK-NEXT: lbu a6, 93(a0) +; RV32VB-PACK-NEXT: lbu a7, 105(a0) +; RV32VB-PACK-NEXT: lbu t0, 124(a0) ; RV32VB-PACK-NEXT: lbu a0, 144(a0) -; RV32VB-PACK-NEXT: packh a2, a2, a7 -; RV32VB-PACK-NEXT: packh a5, t0, a5 -; RV32VB-PACK-NEXT: pack a2, a2, a5 -; RV32VB-PACK-NEXT: packh a5, t1, a6 -; RV32VB-PACK-NEXT: packh a0, a0, a4 -; RV32VB-PACK-NEXT: pack a0, a5, a0 +; RV32VB-PACK-NEXT: packh a6, t1, a6 +; RV32VB-PACK-NEXT: packh a7, a7, t2 +; RV32VB-PACK-NEXT: packh t0, t0, t3 +; RV32VB-PACK-NEXT: packh a0, a0, a2 +; RV32VB-PACK-NEXT: pack a1, a1, a3 +; RV32VB-PACK-NEXT: pack a2, a4, a5 +; RV32VB-PACK-NEXT: pack a3, a6, a7 +; RV32VB-PACK-NEXT: pack a0, t0, a0 ; RV32VB-PACK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-PACK-NEXT: vmv.v.x v8, a1 -; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a3 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a2 +; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a3 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-PACK-NEXT: ret ; @@ -1899,32 +1867,31 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV64V-ONLY-NEXT: lbu t2, 154(a0) ; RV64V-ONLY-NEXT: lbu t3, 161(a0) ; RV64V-ONLY-NEXT: lbu t4, 163(a0) +; RV64V-ONLY-NEXT: li t5, 255 +; RV64V-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.s.x v0, t5 ; RV64V-ONLY-NEXT: lbu t5, 93(a0) ; RV64V-ONLY-NEXT: lbu t6, 105(a0) ; RV64V-ONLY-NEXT: lbu s0, 124(a0) ; RV64V-ONLY-NEXT: lbu a0, 144(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vmv.v.x v8, a1 +; RV64V-ONLY-NEXT: vmv.v.x v9, t1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t5 ; RV64V-ONLY-NEXT: vslide1down.vx v8, 
v8, a3 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t6 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t3 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a6 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, s0 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t4 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v8, t0 -; RV64V-ONLY-NEXT: vmv.v.x v8, t1 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t5 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t6 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t3 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, s0 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t4 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV64V-ONLY-NEXT: li a0, 255 -; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64V-ONLY-NEXT: vmv.s.x v0, a0 -; RV64V-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t2 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, t0 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, t2 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64V-ONLY-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64V-ONLY-NEXT: .cfi_restore s0 ; RV64V-ONLY-NEXT: addi sp, sp, 16 @@ -1933,90 +1900,98 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; ; RVA22U64-LABEL: buildvec_v16i8_loads_gather: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: lbu a1, 1(a0) -; RVA22U64-NEXT: lbu a2, 0(a0) +; RVA22U64-NEXT: lbu a1, 0(a0) +; RVA22U64-NEXT: lbu a2, 1(a0) ; RVA22U64-NEXT: lbu a3, 22(a0) ; RVA22U64-NEXT: lbu a4, 31(a0) -; RVA22U64-NEXT: slli a1, a1, 8 -; RVA22U64-NEXT: or a1, a1, a2 +; RVA22U64-NEXT: lbu a6, 623(a0) +; RVA22U64-NEXT: lbu t0, 44(a0) +; RVA22U64-NEXT: lbu a7, 55(a0) +; RVA22U64-NEXT: lbu a5, 75(a0) +; RVA22U64-NEXT: lbu t1, 82(a0) +; RVA22U64-NEXT: slli a2, a2, 8 ; RVA22U64-NEXT: slli a3, a3, 16 ; RVA22U64-NEXT: slli a4, a4, 24 -; RVA22U64-NEXT: or a3, a3, a4 -; RVA22U64-NEXT: or a1, a1, a3 -; RVA22U64-NEXT: lbu a2, 623(a0) -; RVA22U64-NEXT: lbu a3, 44(a0) -; RVA22U64-NEXT: lbu a4, 55(a0) -; RVA22U64-NEXT: lbu a5, 75(a0) -; RVA22U64-NEXT: lbu a6, 82(a0) -; RVA22U64-NEXT: slli a3, a3, 32 -; RVA22U64-NEXT: slli a4, a4, 40 -; RVA22U64-NEXT: or a3, a3, a4 -; RVA22U64-NEXT: slli a2, a2, 48 -; RVA22U64-NEXT: slli a5, a5, 56 -; RVA22U64-NEXT: or a2, a2, a5 -; RVA22U64-NEXT: or a2, a2, a3 -; RVA22U64-NEXT: or a7, a1, a2 +; RVA22U64-NEXT: or t2, a1, a2 +; RVA22U64-NEXT: or t3, a4, a3 ; RVA22U64-NEXT: lbu a2, 93(a0) -; RVA22U64-NEXT: lbu t0, 105(a0) -; RVA22U64-NEXT: lbu a4, 124(a0) -; RVA22U64-NEXT: lbu a5, 144(a0) +; RVA22U64-NEXT: lbu t4, 105(a0) +; RVA22U64-NEXT: lbu t6, 124(a0) +; RVA22U64-NEXT: lbu t5, 144(a0) +; RVA22U64-NEXT: slli t0, t0, 32 +; RVA22U64-NEXT: slli a7, a7, 40 +; RVA22U64-NEXT: slli a6, a6, 48 +; RVA22U64-NEXT: slli a5, a5, 56 ; RVA22U64-NEXT: slli a2, a2, 8 -; RVA22U64-NEXT: lbu a1, 154(a0) -; RVA22U64-NEXT: lbu a3, 161(a0) -; RVA22U64-NEXT: or a2, a6, a2 -; RVA22U64-NEXT: slli t0, t0, 16 +; RVA22U64-NEXT: or a7, a7, t0 +; RVA22U64-NEXT: or a5, a5, a6 +; RVA22U64-NEXT: lbu a3, 154(a0) +; RVA22U64-NEXT: lbu a1, 161(a0) +; RVA22U64-NEXT: or a2, t1, a2 ; RVA22U64-NEXT: lbu a0, 163(a0) -; RVA22U64-NEXT: slli a3, a3, 24 -; RVA22U64-NEXT: or a3, a3, t0 -; RVA22U64-NEXT: or a2, a2, a3 -; RVA22U64-NEXT: slli a4, a4, 32 +; RVA22U64-NEXT: slli t4, t4, 16 +; RVA22U64-NEXT: slli a1, a1, 24 +; RVA22U64-NEXT: or a1, a1, t4 +; RVA22U64-NEXT: slli t6, t6, 32 ; 
RVA22U64-NEXT: slli a0, a0, 40 -; RVA22U64-NEXT: or a0, a0, a4 -; RVA22U64-NEXT: slli a5, a5, 48 -; RVA22U64-NEXT: slli a1, a1, 56 -; RVA22U64-NEXT: or a1, a1, a5 +; RVA22U64-NEXT: or a0, a0, t6 +; RVA22U64-NEXT: slli t5, t5, 48 +; RVA22U64-NEXT: slli a3, a3, 56 +; RVA22U64-NEXT: or a3, a3, t5 +; RVA22U64-NEXT: or a4, t2, t3 +; RVA22U64-NEXT: or a5, a5, a7 +; RVA22U64-NEXT: or a1, a1, a2 +; RVA22U64-NEXT: or a0, a0, a3 +; RVA22U64-NEXT: or a4, a4, a5 ; RVA22U64-NEXT: or a0, a0, a1 -; RVA22U64-NEXT: or a0, a0, a2 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-NEXT: vmv.v.x v8, a7 +; RVA22U64-NEXT: vmv.v.x v8, a4 ; RVA22U64-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-NEXT: ret ; ; RVA22U64-PACK-LABEL: buildvec_v16i8_loads_gather: ; RVA22U64-PACK: # %bb.0: +; RVA22U64-PACK-NEXT: addi sp, sp, -16 +; RVA22U64-PACK-NEXT: .cfi_def_cfa_offset 16 +; RVA22U64-PACK-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RVA22U64-PACK-NEXT: .cfi_offset s0, -8 ; RVA22U64-PACK-NEXT: lbu a1, 0(a0) ; RVA22U64-PACK-NEXT: lbu a2, 1(a0) -; RVA22U64-PACK-NEXT: lbu a3, 22(a0) -; RVA22U64-PACK-NEXT: lbu a4, 31(a0) -; RVA22U64-PACK-NEXT: packh a6, a1, a2 -; RVA22U64-PACK-NEXT: packh a2, a3, a4 -; RVA22U64-PACK-NEXT: lbu a3, 623(a0) -; RVA22U64-PACK-NEXT: lbu a4, 44(a0) -; RVA22U64-PACK-NEXT: lbu a5, 55(a0) -; RVA22U64-PACK-NEXT: lbu a1, 75(a0) -; RVA22U64-PACK-NEXT: packw a2, a6, a2 -; RVA22U64-PACK-NEXT: lbu a6, 82(a0) -; RVA22U64-PACK-NEXT: packh a4, a4, a5 -; RVA22U64-PACK-NEXT: packh a1, a3, a1 -; RVA22U64-PACK-NEXT: packw a1, a4, a1 -; RVA22U64-PACK-NEXT: pack a7, a2, a1 -; RVA22U64-PACK-NEXT: lbu t0, 154(a0) -; RVA22U64-PACK-NEXT: lbu a3, 161(a0) -; RVA22U64-PACK-NEXT: lbu a4, 163(a0) -; RVA22U64-PACK-NEXT: lbu a5, 93(a0) -; RVA22U64-PACK-NEXT: lbu a1, 105(a0) -; RVA22U64-PACK-NEXT: lbu a2, 124(a0) +; RVA22U64-PACK-NEXT: lbu a6, 22(a0) +; RVA22U64-PACK-NEXT: lbu a7, 31(a0) +; RVA22U64-PACK-NEXT: lbu t0, 623(a0) +; RVA22U64-PACK-NEXT: lbu t3, 44(a0) +; RVA22U64-PACK-NEXT: lbu t4, 55(a0) +; RVA22U64-PACK-NEXT: lbu t5, 75(a0) +; RVA22U64-PACK-NEXT: lbu t1, 82(a0) +; RVA22U64-PACK-NEXT: packh t2, a1, a2 +; RVA22U64-PACK-NEXT: lbu t6, 154(a0) +; RVA22U64-PACK-NEXT: lbu s0, 161(a0) +; RVA22U64-PACK-NEXT: lbu a3, 163(a0) +; RVA22U64-PACK-NEXT: packh a6, a6, a7 +; RVA22U64-PACK-NEXT: packh a7, t3, t4 +; RVA22U64-PACK-NEXT: packh a2, t0, t5 +; RVA22U64-PACK-NEXT: lbu a4, 93(a0) +; RVA22U64-PACK-NEXT: lbu a5, 105(a0) +; RVA22U64-PACK-NEXT: lbu a1, 124(a0) ; RVA22U64-PACK-NEXT: lbu a0, 144(a0) -; RVA22U64-PACK-NEXT: packh a5, a6, a5 +; RVA22U64-PACK-NEXT: packh a4, t1, a4 +; RVA22U64-PACK-NEXT: packh a5, a5, s0 ; RVA22U64-PACK-NEXT: packh a1, a1, a3 -; RVA22U64-PACK-NEXT: packw a1, a5, a1 -; RVA22U64-PACK-NEXT: packh a2, a2, a4 -; RVA22U64-PACK-NEXT: packh a0, a0, t0 -; RVA22U64-PACK-NEXT: packw a0, a2, a0 -; RVA22U64-PACK-NEXT: pack a0, a1, a0 +; RVA22U64-PACK-NEXT: packh a0, a0, t6 +; RVA22U64-PACK-NEXT: packw a3, t2, a6 +; RVA22U64-PACK-NEXT: packw a2, a7, a2 +; RVA22U64-PACK-NEXT: packw a4, a4, a5 +; RVA22U64-PACK-NEXT: packw a0, a1, a0 +; RVA22U64-PACK-NEXT: pack a1, a3, a2 +; RVA22U64-PACK-NEXT: pack a0, a4, a0 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-PACK-NEXT: vmv.v.x v8, a7 +; RVA22U64-PACK-NEXT: vmv.v.x v8, a1 ; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a0 +; RVA22U64-PACK-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RVA22U64-PACK-NEXT: .cfi_restore s0 +; RVA22U64-PACK-NEXT: addi sp, sp, 16 +; RVA22U64-PACK-NEXT: .cfi_def_cfa_offset 0 ; RVA22U64-PACK-NEXT: ret ; ; 
RV64ZVE32-LABEL: buildvec_v16i8_loads_gather: @@ -2037,32 +2012,31 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV64ZVE32-NEXT: lbu t2, 154(a0) ; RV64ZVE32-NEXT: lbu t3, 161(a0) ; RV64ZVE32-NEXT: lbu t4, 163(a0) +; RV64ZVE32-NEXT: li t5, 255 +; RV64ZVE32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32-NEXT: vmv.s.x v0, t5 ; RV64ZVE32-NEXT: lbu t5, 93(a0) ; RV64ZVE32-NEXT: lbu t6, 105(a0) ; RV64ZVE32-NEXT: lbu s0, 124(a0) ; RV64ZVE32-NEXT: lbu a0, 144(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64ZVE32-NEXT: vmv.v.x v8, a1 +; RV64ZVE32-NEXT: vmv.v.x v9, t1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t5 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t6 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t3 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, s0 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t4 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32-NEXT: vslide1down.vx v9, v8, t0 -; RV64ZVE32-NEXT: vmv.v.x v8, t1 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t5 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t6 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t3 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, s0 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t4 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32-NEXT: li a0, 255 -; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32-NEXT: vmv.s.x v0, a0 -; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t2 -; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a0 +; RV64ZVE32-NEXT: vslide1down.vx v10, v8, t0 +; RV64ZVE32-NEXT: vslide1down.vx v8, v9, t2 +; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64ZVE32-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64ZVE32-NEXT: .cfi_restore s0 ; RV64ZVE32-NEXT: addi sp, sp, 16 @@ -2153,19 +2127,19 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; RV32VB-NEXT: lbu a6, 154(a0) ; RV32VB-NEXT: lbu a7, 161(a0) ; RV32VB-NEXT: or a1, a2, a1 -; RV32VB-NEXT: slli a3, a3, 16 ; RV32VB-NEXT: lbu a0, 163(a0) +; RV32VB-NEXT: slli a3, a3, 16 ; RV32VB-NEXT: slli a7, a7, 24 ; RV32VB-NEXT: or a2, a7, a3 -; RV32VB-NEXT: or a1, a1, a2 ; RV32VB-NEXT: slli a0, a0, 8 ; RV32VB-NEXT: or a0, a4, a0 ; RV32VB-NEXT: slli a5, a5, 16 ; RV32VB-NEXT: slli a6, a6, 24 -; RV32VB-NEXT: or a2, a6, a5 -; RV32VB-NEXT: or a0, a0, a2 +; RV32VB-NEXT: or a3, a6, a5 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.i v8, 0 +; RV32VB-NEXT: or a1, a1, a2 +; RV32VB-NEXT: or a0, a0, a3 ; RV32VB-NEXT: vslide1down.vx v8, v8, zero ; RV32VB-NEXT: vslide1down.vx v8, v8, a1 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 @@ -2173,26 +2147,26 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; ; RV32VB-PACK-LABEL: buildvec_v16i8_undef_low_half: ; RV32VB-PACK: # %bb.0: -; RV32VB-PACK-NEXT: lbu a1, 144(a0) -; RV32VB-PACK-NEXT: lbu a2, 154(a0) -; RV32VB-PACK-NEXT: lbu a3, 161(a0) -; RV32VB-PACK-NEXT: lbu a4, 82(a0) -; RV32VB-PACK-NEXT: lbu a5, 93(a0) -; RV32VB-PACK-NEXT: lbu a6, 105(a0) -; RV32VB-PACK-NEXT: lbu a7, 124(a0) -; RV32VB-PACK-NEXT: lbu a0, 163(a0) -; RV32VB-PACK-NEXT: packh a4, a4, a5 -; RV32VB-PACK-NEXT: packh a3, a6, a3 -; RV32VB-PACK-NEXT: pack a3, a4, a3 -; RV32VB-PACK-NEXT: packh a0, a7, a0 +; RV32VB-PACK-NEXT: lbu a1, 82(a0) +; 
RV32VB-PACK-NEXT: lbu a2, 93(a0) +; RV32VB-PACK-NEXT: lbu a3, 105(a0) +; RV32VB-PACK-NEXT: lbu a4, 124(a0) +; RV32VB-PACK-NEXT: lbu a5, 161(a0) +; RV32VB-PACK-NEXT: lbu a6, 163(a0) +; RV32VB-PACK-NEXT: lbu a7, 144(a0) +; RV32VB-PACK-NEXT: lbu a0, 154(a0) ; RV32VB-PACK-NEXT: packh a1, a1, a2 -; RV32VB-PACK-NEXT: pack a0, a0, a1 -; RV32VB-PACK-NEXT: packh a1, a0, a0 -; RV32VB-PACK-NEXT: pack a1, a1, a1 +; RV32VB-PACK-NEXT: packh a2, a3, a5 +; RV32VB-PACK-NEXT: packh a3, a4, a6 +; RV32VB-PACK-NEXT: packh a0, a7, a0 +; RV32VB-PACK-NEXT: pack a1, a1, a2 +; RV32VB-PACK-NEXT: packh a2, a0, a0 +; RV32VB-PACK-NEXT: pack a2, a2, a2 ; RV32VB-PACK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32VB-PACK-NEXT: vmv.v.x v8, a1 +; RV32VB-PACK-NEXT: vmv.v.x v8, a2 +; RV32VB-PACK-NEXT: pack a0, a3, a0 +; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a2 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a1 -; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a3 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-PACK-NEXT: ret ; @@ -2228,17 +2202,17 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; RVA22U64-NEXT: lbu a2, 154(a0) ; RVA22U64-NEXT: lbu a3, 161(a0) ; RVA22U64-NEXT: or a1, a6, a1 -; RVA22U64-NEXT: slli a7, a7, 16 ; RVA22U64-NEXT: lbu a0, 163(a0) +; RVA22U64-NEXT: slli a7, a7, 16 ; RVA22U64-NEXT: slli a3, a3, 24 ; RVA22U64-NEXT: or a3, a3, a7 -; RVA22U64-NEXT: or a1, a1, a3 ; RVA22U64-NEXT: slli a4, a4, 32 ; RVA22U64-NEXT: slli a0, a0, 40 ; RVA22U64-NEXT: or a0, a0, a4 ; RVA22U64-NEXT: slli a5, a5, 48 ; RVA22U64-NEXT: slli a2, a2, 56 ; RVA22U64-NEXT: or a2, a2, a5 +; RVA22U64-NEXT: or a1, a1, a3 ; RVA22U64-NEXT: or a0, a0, a2 ; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -2248,26 +2222,26 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_low_half: ; RVA22U64-PACK: # %bb.0: -; RVA22U64-PACK-NEXT: lbu a6, 144(a0) -; RVA22U64-PACK-NEXT: lbu a7, 154(a0) -; RVA22U64-PACK-NEXT: lbu a3, 161(a0) -; RVA22U64-PACK-NEXT: lbu a4, 82(a0) -; RVA22U64-PACK-NEXT: lbu a5, 93(a0) -; RVA22U64-PACK-NEXT: lbu a1, 105(a0) -; RVA22U64-PACK-NEXT: lbu a2, 124(a0) -; RVA22U64-PACK-NEXT: lbu a0, 163(a0) -; RVA22U64-PACK-NEXT: packh a4, a4, a5 -; RVA22U64-PACK-NEXT: packh a1, a1, a3 -; RVA22U64-PACK-NEXT: packw a1, a4, a1 +; RVA22U64-PACK-NEXT: lbu a6, 82(a0) +; RVA22U64-PACK-NEXT: lbu a7, 93(a0) +; RVA22U64-PACK-NEXT: lbu t0, 105(a0) +; RVA22U64-PACK-NEXT: lbu a4, 124(a0) +; RVA22U64-PACK-NEXT: lbu a5, 161(a0) +; RVA22U64-PACK-NEXT: lbu a1, 163(a0) +; RVA22U64-PACK-NEXT: lbu a2, 144(a0) +; RVA22U64-PACK-NEXT: lbu a0, 154(a0) +; RVA22U64-PACK-NEXT: packh a3, a6, a7 +; RVA22U64-PACK-NEXT: packh a5, t0, a5 +; RVA22U64-PACK-NEXT: packh a1, a4, a1 ; RVA22U64-PACK-NEXT: packh a0, a2, a0 -; RVA22U64-PACK-NEXT: packh a2, a6, a7 -; RVA22U64-PACK-NEXT: packw a0, a0, a2 -; RVA22U64-PACK-NEXT: pack a0, a1, a0 -; RVA22U64-PACK-NEXT: packh a1, a0, a0 -; RVA22U64-PACK-NEXT: packw a1, a1, a1 -; RVA22U64-PACK-NEXT: pack a1, a1, a1 +; RVA22U64-PACK-NEXT: packw a2, a3, a5 +; RVA22U64-PACK-NEXT: packh a3, a0, a0 +; RVA22U64-PACK-NEXT: packw a3, a3, a3 +; RVA22U64-PACK-NEXT: pack a3, a3, a3 +; RVA22U64-PACK-NEXT: packw a0, a1, a0 +; RVA22U64-PACK-NEXT: pack a0, a2, a0 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-PACK-NEXT: vmv.v.x v8, a1 +; RVA22U64-PACK-NEXT: vmv.v.x v8, a3 ; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-PACK-NEXT: ret ; @@ -2346,25 +2320,25 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; 
RV32VB-LABEL: buildvec_v16i8_undef_high_half: ; RV32VB: # %bb.0: ; RV32VB-NEXT: lbu a1, 1(a0) -; RV32VB-NEXT: lbu a2, 0(a0) -; RV32VB-NEXT: lbu a3, 22(a0) -; RV32VB-NEXT: lbu a4, 31(a0) +; RV32VB-NEXT: lbu a2, 22(a0) +; RV32VB-NEXT: lbu a3, 31(a0) +; RV32VB-NEXT: lbu a4, 0(a0) ; RV32VB-NEXT: slli a1, a1, 8 -; RV32VB-NEXT: or a1, a2, a1 -; RV32VB-NEXT: slli a3, a3, 16 -; RV32VB-NEXT: slli a4, a4, 24 -; RV32VB-NEXT: or a3, a4, a3 -; RV32VB-NEXT: lbu a2, 44(a0) -; RV32VB-NEXT: lbu a4, 55(a0) -; RV32VB-NEXT: or a1, a1, a3 +; RV32VB-NEXT: slli a2, a2, 16 +; RV32VB-NEXT: slli a3, a3, 24 +; RV32VB-NEXT: or a1, a4, a1 +; RV32VB-NEXT: lbu a4, 44(a0) +; RV32VB-NEXT: lbu a5, 55(a0) +; RV32VB-NEXT: or a2, a3, a2 ; RV32VB-NEXT: lbu a3, 623(a0) ; RV32VB-NEXT: lbu a0, 75(a0) -; RV32VB-NEXT: slli a4, a4, 8 -; RV32VB-NEXT: or a2, a2, a4 +; RV32VB-NEXT: slli a5, a5, 8 +; RV32VB-NEXT: or a4, a4, a5 ; RV32VB-NEXT: slli a3, a3, 16 ; RV32VB-NEXT: slli a0, a0, 24 ; RV32VB-NEXT: or a0, a0, a3 -; RV32VB-NEXT: or a0, a2, a0 +; RV32VB-NEXT: or a1, a1, a2 +; RV32VB-NEXT: or a0, a4, a0 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a1 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 @@ -2378,21 +2352,21 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; RV32VB-PACK-NEXT: lbu a2, 1(a0) ; RV32VB-PACK-NEXT: lbu a3, 22(a0) ; RV32VB-PACK-NEXT: lbu a4, 31(a0) -; RV32VB-PACK-NEXT: packh a1, a1, a2 -; RV32VB-PACK-NEXT: lbu a2, 623(a0) -; RV32VB-PACK-NEXT: lbu a5, 44(a0) -; RV32VB-PACK-NEXT: lbu a6, 55(a0) +; RV32VB-PACK-NEXT: lbu a5, 623(a0) +; RV32VB-PACK-NEXT: lbu a6, 44(a0) +; RV32VB-PACK-NEXT: lbu a7, 55(a0) ; RV32VB-PACK-NEXT: lbu a0, 75(a0) -; RV32VB-PACK-NEXT: packh a3, a3, a4 -; RV32VB-PACK-NEXT: pack a1, a1, a3 -; RV32VB-PACK-NEXT: packh a3, a5, a6 -; RV32VB-PACK-NEXT: packh a0, a2, a0 +; RV32VB-PACK-NEXT: packh a1, a1, a2 +; RV32VB-PACK-NEXT: packh a2, a3, a4 +; RV32VB-PACK-NEXT: packh a3, a6, a7 +; RV32VB-PACK-NEXT: packh a0, a5, a0 +; RV32VB-PACK-NEXT: pack a1, a1, a2 +; RV32VB-PACK-NEXT: packh a2, a0, a0 ; RV32VB-PACK-NEXT: pack a0, a3, a0 ; RV32VB-PACK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-PACK-NEXT: vmv.v.x v8, a1 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a0 -; RV32VB-PACK-NEXT: packh a0, a0, a0 -; RV32VB-PACK-NEXT: pack a0, a0, a0 +; RV32VB-PACK-NEXT: pack a0, a2, a2 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-PACK-NEXT: ret @@ -2422,26 +2396,26 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; RVA22U64-LABEL: buildvec_v16i8_undef_high_half: ; RVA22U64: # %bb.0: ; RVA22U64-NEXT: lbu a1, 1(a0) -; RVA22U64-NEXT: lbu a2, 0(a0) -; RVA22U64-NEXT: lbu a3, 22(a0) -; RVA22U64-NEXT: lbu a4, 31(a0) +; RVA22U64-NEXT: lbu a2, 22(a0) +; RVA22U64-NEXT: lbu a3, 31(a0) +; RVA22U64-NEXT: lbu a4, 0(a0) ; RVA22U64-NEXT: slli a1, a1, 8 -; RVA22U64-NEXT: or a1, a1, a2 -; RVA22U64-NEXT: slli a3, a3, 16 -; RVA22U64-NEXT: slli a4, a4, 24 -; RVA22U64-NEXT: or a3, a3, a4 -; RVA22U64-NEXT: or a1, a1, a3 -; RVA22U64-NEXT: lbu a2, 44(a0) -; RVA22U64-NEXT: lbu a3, 55(a0) -; RVA22U64-NEXT: lbu a4, 623(a0) -; RVA22U64-NEXT: lbu a0, 75(a0) -; RVA22U64-NEXT: slli a2, a2, 32 -; RVA22U64-NEXT: slli a3, a3, 40 +; RVA22U64-NEXT: slli a2, a2, 16 +; RVA22U64-NEXT: slli a3, a3, 24 +; RVA22U64-NEXT: or a1, a1, a4 ; RVA22U64-NEXT: or a2, a2, a3 -; RVA22U64-NEXT: slli a4, a4, 48 +; RVA22U64-NEXT: lbu a3, 44(a0) +; RVA22U64-NEXT: lbu a4, 55(a0) +; RVA22U64-NEXT: lbu a5, 623(a0) +; RVA22U64-NEXT: lbu a0, 75(a0) +; RVA22U64-NEXT: slli a3, 
a3, 32 +; RVA22U64-NEXT: slli a4, a4, 40 +; RVA22U64-NEXT: or a3, a3, a4 +; RVA22U64-NEXT: slli a5, a5, 48 ; RVA22U64-NEXT: slli a0, a0, 56 -; RVA22U64-NEXT: or a0, a0, a4 -; RVA22U64-NEXT: or a0, a0, a2 +; RVA22U64-NEXT: or a0, a0, a5 +; RVA22U64-NEXT: or a1, a1, a2 +; RVA22U64-NEXT: or a0, a0, a3 ; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-NEXT: vmv.v.x v8, a0 @@ -2450,26 +2424,26 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_high_half: ; RVA22U64-PACK: # %bb.0: -; RVA22U64-PACK-NEXT: lbu a1, 0(a0) -; RVA22U64-PACK-NEXT: lbu a2, 1(a0) -; RVA22U64-PACK-NEXT: lbu a6, 22(a0) +; RVA22U64-PACK-NEXT: lbu a6, 0(a0) +; RVA22U64-PACK-NEXT: lbu a7, 1(a0) +; RVA22U64-PACK-NEXT: lbu t0, 22(a0) ; RVA22U64-PACK-NEXT: lbu a4, 31(a0) -; RVA22U64-PACK-NEXT: packh a1, a1, a2 -; RVA22U64-PACK-NEXT: lbu a2, 623(a0) -; RVA22U64-PACK-NEXT: lbu a5, 44(a0) -; RVA22U64-PACK-NEXT: lbu a3, 55(a0) +; RVA22U64-PACK-NEXT: lbu a5, 623(a0) +; RVA22U64-PACK-NEXT: lbu a1, 44(a0) +; RVA22U64-PACK-NEXT: lbu a2, 55(a0) ; RVA22U64-PACK-NEXT: lbu a0, 75(a0) -; RVA22U64-PACK-NEXT: packh a4, a6, a4 -; RVA22U64-PACK-NEXT: packw a1, a1, a4 -; RVA22U64-PACK-NEXT: packh a3, a5, a3 -; RVA22U64-PACK-NEXT: packh a0, a2, a0 -; RVA22U64-PACK-NEXT: packw a0, a3, a0 -; RVA22U64-PACK-NEXT: pack a0, a1, a0 +; RVA22U64-PACK-NEXT: packh a3, a6, a7 +; RVA22U64-PACK-NEXT: packh a4, t0, a4 +; RVA22U64-PACK-NEXT: packh a1, a1, a2 +; RVA22U64-PACK-NEXT: packh a0, a5, a0 +; RVA22U64-PACK-NEXT: packw a2, a3, a4 +; RVA22U64-PACK-NEXT: packh a3, a0, a0 +; RVA22U64-PACK-NEXT: packw a3, a3, a3 +; RVA22U64-PACK-NEXT: packw a0, a1, a0 +; RVA22U64-PACK-NEXT: pack a0, a2, a0 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-PACK-NEXT: vmv.v.x v8, a0 -; RVA22U64-PACK-NEXT: packh a0, a0, a0 -; RVA22U64-PACK-NEXT: packw a0, a0, a0 -; RVA22U64-PACK-NEXT: pack a0, a0, a0 +; RVA22U64-PACK-NEXT: pack a0, a3, a3 ; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-PACK-NEXT: ret ; @@ -2530,54 +2504,53 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV32-ONLY-NEXT: lbu a3, 44(a0) ; RV32-ONLY-NEXT: lbu a4, 55(a0) ; RV32-ONLY-NEXT: lbu a5, 75(a0) +; RV32-ONLY-NEXT: li a6, 255 +; RV32-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-ONLY-NEXT: vmv.s.x v0, a6 ; RV32-ONLY-NEXT: lbu a6, 82(a0) ; RV32-ONLY-NEXT: lbu a7, 93(a0) ; RV32-ONLY-NEXT: lbu t0, 105(a0) ; RV32-ONLY-NEXT: lbu a0, 161(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV32-ONLY-NEXT: vmv.v.x v8, a2 +; RV32-ONLY-NEXT: vmv.v.x v9, a6 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a7 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t0 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a1 -; RV32-ONLY-NEXT: vslide1down.vx v9, v8, a5 -; RV32-ONLY-NEXT: vmv.v.x v8, a6 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t0 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV32-ONLY-NEXT: li a0, 255 -; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-ONLY-NEXT: vmv.s.x v0, a0 -; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 4 -; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a0 +; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a5 +; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 4 +; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, 
v0.t ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v16i8_undef_edges: ; RV32VB: # %bb.0: -; RV32VB-NEXT: lbu a1, 55(a0) -; RV32VB-NEXT: lbu a2, 31(a0) -; RV32VB-NEXT: lbu a3, 44(a0) -; RV32VB-NEXT: lbu a4, 623(a0) -; RV32VB-NEXT: lbu a5, 75(a0) -; RV32VB-NEXT: slli a1, a1, 8 +; RV32VB-NEXT: lbu a1, 623(a0) +; RV32VB-NEXT: lbu a2, 55(a0) +; RV32VB-NEXT: lbu a3, 75(a0) +; RV32VB-NEXT: lbu a4, 31(a0) +; RV32VB-NEXT: lbu a5, 44(a0) +; RV32VB-NEXT: slli a2, a2, 8 +; RV32VB-NEXT: slli a1, a1, 16 +; RV32VB-NEXT: slli a3, a3, 24 +; RV32VB-NEXT: or a2, a5, a2 +; RV32VB-NEXT: lbu a5, 82(a0) +; RV32VB-NEXT: lbu a6, 93(a0) ; RV32VB-NEXT: or a1, a3, a1 -; RV32VB-NEXT: slli a4, a4, 16 -; RV32VB-NEXT: slli a5, a5, 24 -; RV32VB-NEXT: or a4, a5, a4 -; RV32VB-NEXT: lbu a3, 82(a0) -; RV32VB-NEXT: lbu a5, 93(a0) -; RV32VB-NEXT: or a1, a1, a4 -; RV32VB-NEXT: lbu a4, 105(a0) +; RV32VB-NEXT: lbu a3, 105(a0) ; RV32VB-NEXT: lbu a0, 161(a0) -; RV32VB-NEXT: slli a5, a5, 8 -; RV32VB-NEXT: or a3, a3, a5 -; RV32VB-NEXT: slli a2, a2, 24 -; RV32VB-NEXT: slli a4, a4, 16 +; RV32VB-NEXT: slli a6, a6, 8 +; RV32VB-NEXT: or a5, a5, a6 +; RV32VB-NEXT: slli a3, a3, 16 ; RV32VB-NEXT: slli a0, a0, 24 -; RV32VB-NEXT: or a0, a0, a4 -; RV32VB-NEXT: or a0, a3, a0 +; RV32VB-NEXT: or a0, a0, a3 +; RV32VB-NEXT: slli a4, a4, 24 +; RV32VB-NEXT: or a1, a2, a1 +; RV32VB-NEXT: or a0, a5, a0 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32VB-NEXT: vmv.v.x v8, a2 +; RV32VB-NEXT: vmv.v.x v8, a4 ; RV32VB-NEXT: vslide1down.vx v8, v8, a1 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-NEXT: vslide1down.vx v8, v8, zero @@ -2590,18 +2563,18 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV32VB-PACK-NEXT: lbu a3, 44(a0) ; RV32VB-PACK-NEXT: lbu a4, 55(a0) ; RV32VB-PACK-NEXT: lbu a5, 75(a0) -; RV32VB-PACK-NEXT: packh a2, a0, a2 +; RV32VB-PACK-NEXT: lbu a6, 82(a0) +; RV32VB-PACK-NEXT: lbu a7, 93(a0) +; RV32VB-PACK-NEXT: lbu t0, 105(a0) +; RV32VB-PACK-NEXT: lbu a0, 161(a0) ; RV32VB-PACK-NEXT: packh a3, a3, a4 ; RV32VB-PACK-NEXT: packh a1, a1, a5 -; RV32VB-PACK-NEXT: lbu a4, 82(a0) -; RV32VB-PACK-NEXT: lbu a5, 93(a0) -; RV32VB-PACK-NEXT: pack a1, a3, a1 -; RV32VB-PACK-NEXT: lbu a3, 105(a0) -; RV32VB-PACK-NEXT: lbu a0, 161(a0) -; RV32VB-PACK-NEXT: packh a4, a4, a5 +; RV32VB-PACK-NEXT: packh a4, a6, a7 +; RV32VB-PACK-NEXT: packh a0, t0, a0 ; RV32VB-PACK-NEXT: packh a5, a0, a0 +; RV32VB-PACK-NEXT: packh a2, a0, a2 ; RV32VB-PACK-NEXT: pack a2, a5, a2 -; RV32VB-PACK-NEXT: packh a0, a3, a0 +; RV32VB-PACK-NEXT: pack a1, a3, a1 ; RV32VB-PACK-NEXT: pack a0, a4, a0 ; RV32VB-PACK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-PACK-NEXT: vmv.v.x v8, a2 @@ -2618,84 +2591,83 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV64V-ONLY-NEXT: lbu a3, 44(a0) ; RV64V-ONLY-NEXT: lbu a4, 55(a0) ; RV64V-ONLY-NEXT: lbu a5, 75(a0) +; RV64V-ONLY-NEXT: li a6, 255 +; RV64V-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.s.x v0, a6 ; RV64V-ONLY-NEXT: lbu a6, 82(a0) ; RV64V-ONLY-NEXT: lbu a7, 93(a0) ; RV64V-ONLY-NEXT: lbu t0, 105(a0) ; RV64V-ONLY-NEXT: lbu a0, 161(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vmv.v.x v8, a2 +; RV64V-ONLY-NEXT: vmv.v.x v9, a6 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a7 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t0 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a1 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v8, a5 -; 
RV64V-ONLY-NEXT: vmv.v.x v8, a6 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t0 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV64V-ONLY-NEXT: li a0, 255 -; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64V-ONLY-NEXT: vmv.s.x v0, a0 -; RV64V-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 4 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a5 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 4 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v16i8_undef_edges: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: lbu a1, 31(a0) +; RVA22U64-NEXT: lbu a6, 31(a0) ; RVA22U64-NEXT: lbu a2, 44(a0) ; RVA22U64-NEXT: lbu a3, 55(a0) ; RVA22U64-NEXT: lbu a4, 623(a0) ; RVA22U64-NEXT: lbu a5, 75(a0) ; RVA22U64-NEXT: slli a2, a2, 32 ; RVA22U64-NEXT: slli a3, a3, 40 -; RVA22U64-NEXT: or a2, a2, a3 -; RVA22U64-NEXT: slli a1, a1, 24 ; RVA22U64-NEXT: slli a4, a4, 48 ; RVA22U64-NEXT: slli a5, a5, 56 -; RVA22U64-NEXT: or a4, a4, a5 -; RVA22U64-NEXT: or a2, a2, a4 +; RVA22U64-NEXT: or a2, a2, a3 ; RVA22U64-NEXT: lbu a3, 82(a0) -; RVA22U64-NEXT: lbu a4, 93(a0) -; RVA22U64-NEXT: add.uw a1, a1, a2 -; RVA22U64-NEXT: lbu a2, 105(a0) +; RVA22U64-NEXT: lbu a1, 93(a0) +; RVA22U64-NEXT: or a4, a4, a5 +; RVA22U64-NEXT: lbu a5, 105(a0) ; RVA22U64-NEXT: lbu a0, 161(a0) -; RVA22U64-NEXT: slli a4, a4, 8 -; RVA22U64-NEXT: or a3, a3, a4 -; RVA22U64-NEXT: slli a2, a2, 16 +; RVA22U64-NEXT: slli a1, a1, 8 +; RVA22U64-NEXT: or a1, a1, a3 +; RVA22U64-NEXT: slli a5, a5, 16 ; RVA22U64-NEXT: slli a0, a0, 24 -; RVA22U64-NEXT: or a0, a0, a2 -; RVA22U64-NEXT: or a0, a0, a3 +; RVA22U64-NEXT: or a0, a0, a5 +; RVA22U64-NEXT: slli a6, a6, 24 +; RVA22U64-NEXT: or a2, a2, a4 +; RVA22U64-NEXT: add.uw a2, a6, a2 +; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-NEXT: vmv.v.x v8, a1 +; RVA22U64-NEXT: vmv.v.x v8, a2 ; RVA22U64-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-NEXT: ret ; ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_edges: ; RVA22U64-PACK: # %bb.0: -; RVA22U64-PACK-NEXT: lbu a1, 623(a0) -; RVA22U64-PACK-NEXT: lbu a2, 31(a0) -; RVA22U64-PACK-NEXT: lbu a3, 44(a0) +; RVA22U64-PACK-NEXT: lbu a7, 623(a0) +; RVA22U64-PACK-NEXT: lbu a6, 31(a0) +; RVA22U64-PACK-NEXT: lbu t0, 44(a0) ; RVA22U64-PACK-NEXT: lbu a4, 55(a0) ; RVA22U64-PACK-NEXT: lbu a5, 75(a0) -; RVA22U64-PACK-NEXT: packh a6, a0, a2 -; RVA22U64-PACK-NEXT: packh a2, a0, a0 -; RVA22U64-PACK-NEXT: packh a3, a3, a4 -; RVA22U64-PACK-NEXT: packh a1, a1, a5 -; RVA22U64-PACK-NEXT: packw a7, a3, a1 -; RVA22U64-PACK-NEXT: lbu a3, 82(a0) -; RVA22U64-PACK-NEXT: lbu a4, 93(a0) -; RVA22U64-PACK-NEXT: lbu a5, 105(a0) +; RVA22U64-PACK-NEXT: lbu a2, 82(a0) +; RVA22U64-PACK-NEXT: lbu a1, 93(a0) +; RVA22U64-PACK-NEXT: lbu a3, 105(a0) ; RVA22U64-PACK-NEXT: lbu a0, 161(a0) -; RVA22U64-PACK-NEXT: packw a1, a2, a6 -; RVA22U64-PACK-NEXT: pack a1, a1, a7 -; RVA22U64-PACK-NEXT: packh a3, a3, a4 -; RVA22U64-PACK-NEXT: packh a0, a5, a0 -; RVA22U64-PACK-NEXT: packw a0, a3, a0 +; RVA22U64-PACK-NEXT: packh a4, t0, a4 +; RVA22U64-PACK-NEXT: packh a5, a7, a5 +; RVA22U64-PACK-NEXT: packh a1, a2, a1 +; RVA22U64-PACK-NEXT: packh a0, a3, a0 +; RVA22U64-PACK-NEXT: packh a2, a0, a0 +; RVA22U64-PACK-NEXT: packh a3, a0, a6 +; RVA22U64-PACK-NEXT: packw a3, a2, a3 +; RVA22U64-PACK-NEXT: packw a2, a2, a2 +; RVA22U64-PACK-NEXT: packw a4, a4, a5 +; RVA22U64-PACK-NEXT: 
packw a0, a1, a0 +; RVA22U64-PACK-NEXT: pack a1, a3, a4 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-PACK-NEXT: vmv.v.x v8, a1 -; RVA22U64-PACK-NEXT: packw a1, a2, a2 -; RVA22U64-PACK-NEXT: pack a0, a0, a1 +; RVA22U64-PACK-NEXT: pack a0, a0, a2 ; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-PACK-NEXT: ret ; @@ -2706,26 +2678,25 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV64ZVE32-NEXT: lbu a3, 44(a0) ; RV64ZVE32-NEXT: lbu a4, 55(a0) ; RV64ZVE32-NEXT: lbu a5, 75(a0) +; RV64ZVE32-NEXT: li a6, 255 +; RV64ZVE32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32-NEXT: vmv.s.x v0, a6 ; RV64ZVE32-NEXT: lbu a6, 82(a0) ; RV64ZVE32-NEXT: lbu a7, 93(a0) ; RV64ZVE32-NEXT: lbu t0, 105(a0) ; RV64ZVE32-NEXT: lbu a0, 161(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64ZVE32-NEXT: vmv.v.x v8, a2 +; RV64ZVE32-NEXT: vmv.v.x v9, a6 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a7 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t0 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a1 -; RV64ZVE32-NEXT: vslide1down.vx v9, v8, a5 -; RV64ZVE32-NEXT: vmv.v.x v8, a6 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t0 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32-NEXT: li a0, 255 -; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32-NEXT: vmv.s.x v0, a0 -; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a0 +; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a5 +; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 4 +; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64ZVE32-NEXT: ret %p4 = getelementptr i8, ptr %p, i32 31 %p5 = getelementptr i8, ptr %p, i32 44 @@ -2770,58 +2741,57 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV32-ONLY-NEXT: lbu a6, 82(a0) ; RV32-ONLY-NEXT: lbu a7, 93(a0) ; RV32-ONLY-NEXT: lbu t0, 124(a0) +; RV32-ONLY-NEXT: li t1, 255 +; RV32-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-ONLY-NEXT: vmv.s.x v0, t1 ; RV32-ONLY-NEXT: lbu t1, 144(a0) ; RV32-ONLY-NEXT: lbu a0, 154(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV32-ONLY-NEXT: vmv.v.x v8, a1 +; RV32-ONLY-NEXT: vmv.v.x v9, a6 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a7 ; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 2 +; RV32-ONLY-NEXT: vslidedown.vi v9, v9, 2 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t0 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 +; RV32-ONLY-NEXT: vslidedown.vi v9, v9, 1 ; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 1 -; RV32-ONLY-NEXT: vslide1down.vx v9, v8, a5 -; RV32-ONLY-NEXT: vmv.v.x v8, a6 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 2 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t0 -; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 1 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t1 -; RV32-ONLY-NEXT: li a1, 255 -; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-ONLY-NEXT: vmv.s.x v0, a1 -; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t1 +; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a5 +; RV32-ONLY-NEXT: 
vslide1down.vx v8, v9, a0 +; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v16i8_loads_undef_scattered: ; RV32VB: # %bb.0: ; RV32VB-NEXT: lbu a1, 1(a0) ; RV32VB-NEXT: lbu a2, 0(a0) -; RV32VB-NEXT: lbu a3, 55(a0) -; RV32VB-NEXT: lbu a4, 44(a0) +; RV32VB-NEXT: lbu a3, 44(a0) +; RV32VB-NEXT: lbu a4, 55(a0) ; RV32VB-NEXT: slli a1, a1, 8 ; RV32VB-NEXT: or a1, a2, a1 -; RV32VB-NEXT: slli a3, a3, 8 -; RV32VB-NEXT: or a3, a4, a3 ; RV32VB-NEXT: lbu a2, 75(a0) -; RV32VB-NEXT: lbu a4, 82(a0) -; RV32VB-NEXT: lbu a5, 93(a0) -; RV32VB-NEXT: lbu a6, 124(a0) -; RV32VB-NEXT: slli a2, a2, 24 -; RV32VB-NEXT: or a2, a3, a2 -; RV32VB-NEXT: lbu a3, 144(a0) +; RV32VB-NEXT: lbu a5, 82(a0) +; RV32VB-NEXT: lbu a6, 93(a0) +; RV32VB-NEXT: lbu a7, 124(a0) +; RV32VB-NEXT: slli a4, a4, 8 +; RV32VB-NEXT: or a3, a3, a4 +; RV32VB-NEXT: lbu a4, 144(a0) ; RV32VB-NEXT: lbu a0, 154(a0) -; RV32VB-NEXT: slli a5, a5, 8 -; RV32VB-NEXT: or a4, a4, a5 -; RV32VB-NEXT: slli a3, a3, 16 +; RV32VB-NEXT: slli a6, a6, 8 +; RV32VB-NEXT: or a5, a5, a6 +; RV32VB-NEXT: slli a4, a4, 16 ; RV32VB-NEXT: slli a0, a0, 24 -; RV32VB-NEXT: or a0, a0, a3 -; RV32VB-NEXT: or a0, a6, a0 +; RV32VB-NEXT: or a0, a0, a4 +; RV32VB-NEXT: slli a2, a2, 24 +; RV32VB-NEXT: or a2, a3, a2 +; RV32VB-NEXT: or a0, a7, a0 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a1 ; RV32VB-NEXT: vslide1down.vx v8, v8, a2 -; RV32VB-NEXT: vslide1down.vx v8, v8, a4 +; RV32VB-NEXT: vslide1down.vx v8, v8, a5 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-NEXT: ret ; @@ -2831,26 +2801,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV32VB-PACK-NEXT: lbu a2, 1(a0) ; RV32VB-PACK-NEXT: lbu a3, 44(a0) ; RV32VB-PACK-NEXT: lbu a4, 55(a0) +; RV32VB-PACK-NEXT: lbu a5, 75(a0) +; RV32VB-PACK-NEXT: lbu a6, 82(a0) +; RV32VB-PACK-NEXT: lbu a7, 93(a0) ; RV32VB-PACK-NEXT: packh a1, a1, a2 -; RV32VB-PACK-NEXT: packh a2, a3, a4 -; RV32VB-PACK-NEXT: lbu a3, 75(a0) -; RV32VB-PACK-NEXT: lbu a4, 82(a0) -; RV32VB-PACK-NEXT: lbu a5, 93(a0) -; RV32VB-PACK-NEXT: lbu a6, 124(a0) -; RV32VB-PACK-NEXT: lbu a7, 144(a0) -; RV32VB-PACK-NEXT: lbu a0, 154(a0) -; RV32VB-PACK-NEXT: packh a3, a0, a3 -; RV32VB-PACK-NEXT: pack a2, a2, a3 -; RV32VB-PACK-NEXT: packh a3, a4, a5 -; RV32VB-PACK-NEXT: packh a0, a7, a0 -; RV32VB-PACK-NEXT: packh a4, a6, a0 -; RV32VB-PACK-NEXT: pack a0, a4, a0 -; RV32VB-PACK-NEXT: packh a4, a0, a0 -; RV32VB-PACK-NEXT: pack a1, a1, a4 +; RV32VB-PACK-NEXT: lbu a2, 144(a0) +; RV32VB-PACK-NEXT: lbu t0, 154(a0) +; RV32VB-PACK-NEXT: packh a3, a3, a4 +; RV32VB-PACK-NEXT: lbu a0, 124(a0) +; RV32VB-PACK-NEXT: packh a4, a6, a7 +; RV32VB-PACK-NEXT: packh a2, a2, t0 +; RV32VB-PACK-NEXT: packh a5, a0, a5 +; RV32VB-PACK-NEXT: pack a3, a3, a5 +; RV32VB-PACK-NEXT: packh a5, a0, a0 +; RV32VB-PACK-NEXT: packh a0, a0, a0 +; RV32VB-PACK-NEXT: pack a0, a0, a2 +; RV32VB-PACK-NEXT: pack a1, a1, a5 ; RV32VB-PACK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-PACK-NEXT: vmv.v.x v8, a1 -; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a2 -; RV32VB-PACK-NEXT: pack a1, a3, a4 +; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a3 +; RV32VB-PACK-NEXT: pack a1, a4, a5 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a1 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-PACK-NEXT: ret @@ -2865,28 +2835,27 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV64V-ONLY-NEXT: lbu a6, 82(a0) ; RV64V-ONLY-NEXT: lbu a7, 93(a0) ; RV64V-ONLY-NEXT: lbu t0, 124(a0) +; RV64V-ONLY-NEXT: li t1, 255 +; RV64V-ONLY-NEXT: vsetivli zero, 
1, e16, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.s.x v0, t1 ; RV64V-ONLY-NEXT: lbu t1, 144(a0) ; RV64V-ONLY-NEXT: lbu a0, 154(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vmv.v.x v8, a1 +; RV64V-ONLY-NEXT: vmv.v.x v9, a6 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a7 ; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 2 +; RV64V-ONLY-NEXT: vslidedown.vi v9, v9, 2 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t0 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 +; RV64V-ONLY-NEXT: vslidedown.vi v9, v9, 1 ; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 1 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v8, a5 -; RV64V-ONLY-NEXT: vmv.v.x v8, a6 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 2 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t0 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 1 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t1 -; RV64V-ONLY-NEXT: li a1, 255 -; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64V-ONLY-NEXT: vmv.s.x v0, a1 -; RV64V-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t1 +; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a5 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a0 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v16i8_loads_undef_scattered: @@ -2897,26 +2866,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RVA22U64-NEXT: lbu a4, 55(a0) ; RVA22U64-NEXT: slli a1, a1, 8 ; RVA22U64-NEXT: or a6, a2, a1 +; RVA22U64-NEXT: lbu a7, 75(a0) +; RVA22U64-NEXT: lbu a5, 82(a0) +; RVA22U64-NEXT: lbu a1, 93(a0) +; RVA22U64-NEXT: lbu a2, 124(a0) ; RVA22U64-NEXT: slli a3, a3, 32 ; RVA22U64-NEXT: slli a4, a4, 40 ; RVA22U64-NEXT: or a3, a3, a4 -; RVA22U64-NEXT: lbu a2, 75(a0) -; RVA22U64-NEXT: lbu a4, 82(a0) -; RVA22U64-NEXT: lbu a5, 93(a0) -; RVA22U64-NEXT: lbu a1, 124(a0) -; RVA22U64-NEXT: slli a2, a2, 56 -; RVA22U64-NEXT: or a2, a2, a3 -; RVA22U64-NEXT: or a2, a6, a2 -; RVA22U64-NEXT: lbu a3, 144(a0) +; RVA22U64-NEXT: lbu a4, 144(a0) ; RVA22U64-NEXT: lbu a0, 154(a0) -; RVA22U64-NEXT: slli a5, a5, 8 -; RVA22U64-NEXT: or a4, a4, a5 -; RVA22U64-NEXT: slli a3, a3, 48 +; RVA22U64-NEXT: slli a1, a1, 8 +; RVA22U64-NEXT: or a1, a1, a5 +; RVA22U64-NEXT: slli a4, a4, 48 ; RVA22U64-NEXT: slli a0, a0, 56 -; RVA22U64-NEXT: or a0, a0, a3 -; RVA22U64-NEXT: slli a1, a1, 32 -; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: or a0, a0, a4 +; RVA22U64-NEXT: slli a7, a7, 56 +; RVA22U64-NEXT: or a3, a7, a3 +; RVA22U64-NEXT: slli a2, a2, 32 +; RVA22U64-NEXT: or a0, a0, a2 +; RVA22U64-NEXT: or a2, a6, a3 +; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-NEXT: vmv.v.x v8, a2 ; RVA22U64-NEXT: vslide1down.vx v8, v8, a0 @@ -2926,26 +2895,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RVA22U64-PACK: # %bb.0: ; RVA22U64-PACK-NEXT: lbu a1, 0(a0) ; RVA22U64-PACK-NEXT: lbu a2, 1(a0) -; RVA22U64-PACK-NEXT: lbu a3, 44(a0) -; RVA22U64-PACK-NEXT: lbu a4, 55(a0) -; RVA22U64-PACK-NEXT: packh a6, a1, a2 -; RVA22U64-PACK-NEXT: packh a2, a3, a4 -; RVA22U64-PACK-NEXT: lbu a3, 75(a0) -; RVA22U64-PACK-NEXT: lbu a7, 82(a0) -; RVA22U64-PACK-NEXT: lbu a5, 93(a0) -; RVA22U64-PACK-NEXT: lbu t0, 124(a0) -; RVA22U64-PACK-NEXT: packh a3, a0, a3 -; RVA22U64-PACK-NEXT: packw a2, a2, 
a3 -; RVA22U64-PACK-NEXT: packh a3, a0, a0 -; RVA22U64-PACK-NEXT: lbu a4, 144(a0) -; RVA22U64-PACK-NEXT: lbu a0, 154(a0) -; RVA22U64-PACK-NEXT: packw a1, a6, a3 -; RVA22U64-PACK-NEXT: pack a1, a1, a2 -; RVA22U64-PACK-NEXT: packh a2, a7, a5 -; RVA22U64-PACK-NEXT: packh a0, a4, a0 -; RVA22U64-PACK-NEXT: packh a4, t0, a0 -; RVA22U64-PACK-NEXT: packw a0, a4, a0 -; RVA22U64-PACK-NEXT: packw a2, a2, a3 +; RVA22U64-PACK-NEXT: lbu a7, 44(a0) +; RVA22U64-PACK-NEXT: lbu t0, 55(a0) +; RVA22U64-PACK-NEXT: lbu a6, 75(a0) +; RVA22U64-PACK-NEXT: lbu a5, 82(a0) +; RVA22U64-PACK-NEXT: lbu a3, 93(a0) +; RVA22U64-PACK-NEXT: packh t1, a1, a2 +; RVA22U64-PACK-NEXT: lbu a2, 144(a0) +; RVA22U64-PACK-NEXT: lbu a4, 154(a0) +; RVA22U64-PACK-NEXT: packh a1, a7, t0 +; RVA22U64-PACK-NEXT: lbu a0, 124(a0) +; RVA22U64-PACK-NEXT: packh a3, a5, a3 +; RVA22U64-PACK-NEXT: packh a2, a2, a4 +; RVA22U64-PACK-NEXT: packh a4, a0, a6 +; RVA22U64-PACK-NEXT: packw a1, a1, a4 +; RVA22U64-PACK-NEXT: packh a4, a0, a0 +; RVA22U64-PACK-NEXT: packh a0, a0, a0 +; RVA22U64-PACK-NEXT: packw a5, t1, a4 +; RVA22U64-PACK-NEXT: packw a0, a0, a2 +; RVA22U64-PACK-NEXT: packw a2, a3, a4 +; RVA22U64-PACK-NEXT: pack a1, a5, a1 ; RVA22U64-PACK-NEXT: pack a0, a2, a0 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-PACK-NEXT: vmv.v.x v8, a1 @@ -2962,28 +2931,27 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV64ZVE32-NEXT: lbu a6, 82(a0) ; RV64ZVE32-NEXT: lbu a7, 93(a0) ; RV64ZVE32-NEXT: lbu t0, 124(a0) +; RV64ZVE32-NEXT: li t1, 255 +; RV64ZVE32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32-NEXT: vmv.s.x v0, t1 ; RV64ZVE32-NEXT: lbu t1, 144(a0) ; RV64ZVE32-NEXT: lbu a0, 154(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64ZVE32-NEXT: vmv.v.x v8, a1 +; RV64ZVE32-NEXT: vmv.v.x v9, a6 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a7 ; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32-NEXT: vslidedown.vi v9, v9, 2 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t0 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32-NEXT: vslide1down.vx v9, v8, a5 -; RV64ZVE32-NEXT: vmv.v.x v8, a6 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t0 -; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t1 -; RV64ZVE32-NEXT: li a1, 255 -; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32-NEXT: vmv.s.x v0, a1 -; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t1 +; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a5 +; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64ZVE32-NEXT: ret %p2 = getelementptr i8, ptr %p, i32 1 %p3 = getelementptr i8, ptr %p, i32 22 @@ -3042,91 +3010,91 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RV32-ONLY: # %bb.0: ; RV32-ONLY-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV32-ONLY-NEXT: vmv.v.x v8, a0 +; RV32-ONLY-NEXT: vmv.v.x v9, a4 +; RV32-ONLY-NEXT: vmv.v.i v0, 15 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a1 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a5 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 -; RV32-ONLY-NEXT: vslide1down.vx 
v9, v8, a3 -; RV32-ONLY-NEXT: vmv.v.x v8, a4 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a6 -; RV32-ONLY-NEXT: vmv.v.i v0, 15 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a6 +; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a3 +; RV32-ONLY-NEXT: vslide1down.vx v8, v9, a7 +; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v8i8_pack: ; RV32VB: # %bb.0: ; RV32VB-NEXT: slli a7, a7, 24 ; RV32VB-NEXT: andi a6, a6, 255 -; RV32VB-NEXT: slli a6, a6, 16 -; RV32VB-NEXT: or a6, a7, a6 ; RV32VB-NEXT: andi a4, a4, 255 ; RV32VB-NEXT: andi a5, a5, 255 -; RV32VB-NEXT: slli a5, a5, 8 -; RV32VB-NEXT: or a4, a4, a5 -; RV32VB-NEXT: or a4, a4, a6 ; RV32VB-NEXT: slli a3, a3, 24 ; RV32VB-NEXT: andi a2, a2, 255 -; RV32VB-NEXT: slli a2, a2, 16 -; RV32VB-NEXT: or a2, a3, a2 ; RV32VB-NEXT: andi a0, a0, 255 ; RV32VB-NEXT: andi a1, a1, 255 +; RV32VB-NEXT: slli a6, a6, 16 +; RV32VB-NEXT: slli a5, a5, 8 +; RV32VB-NEXT: slli a2, a2, 16 ; RV32VB-NEXT: slli a1, a1, 8 +; RV32VB-NEXT: or a6, a7, a6 +; RV32VB-NEXT: or a4, a4, a5 +; RV32VB-NEXT: or a2, a3, a2 ; RV32VB-NEXT: or a0, a0, a1 +; RV32VB-NEXT: or a1, a4, a6 ; RV32VB-NEXT: or a0, a0, a2 ; RV32VB-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a0 -; RV32VB-NEXT: vslide1down.vx v8, v8, a4 +; RV32VB-NEXT: vslide1down.vx v8, v8, a1 ; RV32VB-NEXT: ret ; ; RV32VB-PACK-LABEL: buildvec_v8i8_pack: ; RV32VB-PACK: # %bb.0: ; RV32VB-PACK-NEXT: packh a6, a6, a7 ; RV32VB-PACK-NEXT: packh a4, a4, a5 -; RV32VB-PACK-NEXT: pack a4, a4, a6 ; RV32VB-PACK-NEXT: packh a2, a2, a3 ; RV32VB-PACK-NEXT: packh a0, a0, a1 +; RV32VB-PACK-NEXT: pack a1, a4, a6 ; RV32VB-PACK-NEXT: pack a0, a0, a2 ; RV32VB-PACK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32VB-PACK-NEXT: vmv.v.x v8, a0 -; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a4 +; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a1 ; RV32VB-PACK-NEXT: ret ; ; RV64V-ONLY-LABEL: buildvec_v8i8_pack: ; RV64V-ONLY: # %bb.0: ; RV64V-ONLY-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64V-ONLY-NEXT: vmv.v.x v8, a0 +; RV64V-ONLY-NEXT: vmv.v.x v9, a4 +; RV64V-ONLY-NEXT: vmv.v.i v0, 15 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a1 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a5 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v8, a3 -; RV64V-ONLY-NEXT: vmv.v.x v8, a4 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a6 -; RV64V-ONLY-NEXT: vmv.v.i v0, 15 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a6 +; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a3 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a7 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v8i8_pack: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: andi a4, a4, 255 -; RVA22U64-NEXT: slli a4, a4, 32 +; RVA22U64-NEXT: andi t0, a4, 255 ; RVA22U64-NEXT: andi a5, a5, 255 -; RVA22U64-NEXT: slli a5, a5, 40 -; RVA22U64-NEXT: or a4, a4, a5 ; RVA22U64-NEXT: slli a7, a7, 56 -; RVA22U64-NEXT: andi a5, a6, 255 -; RVA22U64-NEXT: slli a5, a5, 48 -; RVA22U64-NEXT: or a5, a7, a5 -; RVA22U64-NEXT: or a4, a4, a5 +; RVA22U64-NEXT: andi a4, a6, 255 ; RVA22U64-NEXT: andi a2, a2, 255 -; RVA22U64-NEXT: slli a2, a2, 16 ; RVA22U64-NEXT: andi a3, a3, 255 -; RVA22U64-NEXT: slli a3, a3, 24 -; RVA22U64-NEXT: or a2, a2, a3 ; RVA22U64-NEXT: andi a0, a0, 
255 ; RVA22U64-NEXT: andi a1, a1, 255 +; RVA22U64-NEXT: slli t0, t0, 32 +; RVA22U64-NEXT: slli a5, a5, 40 +; RVA22U64-NEXT: slli a4, a4, 48 +; RVA22U64-NEXT: slli a2, a2, 16 +; RVA22U64-NEXT: slli a3, a3, 24 ; RVA22U64-NEXT: slli a1, a1, 8 +; RVA22U64-NEXT: or a5, a5, t0 +; RVA22U64-NEXT: or a4, a7, a4 +; RVA22U64-NEXT: or a2, a2, a3 ; RVA22U64-NEXT: or a0, a0, a1 +; RVA22U64-NEXT: or a4, a4, a5 ; RVA22U64-NEXT: or a0, a0, a2 ; RVA22U64-NEXT: or a0, a0, a4 ; RVA22U64-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -3137,11 +3105,11 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RVA22U64-PACK: # %bb.0: ; RVA22U64-PACK-NEXT: packh a6, a6, a7 ; RVA22U64-PACK-NEXT: packh a4, a4, a5 -; RVA22U64-PACK-NEXT: packw a4, a4, a6 ; RVA22U64-PACK-NEXT: packh a2, a2, a3 ; RVA22U64-PACK-NEXT: packh a0, a0, a1 +; RVA22U64-PACK-NEXT: packw a1, a4, a6 ; RVA22U64-PACK-NEXT: packw a0, a0, a2 -; RVA22U64-PACK-NEXT: pack a0, a0, a4 +; RVA22U64-PACK-NEXT: pack a0, a0, a1 ; RVA22U64-PACK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RVA22U64-PACK-NEXT: vmv.s.x v8, a0 ; RVA22U64-PACK-NEXT: ret @@ -3150,15 +3118,15 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RV64ZVE32: # %bb.0: ; RV64ZVE32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64ZVE32-NEXT: vmv.v.x v8, a0 +; RV64ZVE32-NEXT: vmv.v.x v9, a4 +; RV64ZVE32-NEXT: vmv.v.i v0, 15 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a5 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32-NEXT: vslide1down.vx v9, v8, a3 -; RV64ZVE32-NEXT: vmv.v.x v8, a4 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32-NEXT: vmv.v.i v0, 15 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a6 +; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a3 +; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a7 +; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32-NEXT: ret %v1 = insertelement <8 x i8> poison, i8 %e1, i32 0 %v2 = insertelement <8 x i8> %v1, i8 %e2, i32 1 @@ -3188,32 +3156,32 @@ define <6 x i8> @buildvec_v6i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RV32VB: # %bb.0: ; RV32VB-NEXT: slli a3, a3, 24 ; RV32VB-NEXT: andi a2, a2, 255 -; RV32VB-NEXT: slli a2, a2, 16 -; RV32VB-NEXT: or a2, a3, a2 ; RV32VB-NEXT: andi a0, a0, 255 ; RV32VB-NEXT: andi a1, a1, 255 +; RV32VB-NEXT: andi a4, a4, 255 +; RV32VB-NEXT: andi a5, a5, 255 +; RV32VB-NEXT: slli a2, a2, 16 ; RV32VB-NEXT: slli a1, a1, 8 +; RV32VB-NEXT: slli a5, a5, 8 +; RV32VB-NEXT: or a2, a3, a2 ; RV32VB-NEXT: or a0, a0, a1 ; RV32VB-NEXT: or a0, a0, a2 -; RV32VB-NEXT: andi a1, a4, 255 -; RV32VB-NEXT: andi a2, a5, 255 -; RV32VB-NEXT: slli a2, a2, 8 -; RV32VB-NEXT: or a1, a1, a2 +; RV32VB-NEXT: or a4, a4, a5 ; RV32VB-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a0 -; RV32VB-NEXT: vslide1down.vx v8, v8, a1 +; RV32VB-NEXT: vslide1down.vx v8, v8, a4 ; RV32VB-NEXT: ret ; ; RV32VB-PACK-LABEL: buildvec_v6i8_pack: ; RV32VB-PACK: # %bb.0: ; RV32VB-PACK-NEXT: packh a2, a2, a3 ; RV32VB-PACK-NEXT: packh a0, a0, a1 -; RV32VB-PACK-NEXT: pack a0, a0, a2 ; RV32VB-PACK-NEXT: packh a1, a4, a5 +; RV32VB-PACK-NEXT: packh a3, a0, a0 +; RV32VB-PACK-NEXT: pack a0, a0, a2 ; RV32VB-PACK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32VB-PACK-NEXT: vmv.v.x v8, a0 -; RV32VB-PACK-NEXT: packh a0, a0, a0 -; RV32VB-PACK-NEXT: pack a0, a1, a0 +; RV32VB-PACK-NEXT: pack a0, a1, a3 ; RV32VB-PACK-NEXT: 
vslide1down.vx v8, v8, a0 ; RV32VB-PACK-NEXT: ret ; @@ -3232,21 +3200,21 @@ define <6 x i8> @buildvec_v6i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RVA22U64-LABEL: buildvec_v6i8_pack: ; RVA22U64: # %bb.0: ; RVA22U64-NEXT: andi a2, a2, 255 -; RVA22U64-NEXT: slli a2, a2, 16 ; RVA22U64-NEXT: andi a3, a3, 255 -; RVA22U64-NEXT: slli a3, a3, 24 -; RVA22U64-NEXT: or a2, a2, a3 ; RVA22U64-NEXT: andi a0, a0, 255 ; RVA22U64-NEXT: andi a1, a1, 255 +; RVA22U64-NEXT: andi a4, a4, 255 +; RVA22U64-NEXT: andi a5, a5, 255 +; RVA22U64-NEXT: slli a2, a2, 16 +; RVA22U64-NEXT: slli a3, a3, 24 ; RVA22U64-NEXT: slli a1, a1, 8 +; RVA22U64-NEXT: slli a4, a4, 32 +; RVA22U64-NEXT: slli a5, a5, 40 +; RVA22U64-NEXT: or a2, a2, a3 ; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: or a0, a0, a2 -; RVA22U64-NEXT: andi a1, a4, 255 -; RVA22U64-NEXT: slli a1, a1, 32 -; RVA22U64-NEXT: andi a2, a5, 255 -; RVA22U64-NEXT: slli a2, a2, 40 -; RVA22U64-NEXT: or a1, a1, a2 -; RVA22U64-NEXT: or a0, a0, a1 +; RVA22U64-NEXT: or a4, a4, a5 +; RVA22U64-NEXT: or a0, a0, a4 ; RVA22U64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RVA22U64-NEXT: vmv.s.x v8, a0 ; RVA22U64-NEXT: ret @@ -3255,10 +3223,10 @@ define <6 x i8> @buildvec_v6i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RVA22U64-PACK: # %bb.0: ; RVA22U64-PACK-NEXT: packh a2, a2, a3 ; RVA22U64-PACK-NEXT: packh a0, a0, a1 -; RVA22U64-PACK-NEXT: packw a0, a0, a2 ; RVA22U64-PACK-NEXT: packh a1, a4, a5 -; RVA22U64-PACK-NEXT: packh a2, a0, a0 -; RVA22U64-PACK-NEXT: packw a1, a1, a2 +; RVA22U64-PACK-NEXT: packh a3, a0, a0 +; RVA22U64-PACK-NEXT: packw a0, a0, a2 +; RVA22U64-PACK-NEXT: packw a1, a1, a3 ; RVA22U64-PACK-NEXT: pack a0, a0, a1 ; RVA22U64-PACK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RVA22U64-PACK-NEXT: vmv.s.x v8, a0 @@ -3298,9 +3266,9 @@ define <4 x i16> @buildvec_v4i16_pack(i16 %e1, i16 %e2, i16 %e3, i16 %e4) { ; RV32VB: # %bb.0: ; RV32VB-NEXT: slli a3, a3, 16 ; RV32VB-NEXT: zext.h a2, a2 -; RV32VB-NEXT: or a2, a2, a3 ; RV32VB-NEXT: slli a1, a1, 16 ; RV32VB-NEXT: zext.h a0, a0 +; RV32VB-NEXT: or a2, a2, a3 ; RV32VB-NEXT: or a0, a0, a1 ; RV32VB-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a0 @@ -3329,11 +3297,11 @@ define <4 x i16> @buildvec_v4i16_pack(i16 %e1, i16 %e2, i16 %e3, i16 %e4) { ; RVA22U64: # %bb.0: ; RVA22U64-NEXT: slli a3, a3, 48 ; RVA22U64-NEXT: zext.h a2, a2 -; RVA22U64-NEXT: slli a2, a2, 32 -; RVA22U64-NEXT: or a2, a2, a3 ; RVA22U64-NEXT: zext.h a0, a0 ; RVA22U64-NEXT: zext.h a1, a1 +; RVA22U64-NEXT: slli a2, a2, 32 ; RVA22U64-NEXT: slli a1, a1, 16 +; RVA22U64-NEXT: or a2, a2, a3 ; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: or a0, a0, a2 ; RVA22U64-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -3455,3 +3423,5 @@ define <4 x i1> @buildvec_i1_splat(i1 %e1) { ret <4 x i1> %v4 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll index 6cab1bc218528..a25014295f9e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll @@ -63,8 +63,8 @@ define i8 @explode_8xi8(<8 x i8> %v) { ; CHECK-NEXT: vredxor.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a6, v8 ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, a6, a0 ; CHECK-NEXT: add a2, a2, a3 +; CHECK-NEXT: add a0, a6, a0 ; CHECK-NEXT: add a2, a2, a4 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: add a0, a0, a5 @@ -124,17 +124,17 @@ define i8 @explode_16xi8(<16 x i8> %v) { ; CHECK-NEXT: vredxor.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s t6, v8 ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, t6, a0 ; CHECK-NEXT: add a2, a2, a3 -; CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: add a5, a5, a6 -; CHECK-NEXT: add a5, a5, a7 -; CHECK-NEXT: add a5, a5, t0 -; CHECK-NEXT: add a0, a0, a5 ; CHECK-NEXT: add t1, t1, t2 +; CHECK-NEXT: add a0, t6, a0 +; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: add a5, a5, a7 ; CHECK-NEXT: add t1, t1, t3 +; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: add a5, a5, t0 ; CHECK-NEXT: add t1, t1, t4 +; CHECK-NEXT: add a0, a0, a5 ; CHECK-NEXT: add t1, t1, t5 ; CHECK-NEXT: add a0, a0, t1 ; CHECK-NEXT: ret @@ -233,8 +233,8 @@ define i16 @explode_8xi16(<8 x i16> %v) { ; CHECK-NEXT: vredxor.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a6, v8 ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, a6, a0 ; CHECK-NEXT: add a2, a2, a3 +; CHECK-NEXT: add a0, a6, a0 ; CHECK-NEXT: add a2, a2, a4 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: add a0, a0, a5 @@ -260,54 +260,54 @@ define i16 @explode_8xi16(<8 x i16> %v) { define i16 @explode_16xi16(<16 x i16> %v) { ; CHECK-LABEL: explode_16xi16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 8 ; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 3 +; CHECK-NEXT: vslidedown.vi v10, v8, 9 ; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 4 +; CHECK-NEXT: vslidedown.vi v10, v8, 10 ; CHECK-NEXT: vmv.x.s a2, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 5 +; CHECK-NEXT: vslidedown.vi v10, v8, 11 ; CHECK-NEXT: vmv.x.s a3, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 6 +; CHECK-NEXT: vslidedown.vi v10, v8, 12 ; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 7 +; CHECK-NEXT: vslidedown.vi v10, v8, 13 ; CHECK-NEXT: vmv.x.s a5, v10 -; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 8 +; CHECK-NEXT: vslidedown.vi v10, v8, 14 ; CHECK-NEXT: vmv.x.s a6, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 9 +; CHECK-NEXT: vslidedown.vi v10, v8, 15 ; CHECK-NEXT: vmv.x.s a7, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 10 -; CHECK-NEXT: vmv.x.s t0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 11 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 +; CHECK-NEXT: vmv.x.s t0, v9 +; CHECK-NEXT: vslidedown.vi v9, v8, 4 ; CHECK-NEXT: vmv.x.s t1, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 12 -; CHECK-NEXT: vmv.x.s t2, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 13 +; CHECK-NEXT: vslidedown.vi v10, v8, 5 +; CHECK-NEXT: vmv.x.s t2, v9 +; CHECK-NEXT: vslidedown.vi v9, v8, 6 ; CHECK-NEXT: vmv.x.s t3, v10 -; 
CHECK-NEXT: vslidedown.vi v10, v8, 14 -; CHECK-NEXT: vmv.x.s t4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 15 -; CHECK-NEXT: vmv.x.s t5, v10 +; CHECK-NEXT: vslidedown.vi v10, v8, 7 +; CHECK-NEXT: vmv.x.s t4, v9 ; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vmv.x.s t5, v10 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vredxor.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s t6, v8 +; CHECK-NEXT: add t0, t0, t1 +; CHECK-NEXT: add t2, t2, t3 +; CHECK-NEXT: add a0, t5, a0 +; CHECK-NEXT: add a3, a3, a4 +; CHECK-NEXT: add t0, t6, t0 +; CHECK-NEXT: add t2, t2, t4 ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, t6, a0 -; CHECK-NEXT: add a2, a2, a3 -; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: add a3, a3, a5 +; CHECK-NEXT: add t0, t0, t2 ; CHECK-NEXT: add a0, a0, a2 -; CHECK-NEXT: add a5, a5, a6 -; CHECK-NEXT: add a5, a5, a7 -; CHECK-NEXT: add a5, a5, t0 -; CHECK-NEXT: add a0, a0, a5 -; CHECK-NEXT: add t1, t1, t2 -; CHECK-NEXT: add t1, t1, t3 -; CHECK-NEXT: add t1, t1, t4 -; CHECK-NEXT: add t1, t1, t5 -; CHECK-NEXT: add a0, a0, t1 +; CHECK-NEXT: add a3, a3, a6 +; CHECK-NEXT: add a0, t0, a0 +; CHECK-NEXT: add a3, a3, a7 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: ret %e0 = extractelement <16 x i16> %v, i32 0 %e1 = extractelement <16 x i16> %v, i32 1 @@ -401,58 +401,58 @@ define i32 @explode_4xi32(<4 x i32> %v) { define i32 @explode_8xi32(<8 x i32> %v) { ; RV32-LABEL: explode_8xi32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 2 -; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: vslidedown.vi v10, v8, 3 -; RV32-NEXT: vmv.x.s a1, v10 ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32-NEXT: vslidedown.vi v10, v8, 4 -; RV32-NEXT: vmv.x.s a2, v10 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: vslidedown.vi v10, v8, 5 -; RV32-NEXT: vmv.x.s a3, v10 +; RV32-NEXT: vmv.x.s a1, v10 ; RV32-NEXT: vslidedown.vi v10, v8, 6 -; RV32-NEXT: vmv.x.s a4, v10 +; RV32-NEXT: vmv.x.s a2, v10 ; RV32-NEXT: vslidedown.vi v10, v8, 7 -; RV32-NEXT: vmv.x.s a5, v10 +; RV32-NEXT: vmv.x.s a3, v10 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v9, v8, 2 +; RV32-NEXT: vslidedown.vi v10, v8, 3 +; RV32-NEXT: vmv.x.s a4, v9 ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vmv.x.s a5, v10 ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vredxor.vs v8, v8, v9 ; RV32-NEXT: vmv.x.s a6, v8 +; RV32-NEXT: add a4, a4, a5 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a6, a0 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: add a2, a2, a4 +; RV32-NEXT: add a4, a6, a4 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: add a0, a0, a5 +; RV32-NEXT: add a0, a4, a0 +; RV32-NEXT: add a0, a0, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: explode_8xi32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 2 -; RV64-NEXT: vmv.x.s a0, v10 -; RV64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-NEXT: vmv.x.s a1, v10 ; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64-NEXT: vslidedown.vi v10, v8, 4 -; RV64-NEXT: vmv.x.s a2, v10 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: vslidedown.vi v10, v8, 5 -; RV64-NEXT: vmv.x.s a3, v10 +; RV64-NEXT: vmv.x.s a1, v10 ; RV64-NEXT: vslidedown.vi v10, v8, 6 -; RV64-NEXT: vmv.x.s a4, v10 +; RV64-NEXT: vmv.x.s a2, v10 ; RV64-NEXT: vslidedown.vi v10, v8, 7 -; RV64-NEXT: vmv.x.s a5, v10 +; RV64-NEXT: vmv.x.s a3, v10 +; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v9, v8, 2 +; RV64-NEXT: vslidedown.vi v10, v8, 3 +; RV64-NEXT: vmv.x.s a4, v9 ; RV64-NEXT: vmv.s.x v9, zero +; RV64-NEXT: vmv.x.s 
a5, v10 ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vredxor.vs v8, v8, v9 ; RV64-NEXT: vmv.x.s a6, v8 +; RV64-NEXT: add a4, a4, a5 ; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add a0, a6, a0 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: add a4, a6, a4 ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: addw a0, a0, a5 +; RV64-NEXT: add a0, a4, a0 +; RV64-NEXT: addw a0, a0, a3 ; RV64-NEXT: ret %e0 = extractelement <8 x i32> %v, i32 0 %e1 = extractelement <8 x i32> %v, i32 1 @@ -484,24 +484,27 @@ define i32 @explode_16xi32(<16 x i32> %v) { ; RV32-NEXT: addi s0, sp, 128 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 2 -; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 3 -; RV32-NEXT: vmv.x.s a1, v12 ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32-NEXT: vslidedown.vi v12, v8, 4 -; RV32-NEXT: vmv.x.s a2, v12 +; RV32-NEXT: vmv.x.s a0, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 5 -; RV32-NEXT: vmv.x.s a3, v12 +; RV32-NEXT: vmv.x.s a1, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 6 -; RV32-NEXT: vmv.x.s a4, v12 +; RV32-NEXT: vmv.x.s a2, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 7 +; RV32-NEXT: vmv.x.s a3, v12 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v12, v8, 2 +; RV32-NEXT: vslidedown.vi v13, v8, 3 +; RV32-NEXT: mv a4, sp ; RV32-NEXT: vmv.x.s a5, v12 -; RV32-NEXT: mv a6, sp +; RV32-NEXT: vmv.s.x v12, zero +; RV32-NEXT: vmv.x.s a6, v13 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v8, (a6) -; RV32-NEXT: lw a6, 32(sp) +; RV32-NEXT: vse32.v v8, (a4) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vredxor.vs v8, v8, v12 +; RV32-NEXT: lw a4, 32(sp) ; RV32-NEXT: lw a7, 36(sp) ; RV32-NEXT: lw t0, 40(sp) ; RV32-NEXT: lw t1, 44(sp) @@ -509,22 +512,19 @@ define i32 @explode_16xi32(<16 x i32> %v) { ; RV32-NEXT: lw t3, 52(sp) ; RV32-NEXT: lw t4, 56(sp) ; RV32-NEXT: lw t5, 60(sp) -; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vredxor.vs v8, v8, v9 ; RV32-NEXT: vmv.x.s t6, v8 +; RV32-NEXT: add a5, a5, a6 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, t6, a0 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: add a2, a2, a4 +; RV32-NEXT: add a5, t6, a5 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: add a0, a0, a5 +; RV32-NEXT: add a0, a5, a0 +; RV32-NEXT: add a3, a3, a4 ; RV32-NEXT: add a7, a7, t0 -; RV32-NEXT: add a7, a7, t1 -; RV32-NEXT: add a0, a0, a7 ; RV32-NEXT: add t2, t2, t3 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: add a7, a7, t1 ; RV32-NEXT: add t2, t2, t4 +; RV32-NEXT: add a0, a0, a7 ; RV32-NEXT: add t2, t2, t5 ; RV32-NEXT: add a0, a0, t2 ; RV32-NEXT: addi sp, s0, -128 @@ -548,24 +548,27 @@ define i32 @explode_16xi32(<16 x i32> %v) { ; RV64-NEXT: addi s0, sp, 128 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 2 -; RV64-NEXT: vmv.x.s a0, v12 -; RV64-NEXT: vslidedown.vi v12, v8, 3 -; RV64-NEXT: vmv.x.s a1, v12 ; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64-NEXT: vslidedown.vi v12, v8, 4 -; RV64-NEXT: vmv.x.s a2, v12 +; RV64-NEXT: vmv.x.s a0, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 5 -; RV64-NEXT: vmv.x.s a3, v12 +; RV64-NEXT: vmv.x.s a1, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 6 -; RV64-NEXT: vmv.x.s a4, v12 +; RV64-NEXT: vmv.x.s a2, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 7 +; RV64-NEXT: vmv.x.s a3, v12 +; RV64-NEXT: 
vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v12, v8, 2 +; RV64-NEXT: vslidedown.vi v13, v8, 3 +; RV64-NEXT: mv a4, sp ; RV64-NEXT: vmv.x.s a5, v12 -; RV64-NEXT: mv a6, sp +; RV64-NEXT: vmv.s.x v12, zero +; RV64-NEXT: vmv.x.s a6, v13 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vse32.v v8, (a6) -; RV64-NEXT: lw a6, 32(sp) +; RV64-NEXT: vse32.v v8, (a4) +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vredxor.vs v8, v8, v12 +; RV64-NEXT: lw a4, 32(sp) ; RV64-NEXT: lw a7, 36(sp) ; RV64-NEXT: lw t0, 40(sp) ; RV64-NEXT: lw t1, 44(sp) @@ -573,22 +576,19 @@ define i32 @explode_16xi32(<16 x i32> %v) { ; RV64-NEXT: lw t3, 52(sp) ; RV64-NEXT: lw t4, 56(sp) ; RV64-NEXT: lw t5, 60(sp) -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vredxor.vs v8, v8, v9 ; RV64-NEXT: vmv.x.s t6, v8 +; RV64-NEXT: add a5, a5, a6 ; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add a0, t6, a0 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: add a5, t6, a5 ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: add a0, a0, a5 +; RV64-NEXT: add a0, a5, a0 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: add a7, a7, t0 -; RV64-NEXT: add a7, a7, t1 -; RV64-NEXT: add a0, a0, a7 ; RV64-NEXT: add t2, t2, t3 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: add a7, a7, t1 ; RV64-NEXT: add t2, t2, t4 +; RV64-NEXT: add a0, a0, a7 ; RV64-NEXT: add t2, t2, t5 ; RV64-NEXT: addw a0, a0, t2 ; RV64-NEXT: addi sp, s0, -128 @@ -639,9 +639,9 @@ define i64 @explode_2xi64(<2 x i64> %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredxor.vs v8, v8, v9 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -666,28 +666,29 @@ define i64 @explode_4xi64(<4 x i64> %v) { ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV32-NEXT: vslidedown.vi v10, v8, 2 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v12, v10, a0 -; RV32-NEXT: vmv.x.s a1, v12 -; RV32-NEXT: vmv.x.s a2, v10 -; RV32-NEXT: vslidedown.vi v10, v8, 3 -; RV32-NEXT: vsrl.vx v12, v10, a0 -; RV32-NEXT: vmv.x.s a3, v12 -; RV32-NEXT: vmv.x.s a4, v10 -; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vmv.s.x v12, zero ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vredxor.vs v8, v8, v9 +; RV32-NEXT: vredxor.vs v12, v8, v12 +; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 3 +; RV32-NEXT: vmv.x.s a1, v10 +; RV32-NEXT: vsrl.vx v10, v10, a0 +; RV32-NEXT: vmv.x.s a2, v8 +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: vmv.x.s a3, v10 +; RV32-NEXT: vmv.x.s a4, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: vsrl.vx v8, v12, a0 +; RV32-NEXT: vmv.x.s a0, v12 ; RV32-NEXT: vmv.x.s a5, v8 -; RV32-NEXT: add a2, a5, a2 -; RV32-NEXT: sltu a5, a2, a5 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a0, a5 -; RV32-NEXT: add a1, a0, a3 -; RV32-NEXT: add a0, a2, a4 -; RV32-NEXT: sltu a2, a0, a2 -; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: sltu a6, a1, a0 +; RV32-NEXT: add a3, a5, a3 +; RV32-NEXT: add a0, a1, a2 +; RV32-NEXT: add a3, a3, a6 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: sltu a1, a0, a1 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: explode_4xi64: @@ -721,59 +722,60 @@ define i64 @explode_8xi64(<8 x i64> %v) { ; RV32-NEXT: vsetivli zero, 1, 
e64, m4, ta, ma ; RV32-NEXT: vslidedown.vi v12, v8, 2 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v16, v12, a0 -; RV32-NEXT: vmv.x.s a1, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 3 ; RV32-NEXT: vmv.x.s a2, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 3 -; RV32-NEXT: vsrl.vx v16, v12, a0 -; RV32-NEXT: vmv.x.s a3, v16 -; RV32-NEXT: vmv.x.s a4, v12 +; RV32-NEXT: vsrl.vx v12, v12, a0 +; RV32-NEXT: vmv.x.s a1, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 4 -; RV32-NEXT: vsrl.vx v16, v12, a0 -; RV32-NEXT: vmv.x.s a5, v16 +; RV32-NEXT: vmv.x.s a4, v16 +; RV32-NEXT: vsrl.vx v16, v16, a0 +; RV32-NEXT: vmv.x.s a3, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 5 +; RV32-NEXT: vmv.x.s a5, v12 +; RV32-NEXT: vsrl.vx v12, v12, a0 ; RV32-NEXT: vmv.x.s a6, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 5 -; RV32-NEXT: vsrl.vx v16, v12, a0 -; RV32-NEXT: vmv.x.s a7, v16 -; RV32-NEXT: vmv.x.s t0, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 6 -; RV32-NEXT: vsrl.vx v16, v12, a0 -; RV32-NEXT: vmv.x.s t1, v16 -; RV32-NEXT: vmv.x.s t2, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 7 -; RV32-NEXT: vsrl.vx v16, v12, a0 -; RV32-NEXT: vmv.x.s t3, v16 -; RV32-NEXT: vmv.x.s t4, v12 -; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vmv.x.s a7, v16 +; RV32-NEXT: vsrl.vx v16, v16, a0 +; RV32-NEXT: vmv.x.s t0, v16 +; RV32-NEXT: vmv.s.x v16, zero ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vredxor.vs v8, v8, v9 +; RV32-NEXT: vredxor.vs v16, v8, v16 +; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 7 +; RV32-NEXT: vmv.x.s t1, v12 +; RV32-NEXT: vsrl.vx v12, v12, a0 +; RV32-NEXT: vmv.x.s t2, v8 +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: vmv.x.s t3, v12 +; RV32-NEXT: vmv.x.s t4, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: vsrl.vx v8, v16, a0 +; RV32-NEXT: vmv.x.s a0, v16 ; RV32-NEXT: vmv.x.s t5, v8 -; RV32-NEXT: add a2, t5, a2 -; RV32-NEXT: sltu t5, a2, t5 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a0, t5 -; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a1, t5, a1 ; RV32-NEXT: add a4, a2, a4 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: sltu a1, a4, a2 -; RV32-NEXT: add a1, a1, a5 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a6, a4, a6 -; RV32-NEXT: sltu a1, a6, a4 -; RV32-NEXT: add a1, a1, a7 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add t0, a6, t0 -; RV32-NEXT: sltu a1, t0, a6 -; RV32-NEXT: add a1, a1, t1 +; RV32-NEXT: add a5, a4, a5 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: add a1, a1, a6 +; RV32-NEXT: sltu a2, a5, a4 +; RV32-NEXT: add a7, a5, a7 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add t2, t0, t2 -; RV32-NEXT: sltu a1, t2, t0 +; RV32-NEXT: add a2, a2, t0 +; RV32-NEXT: sltu a1, a7, a5 +; RV32-NEXT: add t1, a7, t1 +; RV32-NEXT: add a2, a0, a2 ; RV32-NEXT: add a1, a1, t3 -; RV32-NEXT: add a1, a0, a1 -; RV32-NEXT: add a0, t2, t4 -; RV32-NEXT: sltu a2, a0, t2 +; RV32-NEXT: sltu a3, t1, a7 +; RV32-NEXT: add a0, t1, t2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a3, a3, t4 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sltu a2, a0, t1 ; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: ret ; @@ -792,21 +794,21 @@ define i64 @explode_8xi64(<8 x i64> %v) { ; RV64-NEXT: vslidedown.vi v12, v8, 2 ; RV64-NEXT: vmv.x.s a0, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 3 -; RV64-NEXT: vmv.x.s a1, v12 -; RV64-NEXT: mv a2, sp +; RV64-NEXT: mv a1, sp +; RV64-NEXT: vmv.x.s a2, v12 +; RV64-NEXT: vmv.s.x v12, zero ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a2) -; 
RV64-NEXT: ld a2, 32(sp) +; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vredxor.vs v8, v8, v12 +; RV64-NEXT: ld a1, 32(sp) ; RV64-NEXT: ld a3, 40(sp) ; RV64-NEXT: ld a4, 48(sp) ; RV64-NEXT: ld a5, 56(sp) -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vredxor.vs v8, v8, v9 ; RV64-NEXT: vmv.x.s a6, v8 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add a0, a6, a0 ; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: add a0, a6, a0 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: add a0, a0, a3 ; RV64-NEXT: add a0, a0, a5 @@ -840,20 +842,20 @@ define i64 @explode_8xi64(<8 x i64> %v) { define i64 @explode_16xi64(<16 x i64> %v) { ; RV32-LABEL: explode_16xi64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s9, 8(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s10, 4(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s11, 0(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: .cfi_def_cfa_offset 64 +; RV32-NEXT: sw s0, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: .cfi_offset s2, -12 @@ -866,121 +868,129 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV32-NEXT: .cfi_offset s9, -40 ; RV32-NEXT: .cfi_offset s10, -44 ; RV32-NEXT: .cfi_offset s11, -48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 8 * vlenb ; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 2 +; RV32-NEXT: vslidedown.vi v24, v8, 2 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s a1, v24 -; RV32-NEXT: vmv.x.s a2, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 3 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s a3, v24 -; RV32-NEXT: vmv.x.s a4, v16 +; RV32-NEXT: vslidedown.vi v0, v8, 3 ; RV32-NEXT: vslidedown.vi v16, v8, 4 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s a5, v24 +; RV32-NEXT: vmv.x.s a1, v24 +; RV32-NEXT: vsrl.vx v24, v24, a0 +; RV32-NEXT: vmv.x.s a2, v24 +; RV32-NEXT: vslidedown.vi v24, v8, 5 +; RV32-NEXT: vmv.x.s a3, v0 +; RV32-NEXT: vsrl.vx v0, v0, a0 +; RV32-NEXT: vmv.x.s a4, v0 +; RV32-NEXT: vslidedown.vi v0, v8, 6 +; RV32-NEXT: vmv.x.s a5, v16 +; RV32-NEXT: vsrl.vx v16, v16, a0 ; RV32-NEXT: vmv.x.s a6, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 5 -; RV32-NEXT: vsrl.vx v24, v16, 
a0 -; RV32-NEXT: vmv.x.s a7, v24 -; RV32-NEXT: vmv.x.s t0, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 6 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s t1, v24 -; RV32-NEXT: vmv.x.s t2, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 7 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s t3, v24 +; RV32-NEXT: vmv.x.s a7, v24 +; RV32-NEXT: vsrl.vx v24, v24, a0 +; RV32-NEXT: vmv.x.s t0, v24 +; RV32-NEXT: vslidedown.vi v24, v8, 8 +; RV32-NEXT: vmv.x.s t1, v0 +; RV32-NEXT: vsrl.vx v0, v0, a0 +; RV32-NEXT: vmv.x.s t2, v0 +; RV32-NEXT: vslidedown.vi v0, v8, 9 +; RV32-NEXT: vmv.x.s t3, v16 +; RV32-NEXT: vsrl.vx v16, v16, a0 ; RV32-NEXT: vmv.x.s t4, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 8 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s t5, v24 -; RV32-NEXT: vmv.x.s t6, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 9 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s0, v24 -; RV32-NEXT: vmv.x.s s1, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 10 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s2, v24 +; RV32-NEXT: vmv.x.s t5, v24 +; RV32-NEXT: vsrl.vx v24, v24, a0 +; RV32-NEXT: vmv.x.s t6, v24 +; RV32-NEXT: vslidedown.vi v24, v8, 11 +; RV32-NEXT: vmv.x.s s0, v0 +; RV32-NEXT: vsrl.vx v0, v0, a0 +; RV32-NEXT: vmv.x.s s1, v0 +; RV32-NEXT: vslidedown.vi v0, v8, 12 +; RV32-NEXT: vmv.x.s s2, v16 +; RV32-NEXT: vsrl.vx v16, v16, a0 ; RV32-NEXT: vmv.x.s s3, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 11 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s4, v24 -; RV32-NEXT: vmv.x.s s5, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 12 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s6, v24 -; RV32-NEXT: vmv.x.s s7, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 13 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s9, v24 -; RV32-NEXT: vmv.x.s s8, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 14 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.s.x v17, zero +; RV32-NEXT: addi s4, sp, 16 +; RV32-NEXT: vs8r.v v16, (s4) # Unknown-size Folded Spill +; RV32-NEXT: vmv.x.s s4, v24 +; RV32-NEXT: vsrl.vx v24, v24, a0 +; RV32-NEXT: vmv.x.s s5, v24 +; RV32-NEXT: vslidedown.vi v24, v8, 14 +; RV32-NEXT: vmv.x.s s6, v0 +; RV32-NEXT: vsrl.vx v0, v0, a0 +; RV32-NEXT: vmv.x.s s7, v0 +; RV32-NEXT: vmv.s.x v7, zero ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vredxor.vs v17, v8, v17 +; RV32-NEXT: vredxor.vs v16, v8, v7 ; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma ; RV32-NEXT: vslidedown.vi v8, v8, 15 +; RV32-NEXT: addi s8, sp, 16 +; RV32-NEXT: vl8r.v v0, (s8) # Unknown-size Folded Reload +; RV32-NEXT: vmv.x.s s8, v0 +; RV32-NEXT: vsrl.vx v0, v0, a0 +; RV32-NEXT: vmv.x.s s9, v0 +; RV32-NEXT: vsrl.vx v0, v24, a0 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v18, v17, a0 -; RV32-NEXT: vmv.x.s s10, v18 +; RV32-NEXT: vsrl.vx v17, v16, a0 +; RV32-NEXT: vmv.x.s s10, v16 ; RV32-NEXT: vmv.x.s s11, v17 ; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v0, v8, a0 -; RV32-NEXT: add a1, s10, a1 +; RV32-NEXT: vsrl.vx v16, v8, a0 ; RV32-NEXT: add a2, s11, a2 -; RV32-NEXT: sltu a0, a2, s11 -; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: add a0, a0, a3 -; RV32-NEXT: add a4, a2, a4 -; RV32-NEXT: sltu a1, a4, a2 -; RV32-NEXT: add a1, a1, a5 +; RV32-NEXT: add a1, s10, a1 +; RV32-NEXT: sltu a0, a1, s10 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: add a0, a0, a4 +; RV32-NEXT: add a3, a1, a3 +; RV32-NEXT: sltu a1, a3, a1 +; RV32-NEXT: add a1, a1, a6 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a6, a4, a6 -; RV32-NEXT: sltu a1, a6, a4 -; RV32-NEXT: add a1, a1, a7 +; 
RV32-NEXT: add a5, a3, a5 +; RV32-NEXT: sltu a1, a5, a3 +; RV32-NEXT: add a1, a1, t0 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add t0, a6, t0 -; RV32-NEXT: sltu a1, t0, a6 -; RV32-NEXT: add a1, a1, t1 +; RV32-NEXT: add a7, a5, a7 +; RV32-NEXT: sltu a1, a7, a5 +; RV32-NEXT: add a1, a1, t2 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add t2, t0, t2 -; RV32-NEXT: sltu a1, t2, t0 -; RV32-NEXT: add a1, a1, t3 +; RV32-NEXT: add t1, a7, t1 +; RV32-NEXT: sltu a1, t1, a7 +; RV32-NEXT: add a1, a1, t4 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add t4, t2, t4 -; RV32-NEXT: sltu a1, t4, t2 -; RV32-NEXT: add a1, a1, t5 +; RV32-NEXT: add t3, t1, t3 +; RV32-NEXT: sltu a1, t3, t1 +; RV32-NEXT: add a1, a1, t6 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add t6, t4, t6 -; RV32-NEXT: sltu a1, t6, t4 -; RV32-NEXT: add a1, a1, s0 +; RV32-NEXT: add t5, t3, t5 +; RV32-NEXT: sltu a1, t5, t3 +; RV32-NEXT: add a1, a1, s1 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add s1, t6, s1 -; RV32-NEXT: sltu a1, s1, t6 -; RV32-NEXT: add a1, a1, s2 +; RV32-NEXT: add s0, t5, s0 +; RV32-NEXT: sltu a1, s0, t5 +; RV32-NEXT: add a1, a1, s3 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add s3, s1, s3 -; RV32-NEXT: sltu a1, s3, s1 -; RV32-NEXT: add a1, a1, s4 +; RV32-NEXT: add s2, s0, s2 +; RV32-NEXT: sltu a1, s2, s0 +; RV32-NEXT: add a1, a1, s5 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add s5, s3, s5 -; RV32-NEXT: sltu a1, s5, s3 -; RV32-NEXT: add a1, a1, s6 +; RV32-NEXT: add s4, s2, s4 +; RV32-NEXT: sltu a1, s4, s2 +; RV32-NEXT: add a1, a1, s7 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add s7, s5, s7 -; RV32-NEXT: sltu a1, s7, s5 +; RV32-NEXT: add s6, s4, s6 +; RV32-NEXT: sltu a1, s6, s4 ; RV32-NEXT: add a1, a1, s9 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vmv.x.s a1, v24 -; RV32-NEXT: add s8, s7, s8 -; RV32-NEXT: sltu a2, s8, s7 +; RV32-NEXT: vmv.x.s a1, v0 +; RV32-NEXT: add s8, s6, s8 +; RV32-NEXT: sltu a2, s8, s6 ; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: vmv.x.s a2, v16 +; RV32-NEXT: vmv.x.s a2, v24 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vmv.x.s a1, v0 +; RV32-NEXT: vmv.x.s a1, v16 ; RV32-NEXT: add a2, s8, a2 ; RV32-NEXT: sltu a3, a2, s8 ; RV32-NEXT: add a1, a3, a1 @@ -989,18 +999,22 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV32-NEXT: add a0, a2, a0 ; RV32-NEXT: sltu a2, a0, a2 ; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s9, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s10, 4(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s11, 0(sp) # 4-byte Folded Reload +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add sp, sp, a2 +; RV32-NEXT: .cfi_def_cfa sp, 64 +; RV32-NEXT: lw s0, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 48(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s9, 24(sp) # 4-byte Folded Reload +; 
RV32-NEXT: lw s10, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s11, 16(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore s0 ; RV32-NEXT: .cfi_restore s1 ; RV32-NEXT: .cfi_restore s2 @@ -1013,7 +1027,7 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV32-NEXT: .cfi_restore s9 ; RV32-NEXT: .cfi_restore s10 ; RV32-NEXT: .cfi_restore s11 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 64 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1032,11 +1046,14 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV64-NEXT: vslidedown.vi v16, v8, 2 ; RV64-NEXT: vmv.x.s a0, v16 ; RV64-NEXT: vslidedown.vi v16, v8, 3 -; RV64-NEXT: vmv.x.s a1, v16 -; RV64-NEXT: mv a2, sp +; RV64-NEXT: mv a1, sp +; RV64-NEXT: vmv.x.s a2, v16 +; RV64-NEXT: vmv.s.x v16, zero ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vse64.v v8, (a2) -; RV64-NEXT: ld a2, 32(sp) +; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vredxor.vs v8, v8, v16 +; RV64-NEXT: ld a1, 32(sp) ; RV64-NEXT: ld a3, 40(sp) ; RV64-NEXT: ld a4, 48(sp) ; RV64-NEXT: ld a5, 56(sp) @@ -1048,20 +1065,17 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV64-NEXT: ld t3, 104(sp) ; RV64-NEXT: ld t4, 112(sp) ; RV64-NEXT: ld t5, 120(sp) -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vredxor.vs v8, v8, v9 ; RV64-NEXT: vmv.x.s t6, v8 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add a0, t6, a0 ; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: add a0, t6, a0 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: add a0, a0, a3 ; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: add a5, a5, a7 -; RV64-NEXT: add a0, a0, a5 ; RV64-NEXT: add t0, t0, t1 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: add a5, a5, a7 ; RV64-NEXT: add t0, t0, t2 +; RV64-NEXT: add a0, a0, a5 ; RV64-NEXT: add t0, t0, t3 ; RV64-NEXT: add a0, a0, t0 ; RV64-NEXT: add t4, t4, t5 @@ -1116,22 +1130,22 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) { ; RV32-NEXT: vslidedown.vi v12, v8, 2 ; RV32-NEXT: vmv.x.s a0, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 3 -; RV32-NEXT: vmv.x.s a1, v12 -; RV32-NEXT: vmv.x.s a2, v9 +; RV32-NEXT: vmv.x.s a1, v9 +; RV32-NEXT: vmv.x.s a2, v12 ; RV32-NEXT: vslidedown.vi v12, v9, 1 ; RV32-NEXT: vmv.x.s a3, v12 ; RV32-NEXT: vslidedown.vi v12, v9, 2 ; RV32-NEXT: vmv.x.s a4, v12 ; RV32-NEXT: vslidedown.vi v9, v9, 3 -; RV32-NEXT: vmv.x.s a5, v9 -; RV32-NEXT: vmv.x.s a6, v10 +; RV32-NEXT: vmv.x.s a5, v10 +; RV32-NEXT: vmv.x.s a6, v9 ; RV32-NEXT: vslidedown.vi v9, v10, 1 ; RV32-NEXT: vmv.x.s a7, v9 ; RV32-NEXT: vslidedown.vi v9, v10, 2 ; RV32-NEXT: vmv.x.s t0, v9 ; RV32-NEXT: vslidedown.vi v9, v10, 3 -; RV32-NEXT: vmv.x.s t1, v9 -; RV32-NEXT: vmv.x.s t2, v11 +; RV32-NEXT: vmv.x.s t1, v11 +; RV32-NEXT: vmv.x.s t2, v9 ; RV32-NEXT: vslidedown.vi v9, v11, 1 ; RV32-NEXT: vmv.x.s t3, v9 ; RV32-NEXT: vslidedown.vi v9, v11, 2 @@ -1142,18 +1156,18 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) { ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vredxor.vs v8, v8, v9 ; RV32-NEXT: vmv.x.s t6, v8 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, t6, a0 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: add a2, a2, a4 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: add t1, t2, t1 +; RV32-NEXT: add a0, t6, a0 +; RV32-NEXT: add a1, a1, a4 ; RV32-NEXT: add a5, a5, a7 -; RV32-NEXT: add a5, a5, t0 -; RV32-NEXT: add a0, a0, a5 -; RV32-NEXT: add t1, t1, t2 
; RV32-NEXT: add t1, t1, t3 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a5, a5, t0 ; RV32-NEXT: add t1, t1, t4 +; RV32-NEXT: add a0, a0, a5 ; RV32-NEXT: add t1, t1, t5 ; RV32-NEXT: add a0, a0, t1 ; RV32-NEXT: ret @@ -1164,22 +1178,22 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) { ; RV64-NEXT: vslidedown.vi v12, v8, 2 ; RV64-NEXT: vmv.x.s a0, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 3 -; RV64-NEXT: vmv.x.s a1, v12 -; RV64-NEXT: vmv.x.s a2, v9 +; RV64-NEXT: vmv.x.s a1, v9 +; RV64-NEXT: vmv.x.s a2, v12 ; RV64-NEXT: vslidedown.vi v12, v9, 1 ; RV64-NEXT: vmv.x.s a3, v12 ; RV64-NEXT: vslidedown.vi v12, v9, 2 ; RV64-NEXT: vmv.x.s a4, v12 ; RV64-NEXT: vslidedown.vi v9, v9, 3 -; RV64-NEXT: vmv.x.s a5, v9 -; RV64-NEXT: vmv.x.s a6, v10 +; RV64-NEXT: vmv.x.s a5, v10 +; RV64-NEXT: vmv.x.s a6, v9 ; RV64-NEXT: vslidedown.vi v9, v10, 1 ; RV64-NEXT: vmv.x.s a7, v9 ; RV64-NEXT: vslidedown.vi v9, v10, 2 ; RV64-NEXT: vmv.x.s t0, v9 ; RV64-NEXT: vslidedown.vi v9, v10, 3 -; RV64-NEXT: vmv.x.s t1, v9 -; RV64-NEXT: vmv.x.s t2, v11 +; RV64-NEXT: vmv.x.s t1, v11 +; RV64-NEXT: vmv.x.s t2, v9 ; RV64-NEXT: vslidedown.vi v9, v11, 1 ; RV64-NEXT: vmv.x.s t3, v9 ; RV64-NEXT: vslidedown.vi v9, v11, 2 @@ -1190,18 +1204,18 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) { ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vredxor.vs v8, v8, v9 ; RV64-NEXT: vmv.x.s t6, v8 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add a0, t6, a0 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: add a2, a2, a4 ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: add a5, a5, a6 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: add t1, t2, t1 +; RV64-NEXT: add a0, t6, a0 +; RV64-NEXT: add a1, a1, a4 ; RV64-NEXT: add a5, a5, a7 -; RV64-NEXT: add a5, a5, t0 -; RV64-NEXT: add a0, a0, a5 -; RV64-NEXT: add t1, t1, t2 ; RV64-NEXT: add t1, t1, t3 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: add a5, a5, t0 ; RV64-NEXT: add t1, t1, t4 +; RV64-NEXT: add a0, a0, a5 ; RV64-NEXT: add t1, t1, t5 ; RV64-NEXT: addw a0, a0, t1 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll index c65e7aec712ae..66af5718fb9dc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -54,12 +54,10 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) { ; V128-NEXT: vmv1r.v v12, v9 ; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; V128-NEXT: vid.v v9 +; V128-NEXT: vmv.v.i v0, 10 ; V128-NEXT: vsrl.vi v14, v9, 1 -; V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; V128-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; V128-NEXT: vrgatherei16.vv v10, v8, v14 -; V128-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; V128-NEXT: vmv.v.i v0, 10 -; V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t ; V128-NEXT: vmv.v.v v8, v10 ; V128-NEXT: ret @@ -191,10 +189,12 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) { ; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; V128-NEXT: vwaddu.vv v10, v8, v8 ; V128-NEXT: li a0, -1 +; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; V128-NEXT: vid.v v11 +; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; V128-NEXT: vwmaccu.vx v10, a0, v8 ; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; V128-NEXT: vid.v v8 -; V128-NEXT: vsrl.vi v8, v8, 1 +; V128-NEXT: vsrl.vi v8, v11, 1 ; V128-NEXT: vmv.v.i v0, 10 ; V128-NEXT: vadd.vi v8, v8, 1 ; 
V128-NEXT: vrgather.vv v10, v9, v8, v0.t @@ -206,10 +206,12 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) { ; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; V512-NEXT: vwaddu.vv v10, v8, v8 ; V512-NEXT: li a0, -1 +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vid.v v11 +; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; V512-NEXT: vwmaccu.vx v10, a0, v8 ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu -; V512-NEXT: vid.v v8 -; V512-NEXT: vsrl.vi v8, v8, 1 +; V512-NEXT: vsrl.vi v8, v11, 1 ; V512-NEXT: vmv.v.i v0, 10 ; V512-NEXT: vadd.vi v8, v8, 1 ; V512-NEXT: vrgather.vv v10, v9, v8, v0.t @@ -409,26 +411,27 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; V128-NEXT: slli a0, a0, 3 ; V128-NEXT: sub sp, sp, a0 ; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; V128-NEXT: vmv8r.v v0, v16 -; V128-NEXT: addi a0, sp, 16 -; V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; V128-NEXT: vmv8r.v v24, v16 ; V128-NEXT: vmv8r.v v16, v8 +; V128-NEXT: vmv8r.v v8, v24 +; V128-NEXT: addi a0, sp, 16 +; V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; V128-NEXT: vslidedown.vi v8, v0, 16 -; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; V128-NEXT: vwaddu.vv v24, v0, v8 +; V128-NEXT: vslidedown.vi v0, v24, 16 ; V128-NEXT: li a0, -1 -; V128-NEXT: vwmaccu.vx v24, a0, v8 +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; V128-NEXT: vwaddu.vv v24, v8, v0 +; V128-NEXT: vwmaccu.vx v24, a0, v0 ; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; V128-NEXT: vslidedown.vi v0, v16, 16 +; V128-NEXT: lui a1, 699051 +; V128-NEXT: li a2, 32 ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v8, v0, v16 -; V128-NEXT: vwmaccu.vx v8, a0, v16 -; V128-NEXT: lui a1, 699051 ; V128-NEXT: addi a1, a1, -1366 ; V128-NEXT: vmv.s.x v0, a1 -; V128-NEXT: li a1, 32 -; V128-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; V128-NEXT: vwmaccu.vx v8, a0, v16 +; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; V128-NEXT: vmerge.vvm v24, v8, v24, v0 ; V128-NEXT: addi a1, sp, 16 ; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index 1e77b3710928d..e46587f58b4eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -100,8 +100,8 @@ define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: vrsub.vi v10, v9, 4 ; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vrsub.vi v10, v9, 4 ; CHECK-NEXT: vmv.v.i v9, 5 ; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v8, v9 @@ -116,8 +116,8 @@ define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: li a0, 3 -; CHECK-NEXT: vmul.vx v10, v9, a0 ; CHECK-NEXT: vmv.v.i v0, 3 +; CHECK-NEXT: vmul.vx v10, v9, a0 ; CHECK-NEXT: vmv.v.i v9, 5 ; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v8, v9 @@ -157,38 +157,40 @@ define <8 x i64> @vrgather_permute_shuffle_uv_v8i64(<8 x i64> %x) { define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) { ; RV32-LABEL: vrgather_shuffle_vv_v8i64: ; RV32: # %bb.0: +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; 
RV32-NEXT: addi a0, a0, %lo(.LCPI11_0) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vmv.v.i v16, 2 -; RV32-NEXT: li a0, 5 -; RV32-NEXT: lui a1, %hi(.LCPI11_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI11_0) -; RV32-NEXT: vle16.v v20, (a1) -; RV32-NEXT: vslide1down.vx v21, v16, a0 +; RV32-NEXT: vle16.v v20, (a0) +; RV32-NEXT: vmv.v.i v21, 2 ; RV32-NEXT: li a0, 164 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v16, v8, v20 ; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: li a0, 5 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV32-NEXT: vslide1down.vx v8, v21, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v16, v8, v20 -; RV32-NEXT: vrgatherei16.vv v16, v12, v21, v0.t +; RV32-NEXT: vrgatherei16.vv v16, v12, v8, v0.t ; RV32-NEXT: vmv.v.v v8, v16 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_shuffle_vv_v8i64: ; RV64: # %bb.0: +; RV64-NEXT: li a0, 164 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: lui a0, 327683 ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: addi a0, a0, 1 ; RV64-NEXT: slli a0, a0, 17 ; RV64-NEXT: addi a0, a0, 1 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vmv.v.x v20, a0 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vrgatherei16.vv v16, v8, v20 -; RV64-NEXT: li a0, 164 -; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: lui a0, 163841 ; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: addi a0, a0, 1 ; RV64-NEXT: slli a0, a0, 17 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vrgatherei16.vv v16, v8, v20 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu @@ -205,15 +207,15 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) { ; RV32-NEXT: lui a0, %hi(.LCPI12_0) ; RV32-NEXT: addi a0, a0, %lo(.LCPI12_0) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vle16.v v16, (a0) -; RV32-NEXT: vmv.v.i v20, -1 +; RV32-NEXT: vmv.v.i v16, -1 +; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: lui a0, %hi(.LCPI12_1) ; RV32-NEXT: addi a0, a0, %lo(.LCPI12_1) -; RV32-NEXT: vle16.v v17, (a0) +; RV32-NEXT: vle16.v v21, (a0) ; RV32-NEXT: li a0, 113 ; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vrgatherei16.vv v12, v20, v16 -; RV32-NEXT: vrgatherei16.vv v12, v8, v17, v0.t +; RV32-NEXT: vrgatherei16.vv v12, v16, v20 +; RV32-NEXT: vrgatherei16.vv v12, v8, v21, v0.t ; RV32-NEXT: vmv.v.v v8, v12 ; RV32-NEXT: ret ; @@ -241,14 +243,14 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) { ; RV32-NEXT: addi a0, a0, %lo(.LCPI13_0) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vle16.v v16, (a0) -; RV32-NEXT: vrgatherei16.vv v12, v8, v16 ; RV32-NEXT: lui a0, %hi(.LCPI13_1) ; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1) -; RV32-NEXT: vle16.v v8, (a0) +; RV32-NEXT: vle16.v v17, (a0) ; RV32-NEXT: li a0, 140 ; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vmv.v.i v16, 5 -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t +; RV32-NEXT: vrgatherei16.vv v12, v8, v16 +; RV32-NEXT: vmv.v.i v8, 5 +; RV32-NEXT: vrgatherei16.vv v12, v8, v17, v0.t ; RV32-NEXT: vmv.v.v v8, v12 ; RV32-NEXT: ret ; @@ -435,9 +437,9 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v10, 4 ; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: li a0, 70 ; CHECK-NEXT: vsetivli zero, 3, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v11, v10, 2 -; CHECK-NEXT: li a0, 70 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; 
CHECK-NEXT: vrgather.vi v10, v8, 2 @@ -454,13 +456,13 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v10, 6 ; CHECK-NEXT: vmv.v.i v11, 0 -; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v11, v10, 5 ; CHECK-NEXT: lui a0, 8256 ; CHECK-NEXT: addi a0, a0, 2 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: li a0, 98 +; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v11, v10, 5 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vrgather.vv v10, v8, v12 @@ -722,17 +724,18 @@ define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) { ; CHECK-LABEL: shuffle_v64i8_v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: li a1, 240 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: lui a1, 98561 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: vsll.vi v14, v12, 3 ; CHECK-NEXT: vrgather.vv v12, v8, v14 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: li a1, 240 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: lui a1, 98561 ; CHECK-NEXT: addi a1, a1, -2048 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu ; CHECK-NEXT: vrgather.vv v12, v8, v10, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index f894691b993e4..cba8de82ec41b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1045,47 +1045,47 @@ define void @urem_v2i64(ptr %x, ptr %y) { define void @mulhu_v16i8(ptr %x) { ; CHECK-LABEL: mulhu_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: lui a1, 3 -; CHECK-NEXT: addi a1, a1, -2044 -; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: lui a1, 1 -; CHECK-NEXT: addi a2, a1, 32 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v8, a2 ; CHECK-NEXT: lui a2, %hi(.LCPI65_0) ; CHECK-NEXT: addi a2, a2, %lo(.LCPI65_0) ; CHECK-NEXT: vle8.v v11, (a2) -; CHECK-NEXT: li a2, -128 +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: addi a1, a1, -2044 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: addi a1, a2, 32 +; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: li a1, -128 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmerge.vxm v12, v10, a2, v0 +; CHECK-NEXT: vmerge.vxm v12, v10, a1, v0 +; CHECK-NEXT: li a1, 513 +; CHECK-NEXT: vmv.v.i v13, 4 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 -; CHECK-NEXT: vsrl.vv v8, v9, v8 -; CHECK-NEXT: vmulhu.vv v8, v8, v11 -; CHECK-NEXT: vsub.vv v9, v9, v8 -; CHECK-NEXT: vmulhu.vv v9, v9, v12 -; CHECK-NEXT: vadd.vv v9, v9, v8 -; CHECK-NEXT: li a2, 513 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a2 +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: addi a1, a2, 78 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v8, 4 -; CHECK-NEXT: vmerge.vim v10, v8, 1, v0 -; CHECK-NEXT: addi a1, a1, 78 +; CHECK-NEXT: vmerge.vim v10, v13, 1, v0 
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vsrl.vv v8, v9, v8 +; CHECK-NEXT: vmulhu.vv v8, v8, v11 +; CHECK-NEXT: vmerge.vim v10, v10, 3, v0 ; CHECK-NEXT: lui a1, 8 ; CHECK-NEXT: addi a1, a1, 304 -; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: vsub.vv v9, v9, v8 +; CHECK-NEXT: vmulhu.vv v9, v9, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmerge.vim v10, v10, 3, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 2, v0 -; CHECK-NEXT: vsrl.vv v8, v9, v8 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vmerge.vim v9, v10, 2, v0 +; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x @@ -1100,31 +1100,31 @@ define void @mulhu_v8i16(ptr %x) { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v10, a1 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: li a1, 33 +; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: lui a1, %hi(.LCPI66_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI66_0) -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vmv.v.i v12, 1 +; CHECK-NEXT: vmv.v.i v11, 3 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vmerge.vim v11, v11, 2, v0 +; CHECK-NEXT: vmv.v.i v13, 0 ; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma -; CHECK-NEXT: vslideup.vi v9, v12, 6 +; CHECK-NEXT: vslideup.vi v9, v10, 6 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vsrl.vv v9, v8, v9 -; CHECK-NEXT: vmulhu.vv v9, v9, v11 +; CHECK-NEXT: vmulhu.vv v9, v9, v12 +; CHECK-NEXT: lui a1, 1048568 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.s.x v13, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: vmulhu.vv v8, v8, v10 +; CHECK-NEXT: vmulhu.vv v8, v8, v13 ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: li a1, 33 -; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: vmv.v.i v9, 3 -; CHECK-NEXT: vmerge.vim v9, v9, 2, v0 ; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma -; CHECK-NEXT: vslideup.vi v9, v12, 6 +; CHECK-NEXT: vslideup.vi v11, v10, 6 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vsrl.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v11 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x @@ -1157,22 +1157,22 @@ define void @mulhu_v4i32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI68_0) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI68_0) -; CHECK-NEXT: vle32.v v9, (a1) ; CHECK-NEXT: lui a1, 524288 +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmv.s.x v10, a1 -; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: lui a1, %hi(.LCPI68_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI68_0) +; CHECK-NEXT: vle32.v v11, (a1) ; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; CHECK-NEXT: vslideup.vi v11, v10, 2 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmulhu.vv v9, v8, v9 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: vmulhu.vv v8, v8, v11 -; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vslideup.vi v9, v10, 2 ; CHECK-NEXT: lui a1, 4128 ; CHECK-NEXT: addi a1, a1, 514 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: 
vmulhu.vv v10, v8, v11 +; CHECK-NEXT: vsub.vv v8, v8, v10 +; CHECK-NEXT: vmulhu.vv v8, v8, v9 ; CHECK-NEXT: vmv.s.x v9, a1 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: vsext.vf4 v10, v9 ; CHECK-NEXT: vsrl.vv v8, v8, v10 ; CHECK-NEXT: vse32.v v8, (a0) @@ -1192,10 +1192,10 @@ define void @mulhu_v2i64(ptr %x) { ; RV32-NEXT: addi a1, a1, %lo(.LCPI69_0) ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a1) -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vmulhu.vv v8, v8, v9 ; RV32-NEXT: lui a1, 32 ; RV32-NEXT: addi a1, a1, 1 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vmulhu.vv v8, v8, v9 ; RV32-NEXT: vmv.s.x v9, a1 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vsext.vf4 v10, v9 @@ -1209,16 +1209,16 @@ define void @mulhu_v2i64(ptr %x) { ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: lui a1, 838861 +; RV64-NEXT: lui a2, 699051 ; RV64-NEXT: addiw a1, a1, -819 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: addiw a2, a2, -1365 +; RV64-NEXT: slli a3, a1, 32 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: vmv.v.x v9, a1 -; RV64-NEXT: lui a1, 699051 -; RV64-NEXT: addiw a1, a1, -1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 ; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma -; RV64-NEXT: vmv.s.x v9, a1 +; RV64-NEXT: vmv.s.x v9, a2 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; RV64-NEXT: vmulhu.vv v8, v8, v9 ; RV64-NEXT: vid.v v9 @@ -1246,9 +1246,9 @@ define void @mulhs_v16i8(ptr %x) { ; CHECK-NEXT: li a1, 57 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vmerge.vxm v9, v9, a1, v0 +; CHECK-NEXT: vmv.v.i v10, 7 ; CHECK-NEXT: vmulhu.vv v8, v8, v9 -; CHECK-NEXT: vmv.v.i v9, 7 -; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret @@ -1263,11 +1263,11 @@ define void @mulhs_v8i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: li a1, 105 +; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: lui a1, 5 ; CHECK-NEXT: addi a1, a1, -1755 ; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: li a1, 105 -; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: lui a1, 1048571 ; CHECK-NEXT: addi a1, a1, 1755 ; CHECK-NEXT: vmerge.vxm v9, v9, a1, v0 @@ -1309,9 +1309,9 @@ define void @mulhs_v4i32(ptr %x) { ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: lui a1, 419430 +; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: addi a1, a1, 1639 ; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: lui a1, 629146 ; RV32-NEXT: addi a1, a1, -1639 ; RV32-NEXT: vmerge.vxm v9, v9, a1, v0 @@ -1349,28 +1349,27 @@ define void @mulhs_v2i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a2, a1, 1365 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a2 +; RV32-NEXT: vid.v v9 +; RV32-NEXT: addi a2, a1, 1365 +; RV32-NEXT: vmv.v.x v10, a2 +; RV32-NEXT: li a2, 63 ; RV32-NEXT: addi a1, a1, 1366 ; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, ma -; RV32-NEXT: vmv.s.x v9, a1 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vmulh.vv v9, v8, v9 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vid.v v10 -; RV32-NEXT: vsrl.vi v10, v10, 1 -; RV32-NEXT: vrsub.vi v10, v10, 0 -; RV32-NEXT: vsetivli zero, 
2, e64, m1, ta, ma -; RV32-NEXT: vmadd.vv v10, v8, v9 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsrl.vx v8, v10, a1 +; RV32-NEXT: vmv.s.x v10, a1 ; RV32-NEXT: lui a1, 16 -; RV32-NEXT: vmv.s.x v9, a1 +; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v9, 1 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vmulh.vv v10, v8, v10 +; RV32-NEXT: vmadd.vv v9, v8, v10 +; RV32-NEXT: vmv.s.x v8, a1 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vsext.vf4 v11, v9 +; RV32-NEXT: vsext.vf4 v10, v8 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vsra.vv v9, v10, v11 +; RV32-NEXT: vsrl.vx v8, v9, a2 +; RV32-NEXT: vsra.vv v9, v9, v10 ; RV32-NEXT: vadd.vv v8, v9, v8 ; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: ret @@ -1381,21 +1380,21 @@ define void @mulhs_v2i64(ptr %x) { ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: lui a1, 349525 ; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 ; RV64-NEXT: lui a2, %hi(.LCPI74_0) +; RV64-NEXT: vid.v v9 ; RV64-NEXT: ld a2, %lo(.LCPI74_0)(a2) -; RV64-NEXT: slli a3, a1, 32 -; RV64-NEXT: add a1, a1, a3 -; RV64-NEXT: vmv.v.x v9, a1 +; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: li a1, 63 +; RV64-NEXT: vrsub.vi v11, v9, 0 ; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma -; RV64-NEXT: vmv.s.x v9, a2 +; RV64-NEXT: vmv.s.x v10, a2 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV64-NEXT: vmulh.vv v9, v8, v9 -; RV64-NEXT: vid.v v10 -; RV64-NEXT: vrsub.vi v11, v10, 0 -; RV64-NEXT: vmadd.vv v11, v8, v9 -; RV64-NEXT: li a1, 63 +; RV64-NEXT: vmulh.vv v10, v8, v10 +; RV64-NEXT: vmadd.vv v11, v8, v10 ; RV64-NEXT: vsrl.vx v8, v11, a1 -; RV64-NEXT: vsra.vv v9, v11, v10 +; RV64-NEXT: vsra.vv v9, v11, v9 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret @@ -3156,47 +3155,47 @@ define void @mulhu_v32i8(ptr %x) { ; CHECK-LABEL: mulhu_v32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: lui a2, 163907 +; CHECK-NEXT: addi a2, a2, -2044 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v0, a2 +; CHECK-NEXT: lui a2, 66049 +; CHECK-NEXT: addi a2, a2, 32 +; CHECK-NEXT: vmv.s.x v8, a2 +; CHECK-NEXT: li a2, -128 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: vmv.v.i v12, 0 -; CHECK-NEXT: lui a1, 163907 -; CHECK-NEXT: addi a1, a1, -2044 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: lui a1, 66049 -; CHECK-NEXT: addi a1, a1, 32 -; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: vmerge.vxm v10, v12, a2, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI181_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI181_0) -; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: li a1, -128 -; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; CHECK-NEXT: vmerge.vxm v16, v12, a1, v0 +; CHECK-NEXT: vle8.v v14, (a0) ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 -; CHECK-NEXT: vsrl.vv v8, v10, v8 -; CHECK-NEXT: vmulhu.vv v8, v8, v14 -; CHECK-NEXT: vsub.vv v10, v10, v8 -; CHECK-NEXT: vmulhu.vv v10, v10, v16 -; CHECK-NEXT: vadd.vv v10, v10, v8 +; CHECK-NEXT: vle8.v v12, (a1) ; CHECK-NEXT: lui a1, 8208 ; CHECK-NEXT: addi a1, a1, 513 +; CHECK-NEXT: vsrl.vv v8, v14, v8 +; CHECK-NEXT: vmulhu.vv v12, v8, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 4 -; CHECK-NEXT: vmerge.vim v12, v8, 1, v0 ; CHECK-NEXT: lui a1, 66785 ; CHECK-NEXT: addi a1, a1, 78 
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: lui a1, 529160 -; CHECK-NEXT: addi a1, a1, 304 ; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: lui a1, 529160 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; CHECK-NEXT: vmerge.vim v12, v12, 3, v0 +; CHECK-NEXT: vsub.vv v14, v14, v12 +; CHECK-NEXT: vmulhu.vv v10, v14, v10 +; CHECK-NEXT: vmv.v.i v14, 4 +; CHECK-NEXT: addi a1, a1, 304 +; CHECK-NEXT: vmerge.vim v14, v14, 1, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v12, 2, v0 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v14, v14, 3, v0 +; CHECK-NEXT: vadd.vv v10, v10, v12 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v8, v14, 2, v0 ; CHECK-NEXT: vsrl.vv v8, v10, v8 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret @@ -3212,36 +3211,37 @@ define void @mulhu_v16i16(ptr %x) { ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v10, (a0) ; RV32-NEXT: li a1, 257 -; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: lui a1, 1048568 ; RV32-NEXT: vmerge.vxm v12, v8, a1, v0 ; RV32-NEXT: lui a1, 4 +; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV32-NEXT: vmv.v.i v14, 0 ; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-NEXT: vmv.s.x v8, a1 -; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: lui a1, 2 +; RV32-NEXT: addi a1, a1, 289 +; RV32-NEXT: vmv.s.x v9, a1 ; RV32-NEXT: lui a1, %hi(.LCPI182_0) ; RV32-NEXT: addi a1, a1, %lo(.LCPI182_0) -; RV32-NEXT: vle16.v v14, (a1) +; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV32-NEXT: vmv.v.i v15, 3 ; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32-NEXT: vmerge.vim v14, v14, 1, v0 +; RV32-NEXT: vmv1r.v v0, v9 +; RV32-NEXT: vmerge.vim v9, v15, 2, v0 +; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vmerge.vim v8, v9, 1, v0 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vsext.vf2 v16, v9 -; RV32-NEXT: vsrl.vv v16, v10, v16 -; RV32-NEXT: vmulhu.vv v14, v16, v14 +; RV32-NEXT: vsext.vf2 v18, v14 +; RV32-NEXT: vsrl.vv v14, v10, v18 +; RV32-NEXT: vmulhu.vv v14, v14, v16 ; RV32-NEXT: vsub.vv v10, v10, v14 ; RV32-NEXT: vmulhu.vv v10, v10, v12 ; RV32-NEXT: vadd.vv v10, v10, v14 -; RV32-NEXT: lui a1, 2 -; RV32-NEXT: addi a1, a1, 289 -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, 3 -; RV32-NEXT: vmerge.vim v9, v9, 2, v0 -; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vmerge.vim v8, v9, 1, v0 -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-NEXT: vsext.vf2 v12, v8 ; RV32-NEXT: vsrl.vv v8, v10, v12 ; RV32-NEXT: vse16.v v8, (a0) @@ -3252,31 +3252,31 @@ define void @mulhu_v16i16(ptr %x) { ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v8, (a0) ; RV64-NEXT: li a1, 257 -; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: vmv.v.i v10, 0 -; RV64-NEXT: lui a1, 1048568 -; RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: lui a1, %hi(.LCPI182_0) ; RV64-NEXT: addi a1, a1, %lo(.LCPI182_0) ; RV64-NEXT: vle16.v v12, (a1) +; RV64-NEXT: lui a1, 1048568 +; RV64-NEXT: vmerge.vxm v10, v10, a1, v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: slli a1, a1, 48 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vmv.v.x v14, a1 +; RV64-NEXT: lui a1, %hi(.LCPI182_1) +; RV64-NEXT: ld a1, %lo(.LCPI182_1)(a1) ; 
RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vsext.vf2 v16, v14 ; RV64-NEXT: vsrl.vv v14, v8, v16 ; RV64-NEXT: vmulhu.vv v12, v14, v12 -; RV64-NEXT: lui a1, %hi(.LCPI182_1) -; RV64-NEXT: ld a1, %lo(.LCPI182_1)(a1) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v14, a1 +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vsub.vv v8, v8, v12 ; RV64-NEXT: vmulhu.vv v8, v8, v10 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vsext.vf2 v12, v10 -; RV64-NEXT: vsrl.vv v8, v8, v12 +; RV64-NEXT: vsext.vf2 v10, v14 +; RV64-NEXT: vsrl.vv v8, v8, v10 ; RV64-NEXT: vse16.v v8, (a0) ; RV64-NEXT: ret %a = load <16 x i16>, ptr %x @@ -3291,22 +3291,22 @@ define void @mulhu_v8i32(ptr %x) { ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: li a1, 68 +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: lui a1, %hi(.LCPI183_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI183_0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vle32.v v12, (a1) ; CHECK-NEXT: lui a1, 524288 -; CHECK-NEXT: vmerge.vxm v12, v12, a1, v0 -; CHECK-NEXT: vmulhu.vv v10, v8, v10 -; CHECK-NEXT: vsub.vv v8, v8, v10 -; CHECK-NEXT: vmulhu.vv v8, v8, v12 -; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vxm v10, v10, a1, v0 ; CHECK-NEXT: lui a1, 4128 ; CHECK-NEXT: addi a1, a1, 514 +; CHECK-NEXT: vmulhu.vv v12, v8, v12 +; CHECK-NEXT: vsub.vv v8, v8, v12 +; CHECK-NEXT: vmulhu.vv v8, v8, v10 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: vsext.vf4 v12, v10 ; CHECK-NEXT: vsrl.vv v8, v8, v12 ; CHECK-NEXT: vse32.v v8, (a0) @@ -3326,24 +3326,22 @@ define void @mulhu_v4i64(ptr %x) { ; RV32-NEXT: addi a1, a1, %lo(.LCPI184_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle32.v v10, (a1) -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vmulhu.vv v10, v8, v10 ; RV32-NEXT: lui a1, 524288 -; RV32-NEXT: vmv.s.x v12, a1 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v14, 0 -; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV32-NEXT: vslideup.vi v14, v12, 5 +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmv.s.x v14, a1 ; RV32-NEXT: lui a1, %hi(.LCPI184_1) ; RV32-NEXT: addi a1, a1, %lo(.LCPI184_1) +; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; RV32-NEXT: vslideup.vi v12, v14, 5 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vle8.v v12, (a1) +; RV32-NEXT: vle8.v v14, (a1) ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vmulhu.vv v10, v8, v10 ; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: vmulhu.vv v8, v8, v14 +; RV32-NEXT: vmulhu.vv v8, v8, v12 ; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vsext.vf4 v10, v12 +; RV32-NEXT: vsext.vf4 v10, v14 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vsrl.vv v8, v8, v10 ; RV32-NEXT: vse64.v v8, (a0) @@ -3356,22 +3354,22 @@ define void @mulhu_v4i64(ptr %x) { ; RV64-NEXT: lui a1, %hi(.LCPI184_0) ; RV64-NEXT: addi a1, a1, %lo(.LCPI184_0) ; RV64-NEXT: vle64.v v10, (a1) -; RV64-NEXT: vmulhu.vv v10, v8, v10 -; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: li a1, -1 +; RV64-NEXT: vmv.v.i v12, 0 ; RV64-NEXT: slli a1, a1, 63 -; RV64-NEXT: vmv.s.x v12, a1 -; RV64-NEXT: vmv.v.i v14, 0 +; RV64-NEXT: vmv.s.x v14, a1 +; 
RV64-NEXT: lui a1, 12320 +; RV64-NEXT: addi a1, a1, 513 ; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; RV64-NEXT: vslideup.vi v14, v12, 2 +; RV64-NEXT: vslideup.vi v12, v14, 2 +; RV64-NEXT: vmv.s.x v14, a1 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmulhu.vv v8, v8, v14 +; RV64-NEXT: vmulhu.vv v10, v8, v10 +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: vmulhu.vv v8, v8, v12 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a1, 12320 -; RV64-NEXT: addi a1, a1, 513 -; RV64-NEXT: vmv.s.x v10, a1 -; RV64-NEXT: vsext.vf8 v12, v10 -; RV64-NEXT: vsrl.vv v8, v8, v12 +; RV64-NEXT: vsext.vf8 v10, v14 +; RV64-NEXT: vsrl.vv v8, v8, v10 ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret %a = load <4 x i64>, ptr %x @@ -3384,16 +3382,16 @@ define void @mulhs_v32i8(ptr %x) { ; CHECK-LABEL: mulhs_v32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: lui a2, 304453 +; CHECK-NEXT: addi a2, a2, -1452 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v0, a2 +; CHECK-NEXT: li a2, -123 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: lui a1, 304453 -; CHECK-NEXT: addi a1, a1, -1452 -; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v10, 7 ; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 -; CHECK-NEXT: li a1, -123 -; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vmv.v.x v12, a2 ; CHECK-NEXT: li a1, 57 ; CHECK-NEXT: vmerge.vxm v12, v12, a1, v0 ; CHECK-NEXT: vmulhu.vv v8, v8, v12 @@ -3437,11 +3435,11 @@ define void @mulhs_v8i32(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: li a1, 85 +; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: lui a1, 419430 ; RV32-NEXT: addi a1, a1, 1639 ; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: li a1, 85 -; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: lui a1, 629146 ; RV32-NEXT: addi a1, a1, -1639 ; RV32-NEXT: vmerge.vxm v10, v10, a1, v0 @@ -3479,63 +3477,61 @@ define void @mulhs_v4i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a2, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a2 ; RV32-NEXT: li a2, 17 ; RV32-NEXT: vmv.s.x v0, a2 -; RV32-NEXT: addi a1, a1, 1366 -; RV32-NEXT: vmerge.vxm v10, v10, a1, v0 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vmulh.vv v10, v8, v10 -; RV32-NEXT: lui a1, 1048560 +; RV32-NEXT: lui a2, 1048560 ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v10, a2 +; RV32-NEXT: addi a2, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vsext.vf4 v14, v12 +; RV32-NEXT: vmv.v.x v12, a2 +; RV32-NEXT: li a2, 63 +; RV32-NEXT: addi a1, a1, 1366 +; RV32-NEXT: vmerge.vxm v12, v12, a1, v0 +; RV32-NEXT: lui a1, 16 +; RV32-NEXT: vsext.vf4 v14, v10 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vmulh.vv v10, v8, v12 ; RV32-NEXT: vmadd.vv v14, v8, v10 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsrl.vx v8, v14, a1 -; RV32-NEXT: lui a1, 16 ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vsext.vf4 v12, v10 +; RV32-NEXT: vsext.vf4 v10, v8 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vsra.vv v10, v14, v12 +; RV32-NEXT: vsrl.vx v8, v14, a2 +; RV32-NEXT: vsra.vv v10, v14, v10 ; RV32-NEXT: vadd.vv 
v8, v10, v8 ; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: mulhs_v4i64: ; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: lui a2, 1044496 ; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: addi a2, a2, -256 +; RV64-NEXT: vmv.s.x v10, a2 ; RV64-NEXT: slli a2, a1, 32 ; RV64-NEXT: add a1, a1, a2 ; RV64-NEXT: lui a2, %hi(.LCPI188_0) ; RV64-NEXT: ld a2, %lo(.LCPI188_0)(a2) -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV64-NEXT: vmv.v.i v0, 5 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vmerge.vxm v10, v10, a2, v0 -; RV64-NEXT: vmulh.vv v10, v8, v10 -; RV64-NEXT: lui a1, 1044496 -; RV64-NEXT: addi a1, a1, -256 -; RV64-NEXT: vmv.s.x v12, a1 -; RV64-NEXT: vsext.vf8 v14, v12 -; RV64-NEXT: vmadd.vv v14, v8, v10 +; RV64-NEXT: vmv.v.x v12, a1 ; RV64-NEXT: li a1, 63 +; RV64-NEXT: vmerge.vxm v12, v12, a2, v0 +; RV64-NEXT: lui a2, 4096 +; RV64-NEXT: addi a2, a2, 256 +; RV64-NEXT: vsext.vf8 v14, v10 +; RV64-NEXT: vmulh.vv v10, v8, v12 +; RV64-NEXT: vmadd.vv v14, v8, v10 +; RV64-NEXT: vmv.s.x v8, a2 +; RV64-NEXT: vsext.vf8 v10, v8 ; RV64-NEXT: vsrl.vx v8, v14, a1 -; RV64-NEXT: lui a1, 4096 -; RV64-NEXT: addi a1, a1, 256 -; RV64-NEXT: vmv.s.x v10, a1 -; RV64-NEXT: vsext.vf8 v12, v10 -; RV64-NEXT: vsra.vv v10, v14, v12 +; RV64-NEXT: vsra.vv v10, v14, v10 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret @@ -5632,12 +5628,12 @@ define void @mulhs_vx_v2i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a2, a1, 1365 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: addi a3, a1, 1365 ; RV32-NEXT: addi a1, a1, 1366 ; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw a2, 12(sp) -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vlse64.v v9, (a1), zero +; RV32-NEXT: sw a3, 12(sp) +; RV32-NEXT: vlse64.v v9, (a2), zero ; RV32-NEXT: vmulh.vv v8, v8, v9 ; RV32-NEXT: li a1, 63 ; RV32-NEXT: vsrl.vx v9, v8, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll index 67c18b5eef736..123e224364795 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll @@ -9,9 +9,9 @@ define void @vector_interleave_store_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b, ptr %p) { ; CHECK-LABEL: vector_interleave_store_v32i1_v16i1: ; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vslideup.vi v0, v8, 2 -; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll index af46849ae0871..30e41f2f526e5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll @@ -15,32 +15,37 @@ define <4 x i1> @load_large_vector(ptr %p) { ; ZVE32X-NEXT: ld a6, 56(a0) ; ZVE32X-NEXT: ld a7, 72(a0) ; ZVE32X-NEXT: ld a0, 80(a0) +; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vmv.s.x v9, zero +; ZVE32X-NEXT: vmv.v.i v10, 0 ; ZVE32X-NEXT: xor a3, a3, a4 +; ZVE32X-NEXT: xor a1, a1, a2 +; 
ZVE32X-NEXT: xor a2, a5, a6 +; ZVE32X-NEXT: xor a0, a7, a0 ; ZVE32X-NEXT: snez a3, a3 -; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVE32X-NEXT: snez a1, a1 +; ZVE32X-NEXT: snez a2, a2 +; ZVE32X-NEXT: snez a0, a0 ; ZVE32X-NEXT: vmv.s.x v8, a3 +; ZVE32X-NEXT: vmv.s.x v11, a1 +; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vand.vi v8, v8, 1 +; ZVE32X-NEXT: vand.vi v11, v11, 1 ; ZVE32X-NEXT: vmsne.vi v0, v8, 0 -; ZVE32X-NEXT: vmv.s.x v9, zero -; ZVE32X-NEXT: vmerge.vim v8, v9, 1, v0 -; ZVE32X-NEXT: xor a1, a1, a2 -; ZVE32X-NEXT: snez a1, a1 -; ZVE32X-NEXT: vmv.s.x v10, a1 -; ZVE32X-NEXT: vand.vi v10, v10, 1 -; ZVE32X-NEXT: vmsne.vi v0, v10, 0 +; ZVE32X-NEXT: vmsne.vi v8, v11, 0 +; ZVE32X-NEXT: vmerge.vim v11, v9, 1, v0 +; ZVE32X-NEXT: vmv1r.v v0, v8 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; ZVE32X-NEXT: vmv.v.i v10, 0 -; ZVE32X-NEXT: vmerge.vim v11, v10, 1, v0 +; ZVE32X-NEXT: vmerge.vim v8, v10, 1, v0 ; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, tu, ma -; ZVE32X-NEXT: vslideup.vi v11, v8, 1 +; ZVE32X-NEXT: vslideup.vi v8, v11, 1 +; ZVE32X-NEXT: vmv.s.x v11, a2 +; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVE32X-NEXT: vand.vi v11, v11, 1 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; ZVE32X-NEXT: vmsne.vi v0, v11, 0 -; ZVE32X-NEXT: xor a1, a5, a6 -; ZVE32X-NEXT: snez a1, a1 -; ZVE32X-NEXT: vmv.s.x v8, a1 +; ZVE32X-NEXT: vmsne.vi v0, v8, 0 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; ZVE32X-NEXT: vand.vi v8, v8, 1 -; ZVE32X-NEXT: vmsne.vi v8, v8, 0 +; ZVE32X-NEXT: vmsne.vi v8, v11, 0 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVE32X-NEXT: vmerge.vim v11, v10, 1, v0 ; ZVE32X-NEXT: vmv1r.v v0, v8 @@ -48,13 +53,12 @@ define <4 x i1> @load_large_vector(ptr %p) { ; ZVE32X-NEXT: vmerge.vim v8, v9, 1, v0 ; ZVE32X-NEXT: vsetivli zero, 3, e8, mf4, tu, ma ; ZVE32X-NEXT: vslideup.vi v11, v8, 2 -; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; ZVE32X-NEXT: vmsne.vi v0, v11, 0 -; ZVE32X-NEXT: xor a0, a7, a0 -; ZVE32X-NEXT: snez a0, a0 ; ZVE32X-NEXT: vmv.s.x v8, a0 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vand.vi v8, v8, 1 +; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vmsne.vi v0, v11, 0 +; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vmsne.vi v8, v8, 0 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVE32X-NEXT: vmerge.vim v10, v10, 1, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index b56814ea4c372..fa1377406d697 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -12,9 +12,9 @@ define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v10, (a0) +; RV32-NEXT: li a0, 32 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vnsrl.wi v8, v10, 0 -; RV32-NEXT: li a0, 32 ; RV32-NEXT: vnsrl.wx v9, v10, a0 ; RV32-NEXT: ret ; @@ -183,129 +183,107 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 84 +; RV32-NEXT: li a3, 81 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 84 * vlenb -; RV32-NEXT: addi a3, a1, 256 +; 
RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd1, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 81 * vlenb +; RV32-NEXT: addi a3, a1, 128 +; RV32-NEXT: addi a4, a1, 256 ; RV32-NEXT: li a2, 32 +; RV32-NEXT: lui a5, 12 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a3) -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 76 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, a1, 128 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vslideup.vi v4, v8, 4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 40 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v4, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, 12 -; RV32-NEXT: vmv.s.x v0, a4 +; RV32-NEXT: vle32.v v16, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 16 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: li a6, 57 +; RV32-NEXT: mul a4, a4, a6 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vslideup.vi v4, v8, 10, v0.t +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, %hi(.LCPI8_0) ; RV32-NEXT: addi a4, a4, %lo(.LCPI8_0) +; RV32-NEXT: vmv.s.x v1, a5 +; RV32-NEXT: lui a5, %hi(.LCPI8_1) +; RV32-NEXT: addi a5, a5, %lo(.LCPI8_1) +; RV32-NEXT: vle16.v v4, (a4) +; RV32-NEXT: lui a4, 1 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vslideup.vi v12, v16, 4 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li a7, 37 +; RV32-NEXT: mul a6, a6, a7 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vs4r.v v12, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v16, v16, 16 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li a7, 45 +; RV32-NEXT: mul a6, a6, a7 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vslideup.vi v12, v16, 10, v0.t +; RV32-NEXT: vmv.v.v v28, v12 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v0, (a4) -; RV32-NEXT: lui a4, %hi(.LCPI8_1) -; RV32-NEXT: addi a4, a4, %lo(.LCPI8_1) -; RV32-NEXT: lui a5, 1 -; RV32-NEXT: vle16.v v8, (a4) -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a6, 56 -; RV32-NEXT: mul a4, a4, a6 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v24, (a5) ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 68 -; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: slli a5, a1, 6 +; RV32-NEXT: add a1, a5, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vle32.v v24, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 60 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: 
addi a1, a5, -64 -; RV32-NEXT: vmv.s.x v16, a1 +; RV32-NEXT: vle32.v v16, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 44 +; RV32-NEXT: li a3, 73 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vrgatherei16.vv v16, v8, v0 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, a4, -64 +; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 44 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vrgatherei16.vv v16, v8, v4 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 73 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v4, v16 +; RV32-NEXT: vmv.v.v v28, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 76 +; RV32-NEXT: li a3, 57 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vslideup.vi v12, v8, 2 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vslideup.vi v12, v16, 8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -318,7 +296,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 28 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -328,34 +306,34 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v2, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 68 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 6 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: vrgatherei16.vv v24, v16, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 44 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; 
RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 60 +; RV32-NEXT: li a3, 73 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 28 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v24, v8, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -363,13 +341,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 76 +; RV32-NEXT: li a3, 57 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -378,15 +356,15 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v8, v24, v2 ; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vslideup.vi v8, v24, 6, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 44 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill @@ -398,22 +376,18 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vle16.v v24, (a1) ; RV32-NEXT: vle16.v v4, (a3) ; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v28, a1 ; RV32-NEXT: vrgatherei16.vv v8, v16, v24 +; RV32-NEXT: vmv1r.v v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 60 +; RV32-NEXT: li a3, 73 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 28 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -423,70 +397,78 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a3, %hi(.LCPI8_8) ; RV32-NEXT: addi a3, a3, %lo(.LCPI8_8) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI8_9) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v8, (a3) +; RV32-NEXT: vle16.v v12, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 2 +; RV32-NEXT: 
li a4, 13 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs4r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vs4r.v v12, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 76 +; RV32-NEXT: li a3, 57 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v8, v16 +; RV32-NEXT: vrgatherei16.vv v20, v16, v8 ; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v16, 4, v0.t +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv4r.v v24, v8 +; RV32-NEXT: vslideup.vi v20, v8, 4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 68 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 6 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v0, v20 +; RV32-NEXT: vrgatherei16.vv v8, v0, v16 +; RV32-NEXT: vmv1r.v v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a3, 73 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill @@ -497,21 +479,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v3, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 76 +; RV32-NEXT: li a3, 57 ; RV32-NEXT: mul a1, a1, a3 ; 
RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v8, v24, 6 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v8, v16, 6 ; RV32-NEXT: vmv1r.v v0, v3 -; RV32-NEXT: vrgatherei16.vv v8, v16, v12, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v24, v12, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 76 +; RV32-NEXT: li a3, 57 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv4r.v v24, v16 ; RV32-NEXT: lui a1, %hi(.LCPI8_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_11) ; RV32-NEXT: lui a3, %hi(.LCPI8_12) @@ -527,21 +508,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 68 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 6 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v16, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 60 +; RV32-NEXT: li a3, 73 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill @@ -560,7 +542,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 40 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -568,13 +550,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 44 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 28 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -582,8 +564,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 68 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 6 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload @@ -595,7 +577,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 60 +; RV32-NEXT: li a2, 73 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -604,26 +586,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x 
i64>} @load_ ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 21 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a2, 13 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 76 +; RV32-NEXT: li a2, 57 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a2, a1, 2 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload @@ -640,21 +624,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 36 +; RV32-NEXT: li a2, 41 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 84 +; RV32-NEXT: li a1, 81 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 @@ -667,141 +651,130 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 6 -; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: slli a2, a2, 6 ; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc1, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 65 * vlenb -; RV64-NEXT: addi a2, a1, 256 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v16, (a2) -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 21 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb ; RV64-NEXT: addi a2, a1, 128 +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 48 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, a1, 256 +; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 57 +; RV64-NEXT: li a3, 20 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: li a1, 128 +; RV64-NEXT: vid.v v10 +; RV64-NEXT: vmv.s.x v1, 
a1 +; RV64-NEXT: li a1, 6 +; RV64-NEXT: vmul.vx v2, v10, a1 +; RV64-NEXT: li a1, 56 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vrgather.vi v12, v16, 4 -; RV64-NEXT: li a1, 128 -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma ; RV64-NEXT: vslidedown.vi v16, v16, 8 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 37 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 36 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vrgather.vi v12, v16, 2, v0.t -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vid.v v10 -; RV64-NEXT: li a1, 6 -; RV64-NEXT: vmul.vx v8, v10, a1 -; RV64-NEXT: li a1, 56 -; RV64-NEXT: vle64.v v24, (a2) +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 45 +; RV64-NEXT: li a3, 56 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v10, a1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 53 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs1r.v v10, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vadd.vi v10, v8, -16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v7, a1 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vadd.vi v10, v2, -16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 48 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v0, v8 -; RV64-NEXT: vmv2r.v v4, v8 +; RV64-NEXT: vrgatherei16.vv v24, v16, v2 +; RV64-NEXT: vmv1r.v v0, v7 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 53 +; RV64-NEXT: li a2, 56 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl1r.v v6, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv1r.v v0, v6 -; RV64-NEXT: vrgatherei16.vv v16, v24, v10, v0.t +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v24, v16, v10, v0.t ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v12, v16 +; RV64-NEXT: vmv.v.v v12, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 4 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 +; RV64-NEXT: li a2, 20 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v12, v8, 5 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vi v12, v16, 5 ; 
RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 +; RV64-NEXT: li a2, 36 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vi v12, v16, 3, v0.t -; RV64-NEXT: vmv.v.v v28, v12 +; RV64-NEXT: vmv.v.v v20, v12 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v24, v4, 1 -; RV64-NEXT: vadd.vi v26, v4, -15 +; RV64-NEXT: vadd.vi v16, v2, 1 +; RV64-NEXT: vadd.vi v18, v2, -15 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 48 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v8, v24 -; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: vrgatherei16.vv v24, v8, v16 +; RV64-NEXT: vmv1r.v v0, v7 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 56 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v16, v8, v26, v0.t +; RV64-NEXT: vrgatherei16.vv v24, v8, v18, v0.t ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v28, v16 +; RV64-NEXT: vmv.v.v v20, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 13 +; RV64-NEXT: li a2, 12 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV64-NEXT: lui a1, 16 ; RV64-NEXT: addi a1, a1, 7 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.i v9, 6 ; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 +; RV64-NEXT: li a2, 20 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -809,72 +782,66 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vrgatherei16.vv v12, v16, v9 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 53 +; RV64-NEXT: li a2, 44 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vrgatherei16.vv v12, v16, v10 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv4r.v v8, v16 ; RV64-NEXT: vrgather.vi v12, v16, 2 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 5 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vrgather.vi v12, v16, 3 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 29 +; RV64-NEXT: li a2, 28 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: li a1, 24 -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v7, a1 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi 
v16, v4, 2 -; RV64-NEXT: vadd.vi v2, v4, -14 +; RV64-NEXT: vadd.vi v10, v2, 2 +; RV64-NEXT: vadd.vi v4, v2, -14 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 48 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v8, v24, v16 +; RV64-NEXT: vrgatherei16.vv v24, v16, v10 +; RV64-NEXT: vmv1r.v v0, v7 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 56 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v16, v2, v0.t +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v24, v8, v4, v0.t ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 20 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 +; RV64-NEXT: li a2, 36 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 53 +; RV64-NEXT: li a2, 44 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -882,194 +849,168 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vrgather.vi v28, v24, 4, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 53 +; RV64-NEXT: li a2, 44 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv2r.v v8, v4 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v4, v4, 3 -; RV64-NEXT: vadd.vi v6, v8, -13 -; RV64-NEXT: vmv2r.v v2, v8 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v30, v2, 3 +; RV64-NEXT: vadd.vi v28, v2, -13 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v8, v24, v4 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v16, v6, v0.t +; RV64-NEXT: vrgatherei16.vv v8, v16, v30 +; RV64-NEXT: vmv1r.v v0, v7 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 +; RV64-NEXT: li a2, 56 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vmv4r.v v16, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: 
add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v4, v16, 5, v0.t +; RV64-NEXT: vrgather.vi v4, v24, 5, v0.t ; RV64-NEXT: lui a1, 96 ; RV64-NEXT: li a2, 192 -; RV64-NEXT: vmv.s.x v1, a2 +; RV64-NEXT: vmv.s.x v8, a2 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vmv.v.x v9, a1 +; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 5 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v12, v16, v8, v0.t +; RV64-NEXT: vrgatherei16.vv v12, v24, v9, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 5 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: li a1, 28 -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v1, a1 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v22, v2, 4 -; RV64-NEXT: vadd.vi v20, v2, -12 +; RV64-NEXT: vadd.vi v10, v2, 4 +; RV64-NEXT: vadd.vi v12, v2, -12 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 48 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v8, v24, v22 +; RV64-NEXT: vrgatherei16.vv v16, v24, v10 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 56 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v20, v0.t +; RV64-NEXT: vrgatherei16.vv v16, v24, v12, v0.t ; RV64-NEXT: lui a1, 112 ; RV64-NEXT: addi a1, a1, 1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v12, a1 -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vmv.v.x v9, a1 +; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 29 +; RV64-NEXT: li a2, 28 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 36 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v20, v16, v12, v0.t +; RV64-NEXT: vrgatherei16.vv v12, v24, v9, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 29 +; RV64-NEXT: li a2, 28 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 53 +; RV64-NEXT: li a2, 44 ; RV64-NEXT: 
mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 20 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v12, v24 +; RV64-NEXT: vmv.v.v v20, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 53 +; RV64-NEXT: li a2, 44 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v12, v2, 5 +; RV64-NEXT: vadd.vi v20, v2, 5 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 48 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v16, v12 +; RV64-NEXT: vrgatherei16.vv v8, v24, v20 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v12, v2, -11 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v20, v2, -11 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 56 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; RV64-NEXT: vrgatherei16.vv v8, v24, v20, v0.t ; RV64-NEXT: vmv4r.v v12, v4 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v12, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 5 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv.v.v v20, v8 +; RV64-NEXT: vmv.v.v v20, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 29 +; RV64-NEXT: li a2, 28 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv.v.v v8, v24 +; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v16, v8 ; RV64-NEXT: addi a1, a0, 320 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: vse64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 ; RV64-NEXT: vse64.v v20, (a1) ; RV64-NEXT: addi a1, a0, 192 ; RV64-NEXT: vse64.v v12, (a1) ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 53 +; RV64-NEXT: li a3, 44 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: 
addi a2, a2, 16 @@ -1077,22 +1018,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 64 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 13 +; RV64-NEXT: li a3, 12 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 4 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a1, a0, 6 -; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: slli a0, a0, 6 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll index eb5da36116af3..f27614c93985f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll @@ -88,13 +88,13 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-NEXT: vfmv.f.s fa5, v9 -; RV64-NEXT: fcvt.l.s a0, fa5 ; RV64-NEXT: vfmv.f.s fa5, v8 +; RV64-NEXT: fcvt.l.s a0, fa5 +; RV64-NEXT: vfmv.f.s fa5, v9 ; RV64-NEXT: fcvt.l.s a1, fa5 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vslide1down.vx v8, v8, a1 ; RV64-NEXT: ret %a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x) ret <2 x i64> %a @@ -187,25 +187,23 @@ define <3 x i64> @llrint_v3i64_v3f32(<3 x float> %x) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-NEXT: vfmv.f.s fa5, v9 -; RV64-NEXT: fcvt.l.s a0, fa5 ; RV64-NEXT: vfmv.f.s fa5, v8 +; RV64-NEXT: vslidedown.vi v10, v8, 2 +; RV64-NEXT: vslidedown.vi v8, v8, 3 +; RV64-NEXT: fcvt.l.s a0, fa5 +; RV64-NEXT: vfmv.f.s fa5, v9 ; RV64-NEXT: fcvt.l.s a1, fa5 +; RV64-NEXT: vfmv.f.s fa5, v10 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 2 -; RV64-NEXT: vfmv.f.s fa5, v9 +; RV64-NEXT: vmv.v.x v10, a0 ; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-NEXT: vfmv.f.s fa5, v8 -; RV64-NEXT: fcvt.l.s a0, fa5 ; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-NEXT: vslide1down.vx v8, v10, a1 +; RV64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NEXT: fcvt.l.s a0, fa5 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: ret %a = call <3 x i64> @llvm.llrint.v3i64.v3f32(<3 x float> %x) ret <3 x i64> %a @@ -298,25 +296,23 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-NEXT: vfmv.f.s fa5, v9 -; RV64-NEXT: fcvt.l.s a0, fa5 ; RV64-NEXT: vfmv.f.s fa5, v8 +; RV64-NEXT: vslidedown.vi v10, v8, 2 +; RV64-NEXT: vslidedown.vi v8, v8, 3 +; RV64-NEXT: fcvt.l.s a0, fa5 +; RV64-NEXT: 
vfmv.f.s fa5, v9 ; RV64-NEXT: fcvt.l.s a1, fa5 +; RV64-NEXT: vfmv.f.s fa5, v10 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 2 -; RV64-NEXT: vfmv.f.s fa5, v9 +; RV64-NEXT: vmv.v.x v10, a0 ; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-NEXT: vfmv.f.s fa5, v8 -; RV64-NEXT: fcvt.l.s a0, fa5 ; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-NEXT: vslide1down.vx v8, v10, a1 +; RV64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NEXT: fcvt.l.s a0, fa5 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: ret %a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x) ret <4 x i64> %a @@ -427,37 +423,37 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) { ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64-NEXT: vfmv.f.s fa5, v8 -; RV64-NEXT: fcvt.l.s a0, fa5 ; RV64-NEXT: vslidedown.vi v10, v8, 7 +; RV64-NEXT: fcvt.l.s a0, fa5 ; RV64-NEXT: vfmv.f.s fa5, v10 -; RV64-NEXT: fcvt.l.s a1, fa5 ; RV64-NEXT: vslidedown.vi v10, v8, 6 +; RV64-NEXT: fcvt.l.s a1, fa5 ; RV64-NEXT: vfmv.f.s fa5, v10 -; RV64-NEXT: fcvt.l.s a2, fa5 ; RV64-NEXT: vslidedown.vi v10, v8, 5 +; RV64-NEXT: fcvt.l.s a2, fa5 ; RV64-NEXT: vfmv.f.s fa5, v10 -; RV64-NEXT: fcvt.l.s a3, fa5 ; RV64-NEXT: vslidedown.vi v10, v8, 4 +; RV64-NEXT: fcvt.l.s a3, fa5 ; RV64-NEXT: vfmv.f.s fa5, v10 +; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v9, v8, 3 +; RV64-NEXT: vslidedown.vi v10, v8, 2 +; RV64-NEXT: vslidedown.vi v8, v8, 1 ; RV64-NEXT: fcvt.l.s a4, fa5 +; RV64-NEXT: vfmv.f.s fa5, v9 +; RV64-NEXT: fcvt.l.s a5, fa5 +; RV64-NEXT: vfmv.f.s fa5, v10 +; RV64-NEXT: fcvt.l.s a6, fa5 +; RV64-NEXT: vfmv.f.s fa5, v8 ; RV64-NEXT: sd a4, 32(sp) ; RV64-NEXT: sd a3, 40(sp) ; RV64-NEXT: sd a2, 48(sp) ; RV64-NEXT: sd a1, 56(sp) -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 3 -; RV64-NEXT: vfmv.f.s fa5, v9 ; RV64-NEXT: fcvt.l.s a1, fa5 -; RV64-NEXT: vslidedown.vi v9, v8, 2 -; RV64-NEXT: vfmv.f.s fa5, v9 -; RV64-NEXT: fcvt.l.s a2, fa5 -; RV64-NEXT: vslidedown.vi v8, v8, 1 -; RV64-NEXT: vfmv.f.s fa5, v8 -; RV64-NEXT: fcvt.l.s a3, fa5 ; RV64-NEXT: sd a0, 0(sp) -; RV64-NEXT: sd a3, 8(sp) -; RV64-NEXT: sd a2, 16(sp) -; RV64-NEXT: sd a1, 24(sp) +; RV64-NEXT: sd a1, 8(sp) +; RV64-NEXT: sd a6, 16(sp) +; RV64-NEXT: sd a5, 24(sp) ; RV64-NEXT: mv a0, sp ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vle64.v v8, (a0) @@ -619,62 +615,62 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vse32.v v8, (a0) ; RV64-NEXT: flw fa5, 124(sp) +; RV64-NEXT: vfmv.f.s fa4, v8 +; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v10, v8, 3 +; RV64-NEXT: vslidedown.vi v11, v8, 2 ; RV64-NEXT: fcvt.l.s a0, fa5 ; RV64-NEXT: sd a0, 248(sp) ; RV64-NEXT: flw fa5, 120(sp) -; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: sd a0, 240(sp) +; RV64-NEXT: vslidedown.vi v12, v8, 1 +; RV64-NEXT: fcvt.l.s a0, fa4 +; RV64-NEXT: vfmv.f.s fa4, v10 +; RV64-NEXT: fcvt.l.s a1, fa5 +; RV64-NEXT: sd a1, 240(sp) ; RV64-NEXT: flw fa5, 116(sp) -; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: sd a0, 232(sp) +; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; 
RV64-NEXT: vslidedown.vi v14, v8, 7 +; RV64-NEXT: fcvt.l.s a1, fa4 +; RV64-NEXT: vfmv.f.s fa4, v11 +; RV64-NEXT: fcvt.l.s a2, fa5 +; RV64-NEXT: sd a2, 232(sp) ; RV64-NEXT: flw fa5, 112(sp) -; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: sd a0, 224(sp) +; RV64-NEXT: fcvt.l.s a2, fa4 +; RV64-NEXT: vfmv.f.s fa4, v12 +; RV64-NEXT: vslidedown.vi v10, v8, 6 +; RV64-NEXT: fcvt.l.s a3, fa5 +; RV64-NEXT: sd a3, 224(sp) ; RV64-NEXT: flw fa5, 108(sp) -; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: sd a0, 216(sp) +; RV64-NEXT: fcvt.l.s a3, fa4 +; RV64-NEXT: vfmv.f.s fa4, v14 +; RV64-NEXT: vslidedown.vi v12, v8, 5 +; RV64-NEXT: fcvt.l.s a4, fa5 +; RV64-NEXT: sd a4, 216(sp) ; RV64-NEXT: flw fa5, 104(sp) -; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: sd a0, 208(sp) +; RV64-NEXT: fcvt.l.s a4, fa4 +; RV64-NEXT: vfmv.f.s fa4, v10 +; RV64-NEXT: fcvt.l.s a5, fa4 +; RV64-NEXT: fcvt.l.s a6, fa5 +; RV64-NEXT: sd a6, 208(sp) ; RV64-NEXT: flw fa5, 100(sp) -; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: sd a0, 200(sp) -; RV64-NEXT: flw fa5, 96(sp) -; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: sd a0, 192(sp) +; RV64-NEXT: vfmv.f.s fa4, v12 +; RV64-NEXT: fcvt.l.s a6, fa4 +; RV64-NEXT: vslidedown.vi v8, v8, 4 +; RV64-NEXT: fcvt.l.s a7, fa5 ; RV64-NEXT: vfmv.f.s fa5, v8 -; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-NEXT: vfmv.f.s fa5, v10 -; RV64-NEXT: fcvt.l.s a1, fa5 -; RV64-NEXT: vslidedown.vi v10, v8, 2 -; RV64-NEXT: vfmv.f.s fa5, v10 -; RV64-NEXT: fcvt.l.s a2, fa5 -; RV64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-NEXT: vfmv.f.s fa5, v10 -; RV64-NEXT: fcvt.l.s a3, fa5 +; RV64-NEXT: sd a7, 200(sp) +; RV64-NEXT: fcvt.l.s a7, fa5 +; RV64-NEXT: flw fa5, 96(sp) ; RV64-NEXT: sd a0, 128(sp) ; RV64-NEXT: sd a3, 136(sp) ; RV64-NEXT: sd a2, 144(sp) ; RV64-NEXT: sd a1, 152(sp) -; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 7 -; RV64-NEXT: vfmv.f.s fa5, v10 +; RV64-NEXT: sd a7, 160(sp) +; RV64-NEXT: sd a6, 168(sp) +; RV64-NEXT: sd a5, 176(sp) +; RV64-NEXT: sd a4, 184(sp) ; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: vslidedown.vi v10, v8, 6 -; RV64-NEXT: vfmv.f.s fa5, v10 -; RV64-NEXT: fcvt.l.s a1, fa5 -; RV64-NEXT: vslidedown.vi v10, v8, 5 -; RV64-NEXT: vfmv.f.s fa5, v10 -; RV64-NEXT: fcvt.l.s a2, fa5 -; RV64-NEXT: vslidedown.vi v8, v8, 4 -; RV64-NEXT: vfmv.f.s fa5, v8 -; RV64-NEXT: fcvt.l.s a3, fa5 -; RV64-NEXT: sd a3, 160(sp) -; RV64-NEXT: sd a2, 168(sp) -; RV64-NEXT: sd a1, 176(sp) -; RV64-NEXT: sd a0, 184(sp) +; RV64-NEXT: sd a0, 192(sp) ; RV64-NEXT: addi a0, sp, 128 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) @@ -775,12 +771,12 @@ define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-NEXT: vfmv.f.s fa5, v9 -; RV64-NEXT: fcvt.l.d a0, fa5 ; RV64-NEXT: vfmv.f.s fa5, v8 +; RV64-NEXT: fcvt.l.d a0, fa5 +; RV64-NEXT: vfmv.f.s fa5, v9 ; RV64-NEXT: fcvt.l.d a1, fa5 -; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vslide1down.vx v8, v8, a1 ; RV64-NEXT: ret %a = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> %x) ret <2 x i64> %a @@ -871,21 +867,22 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-NEXT: vfmv.f.s fa5, v10 -; RV64-NEXT: fcvt.l.d a0, fa5 ; RV64-NEXT: vfmv.f.s fa5, v8 -; RV64-NEXT: fcvt.l.d a1, 
fa5 -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-NEXT: vslidedown.vi v12, v8, 2 +; RV64-NEXT: vslidedown.vi v8, v8, 3 +; RV64-NEXT: fcvt.l.d a0, fa5 +; RV64-NEXT: vfmv.f.s fa5, v10 +; RV64-NEXT: fcvt.l.d a1, fa5 ; RV64-NEXT: vfmv.f.s fa5, v12 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a0 ; RV64-NEXT: fcvt.l.d a0, fa5 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-NEXT: vfmv.f.s fa5, v8 +; RV64-NEXT: vslide1down.vx v8, v10, a1 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: fcvt.l.d a0, fa5 -; RV64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: ret %a = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> %x) ret <4 x i64> %a @@ -987,34 +984,34 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) { ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: fld fa5, 56(sp) -; RV64-NEXT: fcvt.l.d a0, fa5 -; RV64-NEXT: sd a0, 120(sp) -; RV64-NEXT: fld fa5, 48(sp) -; RV64-NEXT: fcvt.l.d a0, fa5 -; RV64-NEXT: sd a0, 112(sp) -; RV64-NEXT: fld fa5, 40(sp) -; RV64-NEXT: fcvt.l.d a0, fa5 -; RV64-NEXT: sd a0, 104(sp) -; RV64-NEXT: fld fa5, 32(sp) -; RV64-NEXT: fcvt.l.d a0, fa5 -; RV64-NEXT: sd a0, 96(sp) -; RV64-NEXT: vfmv.f.s fa5, v8 -; RV64-NEXT: fcvt.l.d a0, fa5 +; RV64-NEXT: vfmv.f.s fa4, v8 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-NEXT: vfmv.f.s fa5, v10 +; RV64-NEXT: fcvt.l.d a0, fa4 ; RV64-NEXT: fcvt.l.d a1, fa5 +; RV64-NEXT: sd a1, 120(sp) +; RV64-NEXT: fld fa5, 48(sp) +; RV64-NEXT: vfmv.f.s fa4, v10 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-NEXT: vfmv.f.s fa5, v10 +; RV64-NEXT: fcvt.l.d a1, fa4 ; RV64-NEXT: fcvt.l.d a2, fa5 +; RV64-NEXT: sd a2, 112(sp) +; RV64-NEXT: fld fa5, 40(sp) +; RV64-NEXT: vfmv.f.s fa4, v10 +; RV64-NEXT: fcvt.l.d a2, fa4 ; RV64-NEXT: vslidedown.vi v8, v8, 2 +; RV64-NEXT: fcvt.l.d a3, fa5 ; RV64-NEXT: vfmv.f.s fa5, v8 +; RV64-NEXT: sd a3, 104(sp) ; RV64-NEXT: fcvt.l.d a3, fa5 +; RV64-NEXT: fld fa5, 32(sp) ; RV64-NEXT: sd a0, 64(sp) ; RV64-NEXT: sd a1, 72(sp) ; RV64-NEXT: sd a3, 80(sp) ; RV64-NEXT: sd a2, 88(sp) +; RV64-NEXT: fcvt.l.d a0, fa5 +; RV64-NEXT: sd a0, 96(sp) ; RV64-NEXT: addi a0, sp, 64 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vle64.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll index 8f1e026d09c0a..356bc5edd77a1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll @@ -88,14 +88,14 @@ define <6 x i1> @load_v6i1(ptr %p) { ; RV32-NEXT: lbu a0, 0(a0) ; RV32-NEXT: srli a1, a0, 5 ; RV32-NEXT: slli a2, a0, 27 -; RV32-NEXT: srli a2, a2, 31 ; RV32-NEXT: slli a3, a0, 28 -; RV32-NEXT: srli a3, a3, 31 ; RV32-NEXT: slli a4, a0, 29 -; RV32-NEXT: srli a4, a4, 31 ; RV32-NEXT: slli a5, a0, 30 -; RV32-NEXT: srli a5, a5, 31 ; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: srli a3, a3, 31 +; RV32-NEXT: srli a4, a4, 31 +; RV32-NEXT: srli a5, a5, 31 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v8, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a5 @@ -113,14 +113,14 @@ define <6 x i1> @load_v6i1(ptr %p) { ; RV64-NEXT: lbu a0, 0(a0) ; RV64-NEXT: srli a1, a0, 5 ; RV64-NEXT: slli a2, a0, 59 -; RV64-NEXT: srli 
a2, a2, 63 ; RV64-NEXT: slli a3, a0, 60 -; RV64-NEXT: srli a3, a3, 63 ; RV64-NEXT: slli a4, a0, 61 -; RV64-NEXT: srli a4, a4, 63 ; RV64-NEXT: slli a5, a0, 62 -; RV64-NEXT: srli a5, a5, 63 ; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: srli a3, a3, 63 +; RV64-NEXT: srli a4, a4, 63 +; RV64-NEXT: srli a5, a5, 63 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 ; RV64-NEXT: vslide1down.vx v8, v8, a5 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll index 0e98fd1ab0f5d..2f58e3dd2769f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll @@ -41,37 +41,37 @@ define <2 x iXLen> @lrint_v2f32(<2 x float> %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vslidedown.vi v9, v8, 1 -; RV32-NEXT: vfmv.f.s fa5, v9 -; RV32-NEXT: fcvt.w.s a0, fa5 ; RV32-NEXT: vfmv.f.s fa5, v8 +; RV32-NEXT: fcvt.w.s a0, fa5 +; RV32-NEXT: vfmv.f.s fa5, v9 ; RV32-NEXT: fcvt.w.s a1, fa5 -; RV32-NEXT: vmv.v.x v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: ret ; ; RV64-i32-LABEL: lrint_v2f32: ; RV64-i32: # %bb.0: ; RV64-i32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-i32-NEXT: vslidedown.vi v9, v8, 1 -; RV64-i32-NEXT: vfmv.f.s fa5, v9 -; RV64-i32-NEXT: fcvt.l.s a0, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 +; RV64-i32-NEXT: fcvt.l.s a0, fa5 +; RV64-i32-NEXT: vfmv.f.s fa5, v9 ; RV64-i32-NEXT: fcvt.l.s a1, fa5 -; RV64-i32-NEXT: vmv.v.x v8, a1 -; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 +; RV64-i32-NEXT: vmv.v.x v8, a0 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a1 ; RV64-i32-NEXT: ret ; ; RV64-i64-LABEL: lrint_v2f32: ; RV64-i64: # %bb.0: ; RV64-i64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV64-i64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-i64-NEXT: vfmv.f.s fa5, v9 -; RV64-i64-NEXT: fcvt.l.s a0, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 +; RV64-i64-NEXT: fcvt.l.s a0, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v9 ; RV64-i64-NEXT: fcvt.l.s a1, fa5 ; RV64-i64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-i64-NEXT: vmv.v.x v8, a1 -; RV64-i64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-i64-NEXT: vmv.v.x v8, a0 +; RV64-i64-NEXT: vslide1down.vx v8, v8, a1 ; RV64-i64-NEXT: ret %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float> %x) ret <2 x iXLen> %a @@ -83,65 +83,63 @@ define <3 x iXLen> @lrint_v3f32(<3 x float> %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vslidedown.vi v9, v8, 1 -; RV32-NEXT: vfmv.f.s fa5, v9 -; RV32-NEXT: fcvt.w.s a0, fa5 ; RV32-NEXT: vfmv.f.s fa5, v8 -; RV32-NEXT: fcvt.w.s a1, fa5 -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vslide1down.vx v9, v9, a0 ; RV32-NEXT: vslidedown.vi v10, v8, 2 +; RV32-NEXT: vslidedown.vi v8, v8, 3 +; RV32-NEXT: fcvt.w.s a0, fa5 +; RV32-NEXT: vfmv.f.s fa5, v9 +; RV32-NEXT: fcvt.w.s a1, fa5 ; RV32-NEXT: vfmv.f.s fa5, v10 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: vslide1down.vx v9, v9, a0 -; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vfmv.f.s fa5, v8 +; RV32-NEXT: vslide1down.vx v8, v9, a1 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: vslide1down.vx v8, v9, a0 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-i32-LABEL: lrint_v3f32: ; RV64-i32: # %bb.0: ; RV64-i32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-i32-NEXT: vslidedown.vi v9, v8, 1 -; RV64-i32-NEXT: vfmv.f.s fa5, v9 -; 
RV64-i32-NEXT: fcvt.l.s a0, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 -; RV64-i32-NEXT: fcvt.l.s a1, fa5 -; RV64-i32-NEXT: vmv.v.x v9, a1 -; RV64-i32-NEXT: vslide1down.vx v9, v9, a0 ; RV64-i32-NEXT: vslidedown.vi v10, v8, 2 +; RV64-i32-NEXT: vslidedown.vi v8, v8, 3 +; RV64-i32-NEXT: fcvt.l.s a0, fa5 +; RV64-i32-NEXT: vfmv.f.s fa5, v9 +; RV64-i32-NEXT: fcvt.l.s a1, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v10 +; RV64-i32-NEXT: vmv.v.x v9, a0 ; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: vslide1down.vx v9, v9, a0 -; RV64-i32-NEXT: vslidedown.vi v8, v8, 3 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 +; RV64-i32-NEXT: vslide1down.vx v8, v9, a1 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: vslide1down.vx v8, v9, a0 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i32-NEXT: ret ; ; RV64-i64-LABEL: lrint_v3f32: ; RV64-i64: # %bb.0: ; RV64-i64-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64-i64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-i64-NEXT: vfmv.f.s fa5, v9 -; RV64-i64-NEXT: fcvt.l.s a0, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 +; RV64-i64-NEXT: vslidedown.vi v10, v8, 2 +; RV64-i64-NEXT: vslidedown.vi v8, v8, 3 +; RV64-i64-NEXT: fcvt.l.s a0, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v9 ; RV64-i64-NEXT: fcvt.l.s a1, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v10 ; RV64-i64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-i64-NEXT: vmv.v.x v10, a1 -; RV64-i64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-i64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-i64-NEXT: vslidedown.vi v9, v8, 2 -; RV64-i64-NEXT: vfmv.f.s fa5, v9 +; RV64-i64-NEXT: vmv.v.x v10, a0 ; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-i64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-i64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-i64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 -; RV64-i64-NEXT: fcvt.l.s a0, fa5 ; RV64-i64-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-i64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-i64-NEXT: vslide1down.vx v8, v10, a1 +; RV64-i64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-i64-NEXT: fcvt.l.s a0, fa5 +; RV64-i64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i64-NEXT: ret %a = call <3 x iXLen> @llvm.lrint.v3iXLen.v3f32(<3 x float> %x) ret <3 x iXLen> %a @@ -153,65 +151,63 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vslidedown.vi v9, v8, 1 -; RV32-NEXT: vfmv.f.s fa5, v9 -; RV32-NEXT: fcvt.w.s a0, fa5 ; RV32-NEXT: vfmv.f.s fa5, v8 -; RV32-NEXT: fcvt.w.s a1, fa5 -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vslide1down.vx v9, v9, a0 ; RV32-NEXT: vslidedown.vi v10, v8, 2 +; RV32-NEXT: vslidedown.vi v8, v8, 3 +; RV32-NEXT: fcvt.w.s a0, fa5 +; RV32-NEXT: vfmv.f.s fa5, v9 +; RV32-NEXT: fcvt.w.s a1, fa5 ; RV32-NEXT: vfmv.f.s fa5, v10 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: vslide1down.vx v9, v9, a0 -; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vfmv.f.s fa5, v8 +; RV32-NEXT: vslide1down.vx v8, v9, a1 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: vslide1down.vx v8, v9, a0 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-i32-LABEL: lrint_v4f32: ; RV64-i32: # %bb.0: ; RV64-i32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-i32-NEXT: vslidedown.vi v9, v8, 1 -; RV64-i32-NEXT: vfmv.f.s fa5, v9 -; RV64-i32-NEXT: fcvt.l.s a0, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 -; RV64-i32-NEXT: fcvt.l.s a1, fa5 -; RV64-i32-NEXT: vmv.v.x v9, a1 -; RV64-i32-NEXT: vslide1down.vx v9, 
v9, a0 ; RV64-i32-NEXT: vslidedown.vi v10, v8, 2 +; RV64-i32-NEXT: vslidedown.vi v8, v8, 3 +; RV64-i32-NEXT: fcvt.l.s a0, fa5 +; RV64-i32-NEXT: vfmv.f.s fa5, v9 +; RV64-i32-NEXT: fcvt.l.s a1, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v10 +; RV64-i32-NEXT: vmv.v.x v9, a0 ; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: vslide1down.vx v9, v9, a0 -; RV64-i32-NEXT: vslidedown.vi v8, v8, 3 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 +; RV64-i32-NEXT: vslide1down.vx v8, v9, a1 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: vslide1down.vx v8, v9, a0 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i32-NEXT: ret ; ; RV64-i64-LABEL: lrint_v4f32: ; RV64-i64: # %bb.0: ; RV64-i64-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64-i64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-i64-NEXT: vfmv.f.s fa5, v9 -; RV64-i64-NEXT: fcvt.l.s a0, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 +; RV64-i64-NEXT: vslidedown.vi v10, v8, 2 +; RV64-i64-NEXT: vslidedown.vi v8, v8, 3 +; RV64-i64-NEXT: fcvt.l.s a0, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v9 ; RV64-i64-NEXT: fcvt.l.s a1, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v10 ; RV64-i64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-i64-NEXT: vmv.v.x v10, a1 -; RV64-i64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-i64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-i64-NEXT: vslidedown.vi v9, v8, 2 -; RV64-i64-NEXT: vfmv.f.s fa5, v9 +; RV64-i64-NEXT: vmv.v.x v10, a0 ; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-i64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-i64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-i64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 -; RV64-i64-NEXT: fcvt.l.s a0, fa5 ; RV64-i64-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-i64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-i64-NEXT: vslide1down.vx v8, v10, a1 +; RV64-i64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-i64-NEXT: fcvt.l.s a0, fa5 +; RV64-i64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i64-NEXT: ret %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float> %x) ret <4 x iXLen> %a @@ -223,82 +219,74 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32-NEXT: vslidedown.vi v10, v8, 1 -; RV32-NEXT: vfmv.f.s fa5, v10 -; RV32-NEXT: fcvt.w.s a0, fa5 ; RV32-NEXT: vfmv.f.s fa5, v8 +; RV32-NEXT: vslidedown.vi v11, v8, 2 +; RV32-NEXT: vslidedown.vi v12, v8, 3 +; RV32-NEXT: fcvt.w.s a0, fa5 +; RV32-NEXT: vfmv.f.s fa5, v10 ; RV32-NEXT: fcvt.w.s a1, fa5 +; RV32-NEXT: vfmv.f.s fa5, v11 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 2 -; RV32-NEXT: vfmv.f.s fa5, v12 +; RV32-NEXT: vmv.v.x v10, a0 ; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 3 ; RV32-NEXT: vfmv.f.s fa5, v12 -; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vslidedown.vi v12, v8, 4 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: fcvt.w.s a1, fa5 ; RV32-NEXT: vfmv.f.s fa5, v12 -; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vslidedown.vi v12, v8, 5 -; RV32-NEXT: vfmv.f.s fa5, v12 -; RV32-NEXT: fcvt.w.s a0, fa5 ; RV32-NEXT: vslide1down.vx v10, v10, a0 +; 
RV32-NEXT: fcvt.w.s a0, fa5 +; RV32-NEXT: vfmv.f.s fa5, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 6 +; RV32-NEXT: vslidedown.vi v8, v8, 7 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: fcvt.w.s a1, fa5 ; RV32-NEXT: vfmv.f.s fa5, v12 -; RV32-NEXT: fcvt.w.s a0, fa5 ; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vslidedown.vi v8, v8, 7 +; RV32-NEXT: fcvt.w.s a0, fa5 ; RV32-NEXT: vfmv.f.s fa5, v8 +; RV32-NEXT: vslide1down.vx v8, v10, a1 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: vslide1down.vx v8, v10, a0 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-i32-LABEL: lrint_v8f32: ; RV64-i32: # %bb.0: ; RV64-i32-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64-i32-NEXT: vslidedown.vi v10, v8, 1 -; RV64-i32-NEXT: vfmv.f.s fa5, v10 -; RV64-i32-NEXT: fcvt.l.s a0, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 +; RV64-i32-NEXT: vslidedown.vi v11, v8, 2 +; RV64-i32-NEXT: vslidedown.vi v12, v8, 3 +; RV64-i32-NEXT: fcvt.l.s a0, fa5 +; RV64-i32-NEXT: vfmv.f.s fa5, v10 ; RV64-i32-NEXT: fcvt.l.s a1, fa5 +; RV64-i32-NEXT: vfmv.f.s fa5, v11 ; RV64-i32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-i32-NEXT: vmv.v.x v10, a1 -; RV64-i32-NEXT: vslide1down.vx v10, v10, a0 -; RV64-i32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-i32-NEXT: vslidedown.vi v12, v8, 2 -; RV64-i32-NEXT: vfmv.f.s fa5, v12 +; RV64-i32-NEXT: vmv.v.x v10, a0 ; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-i32-NEXT: vslide1down.vx v10, v10, a0 -; RV64-i32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-i32-NEXT: vslidedown.vi v12, v8, 3 ; RV64-i32-NEXT: vfmv.f.s fa5, v12 -; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-i32-NEXT: vslide1down.vx v10, v10, a0 ; RV64-i32-NEXT: vslidedown.vi v12, v8, 4 +; RV64-i32-NEXT: vslide1down.vx v10, v10, a1 +; RV64-i32-NEXT: fcvt.l.s a1, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v12 -; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: vslide1down.vx v10, v10, a0 ; RV64-i32-NEXT: vslidedown.vi v12, v8, 5 -; RV64-i32-NEXT: vfmv.f.s fa5, v12 -; RV64-i32-NEXT: fcvt.l.s a0, fa5 ; RV64-i32-NEXT: vslide1down.vx v10, v10, a0 +; RV64-i32-NEXT: fcvt.l.s a0, fa5 +; RV64-i32-NEXT: vfmv.f.s fa5, v12 ; RV64-i32-NEXT: vslidedown.vi v12, v8, 6 +; RV64-i32-NEXT: vslidedown.vi v8, v8, 7 +; RV64-i32-NEXT: vslide1down.vx v10, v10, a1 +; RV64-i32-NEXT: fcvt.l.s a1, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v12 -; RV64-i32-NEXT: fcvt.l.s a0, fa5 ; RV64-i32-NEXT: vslide1down.vx v10, v10, a0 -; RV64-i32-NEXT: vslidedown.vi v8, v8, 7 +; RV64-i32-NEXT: fcvt.l.s a0, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 +; RV64-i32-NEXT: vslide1down.vx v8, v10, a1 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: vslide1down.vx v8, v10, a0 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i32-NEXT: ret ; ; RV64-i64-LABEL: lrint_v8f32: @@ -314,37 +302,37 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) { ; RV64-i64-NEXT: andi sp, sp, -64 ; RV64-i64-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64-i64-NEXT: vfmv.f.s fa5, v8 -; RV64-i64-NEXT: fcvt.l.s a0, fa5 ; RV64-i64-NEXT: vslidedown.vi v10, v8, 7 +; RV64-i64-NEXT: fcvt.l.s a0, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v10 -; RV64-i64-NEXT: fcvt.l.s a1, fa5 ; RV64-i64-NEXT: vslidedown.vi v10, v8, 6 +; RV64-i64-NEXT: fcvt.l.s a1, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v10 -; RV64-i64-NEXT: fcvt.l.s a2, fa5 ; RV64-i64-NEXT: vslidedown.vi v10, v8, 5 +; RV64-i64-NEXT: fcvt.l.s a2, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v10 -; 
RV64-i64-NEXT: fcvt.l.s a3, fa5 ; RV64-i64-NEXT: vslidedown.vi v10, v8, 4 +; RV64-i64-NEXT: fcvt.l.s a3, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v10 +; RV64-i64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-i64-NEXT: vslidedown.vi v9, v8, 3 +; RV64-i64-NEXT: vslidedown.vi v10, v8, 2 +; RV64-i64-NEXT: vslidedown.vi v8, v8, 1 ; RV64-i64-NEXT: fcvt.l.s a4, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v9 +; RV64-i64-NEXT: fcvt.l.s a5, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v10 +; RV64-i64-NEXT: fcvt.l.s a6, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v8 ; RV64-i64-NEXT: sd a4, 32(sp) ; RV64-i64-NEXT: sd a3, 40(sp) ; RV64-i64-NEXT: sd a2, 48(sp) ; RV64-i64-NEXT: sd a1, 56(sp) -; RV64-i64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-i64-NEXT: vslidedown.vi v9, v8, 3 -; RV64-i64-NEXT: vfmv.f.s fa5, v9 ; RV64-i64-NEXT: fcvt.l.s a1, fa5 -; RV64-i64-NEXT: vslidedown.vi v9, v8, 2 -; RV64-i64-NEXT: vfmv.f.s fa5, v9 -; RV64-i64-NEXT: fcvt.l.s a2, fa5 -; RV64-i64-NEXT: vslidedown.vi v8, v8, 1 -; RV64-i64-NEXT: vfmv.f.s fa5, v8 -; RV64-i64-NEXT: fcvt.l.s a3, fa5 ; RV64-i64-NEXT: sd a0, 0(sp) -; RV64-i64-NEXT: sd a3, 8(sp) -; RV64-i64-NEXT: sd a2, 16(sp) -; RV64-i64-NEXT: sd a1, 24(sp) +; RV64-i64-NEXT: sd a1, 8(sp) +; RV64-i64-NEXT: sd a6, 16(sp) +; RV64-i64-NEXT: sd a5, 24(sp) ; RV64-i64-NEXT: mv a0, sp ; RV64-i64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-i64-NEXT: vle64.v v8, (a0) @@ -378,62 +366,62 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) { ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: flw fa5, 60(sp) +; RV32-NEXT: vfmv.f.s fa4, v8 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v10, v8, 3 +; RV32-NEXT: vslidedown.vi v11, v8, 2 ; RV32-NEXT: fcvt.w.s a0, fa5 ; RV32-NEXT: sw a0, 124(sp) ; RV32-NEXT: flw fa5, 56(sp) -; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: sw a0, 120(sp) +; RV32-NEXT: fcvt.w.s a0, fa4 +; RV32-NEXT: vfmv.f.s fa4, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 1 +; RV32-NEXT: fcvt.w.s a1, fa5 +; RV32-NEXT: sw a1, 120(sp) ; RV32-NEXT: flw fa5, 52(sp) -; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: sw a0, 116(sp) +; RV32-NEXT: fcvt.w.s a1, fa4 +; RV32-NEXT: vfmv.f.s fa4, v11 +; RV32-NEXT: fcvt.w.s a2, fa4 +; RV32-NEXT: fcvt.w.s a3, fa5 +; RV32-NEXT: sw a3, 116(sp) ; RV32-NEXT: flw fa5, 48(sp) -; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: sw a0, 112(sp) +; RV32-NEXT: vfmv.f.s fa4, v10 +; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV32-NEXT: vslidedown.vi v10, v8, 7 +; RV32-NEXT: fcvt.w.s a3, fa4 +; RV32-NEXT: fcvt.w.s a4, fa5 +; RV32-NEXT: sw a4, 112(sp) ; RV32-NEXT: flw fa5, 44(sp) -; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: sw a0, 108(sp) +; RV32-NEXT: vfmv.f.s fa4, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 6 +; RV32-NEXT: fcvt.w.s a4, fa4 +; RV32-NEXT: fcvt.w.s a5, fa5 +; RV32-NEXT: sw a5, 108(sp) ; RV32-NEXT: flw fa5, 40(sp) -; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: sw a0, 104(sp) +; RV32-NEXT: vfmv.f.s fa4, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 5 +; RV32-NEXT: fcvt.w.s a5, fa4 +; RV32-NEXT: fcvt.w.s a6, fa5 +; RV32-NEXT: sw a6, 104(sp) ; RV32-NEXT: flw fa5, 36(sp) -; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: sw a0, 100(sp) -; RV32-NEXT: flw fa5, 32(sp) -; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: sw a0, 96(sp) +; RV32-NEXT: vfmv.f.s fa4, v10 +; RV32-NEXT: fcvt.w.s a6, fa4 +; RV32-NEXT: vslidedown.vi v8, v8, 4 +; RV32-NEXT: fcvt.w.s a7, fa5 ; RV32-NEXT: vfmv.f.s fa5, v8 -; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 3 -; RV32-NEXT: vfmv.f.s fa5, 
v10 -; RV32-NEXT: fcvt.w.s a1, fa5 -; RV32-NEXT: vslidedown.vi v10, v8, 2 -; RV32-NEXT: vfmv.f.s fa5, v10 -; RV32-NEXT: fcvt.w.s a2, fa5 -; RV32-NEXT: vslidedown.vi v10, v8, 1 -; RV32-NEXT: vfmv.f.s fa5, v10 -; RV32-NEXT: fcvt.w.s a3, fa5 +; RV32-NEXT: sw a7, 100(sp) +; RV32-NEXT: fcvt.w.s a7, fa5 +; RV32-NEXT: flw fa5, 32(sp) ; RV32-NEXT: sw a0, 64(sp) ; RV32-NEXT: sw a3, 68(sp) ; RV32-NEXT: sw a2, 72(sp) ; RV32-NEXT: sw a1, 76(sp) -; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 7 -; RV32-NEXT: vfmv.f.s fa5, v10 +; RV32-NEXT: sw a7, 80(sp) +; RV32-NEXT: sw a6, 84(sp) +; RV32-NEXT: sw a5, 88(sp) +; RV32-NEXT: sw a4, 92(sp) ; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: vslidedown.vi v10, v8, 6 -; RV32-NEXT: vfmv.f.s fa5, v10 -; RV32-NEXT: fcvt.w.s a1, fa5 -; RV32-NEXT: vslidedown.vi v10, v8, 5 -; RV32-NEXT: vfmv.f.s fa5, v10 -; RV32-NEXT: fcvt.w.s a2, fa5 -; RV32-NEXT: vslidedown.vi v8, v8, 4 -; RV32-NEXT: vfmv.f.s fa5, v8 -; RV32-NEXT: fcvt.w.s a3, fa5 -; RV32-NEXT: sw a3, 80(sp) -; RV32-NEXT: sw a2, 84(sp) -; RV32-NEXT: sw a1, 88(sp) -; RV32-NEXT: sw a0, 92(sp) +; RV32-NEXT: sw a0, 96(sp) ; RV32-NEXT: addi a0, sp, 64 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vle32.v v8, (a0) @@ -462,62 +450,62 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) { ; RV64-i32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-i32-NEXT: vse32.v v8, (a0) ; RV64-i32-NEXT: flw fa5, 60(sp) +; RV64-i32-NEXT: vfmv.f.s fa4, v8 +; RV64-i32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-i32-NEXT: vslidedown.vi v10, v8, 3 +; RV64-i32-NEXT: vslidedown.vi v11, v8, 2 ; RV64-i32-NEXT: fcvt.l.s a0, fa5 ; RV64-i32-NEXT: sw a0, 124(sp) ; RV64-i32-NEXT: flw fa5, 56(sp) -; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: sw a0, 120(sp) +; RV64-i32-NEXT: fcvt.l.s a0, fa4 +; RV64-i32-NEXT: vfmv.f.s fa4, v10 +; RV64-i32-NEXT: vslidedown.vi v10, v8, 1 +; RV64-i32-NEXT: fcvt.l.s a1, fa5 +; RV64-i32-NEXT: sw a1, 120(sp) ; RV64-i32-NEXT: flw fa5, 52(sp) -; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: sw a0, 116(sp) +; RV64-i32-NEXT: fcvt.l.s a1, fa4 +; RV64-i32-NEXT: vfmv.f.s fa4, v11 +; RV64-i32-NEXT: fcvt.l.s a2, fa4 +; RV64-i32-NEXT: fcvt.l.s a3, fa5 +; RV64-i32-NEXT: sw a3, 116(sp) ; RV64-i32-NEXT: flw fa5, 48(sp) -; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: sw a0, 112(sp) +; RV64-i32-NEXT: vfmv.f.s fa4, v10 +; RV64-i32-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-i32-NEXT: vslidedown.vi v10, v8, 7 +; RV64-i32-NEXT: fcvt.l.s a3, fa4 +; RV64-i32-NEXT: fcvt.l.s a4, fa5 +; RV64-i32-NEXT: sw a4, 112(sp) ; RV64-i32-NEXT: flw fa5, 44(sp) -; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: sw a0, 108(sp) +; RV64-i32-NEXT: vfmv.f.s fa4, v10 +; RV64-i32-NEXT: vslidedown.vi v10, v8, 6 +; RV64-i32-NEXT: fcvt.l.s a4, fa4 +; RV64-i32-NEXT: fcvt.l.s a5, fa5 +; RV64-i32-NEXT: sw a5, 108(sp) ; RV64-i32-NEXT: flw fa5, 40(sp) -; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: sw a0, 104(sp) +; RV64-i32-NEXT: vfmv.f.s fa4, v10 +; RV64-i32-NEXT: vslidedown.vi v10, v8, 5 +; RV64-i32-NEXT: fcvt.l.s a5, fa4 +; RV64-i32-NEXT: fcvt.l.s a6, fa5 +; RV64-i32-NEXT: sw a6, 104(sp) ; RV64-i32-NEXT: flw fa5, 36(sp) -; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: sw a0, 100(sp) -; RV64-i32-NEXT: flw fa5, 32(sp) -; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: sw a0, 96(sp) +; RV64-i32-NEXT: vfmv.f.s fa4, v10 +; RV64-i32-NEXT: fcvt.l.s a6, fa4 +; RV64-i32-NEXT: vslidedown.vi v8, v8, 4 +; RV64-i32-NEXT: fcvt.l.s a7, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 -; RV64-i32-NEXT: fcvt.l.s 
a0, fa5 -; RV64-i32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-i32-NEXT: vslidedown.vi v10, v8, 3 -; RV64-i32-NEXT: vfmv.f.s fa5, v10 -; RV64-i32-NEXT: fcvt.l.s a1, fa5 -; RV64-i32-NEXT: vslidedown.vi v10, v8, 2 -; RV64-i32-NEXT: vfmv.f.s fa5, v10 -; RV64-i32-NEXT: fcvt.l.s a2, fa5 -; RV64-i32-NEXT: vslidedown.vi v10, v8, 1 -; RV64-i32-NEXT: vfmv.f.s fa5, v10 -; RV64-i32-NEXT: fcvt.l.s a3, fa5 +; RV64-i32-NEXT: sw a7, 100(sp) +; RV64-i32-NEXT: fcvt.l.s a7, fa5 +; RV64-i32-NEXT: flw fa5, 32(sp) ; RV64-i32-NEXT: sw a0, 64(sp) ; RV64-i32-NEXT: sw a3, 68(sp) ; RV64-i32-NEXT: sw a2, 72(sp) ; RV64-i32-NEXT: sw a1, 76(sp) -; RV64-i32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-i32-NEXT: vslidedown.vi v10, v8, 7 -; RV64-i32-NEXT: vfmv.f.s fa5, v10 +; RV64-i32-NEXT: sw a7, 80(sp) +; RV64-i32-NEXT: sw a6, 84(sp) +; RV64-i32-NEXT: sw a5, 88(sp) +; RV64-i32-NEXT: sw a4, 92(sp) ; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: vslidedown.vi v10, v8, 6 -; RV64-i32-NEXT: vfmv.f.s fa5, v10 -; RV64-i32-NEXT: fcvt.l.s a1, fa5 -; RV64-i32-NEXT: vslidedown.vi v10, v8, 5 -; RV64-i32-NEXT: vfmv.f.s fa5, v10 -; RV64-i32-NEXT: fcvt.l.s a2, fa5 -; RV64-i32-NEXT: vslidedown.vi v8, v8, 4 -; RV64-i32-NEXT: vfmv.f.s fa5, v8 -; RV64-i32-NEXT: fcvt.l.s a3, fa5 -; RV64-i32-NEXT: sw a3, 80(sp) -; RV64-i32-NEXT: sw a2, 84(sp) -; RV64-i32-NEXT: sw a1, 88(sp) -; RV64-i32-NEXT: sw a0, 92(sp) +; RV64-i32-NEXT: sw a0, 96(sp) ; RV64-i32-NEXT: addi a0, sp, 64 ; RV64-i32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-i32-NEXT: vle32.v v8, (a0) @@ -546,62 +534,62 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) { ; RV64-i64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-i64-NEXT: vse32.v v8, (a0) ; RV64-i64-NEXT: flw fa5, 124(sp) +; RV64-i64-NEXT: vfmv.f.s fa4, v8 +; RV64-i64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-i64-NEXT: vslidedown.vi v10, v8, 3 +; RV64-i64-NEXT: vslidedown.vi v11, v8, 2 ; RV64-i64-NEXT: fcvt.l.s a0, fa5 ; RV64-i64-NEXT: sd a0, 248(sp) ; RV64-i64-NEXT: flw fa5, 120(sp) -; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: sd a0, 240(sp) +; RV64-i64-NEXT: vslidedown.vi v12, v8, 1 +; RV64-i64-NEXT: fcvt.l.s a0, fa4 +; RV64-i64-NEXT: vfmv.f.s fa4, v10 +; RV64-i64-NEXT: fcvt.l.s a1, fa5 +; RV64-i64-NEXT: sd a1, 240(sp) ; RV64-i64-NEXT: flw fa5, 116(sp) -; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: sd a0, 232(sp) +; RV64-i64-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-i64-NEXT: vslidedown.vi v14, v8, 7 +; RV64-i64-NEXT: fcvt.l.s a1, fa4 +; RV64-i64-NEXT: vfmv.f.s fa4, v11 +; RV64-i64-NEXT: fcvt.l.s a2, fa5 +; RV64-i64-NEXT: sd a2, 232(sp) ; RV64-i64-NEXT: flw fa5, 112(sp) -; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: sd a0, 224(sp) +; RV64-i64-NEXT: fcvt.l.s a2, fa4 +; RV64-i64-NEXT: vfmv.f.s fa4, v12 +; RV64-i64-NEXT: vslidedown.vi v10, v8, 6 +; RV64-i64-NEXT: fcvt.l.s a3, fa5 +; RV64-i64-NEXT: sd a3, 224(sp) ; RV64-i64-NEXT: flw fa5, 108(sp) -; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: sd a0, 216(sp) +; RV64-i64-NEXT: fcvt.l.s a3, fa4 +; RV64-i64-NEXT: vfmv.f.s fa4, v14 +; RV64-i64-NEXT: vslidedown.vi v12, v8, 5 +; RV64-i64-NEXT: fcvt.l.s a4, fa5 +; RV64-i64-NEXT: sd a4, 216(sp) ; RV64-i64-NEXT: flw fa5, 104(sp) -; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: sd a0, 208(sp) +; RV64-i64-NEXT: fcvt.l.s a4, fa4 +; RV64-i64-NEXT: vfmv.f.s fa4, v10 +; RV64-i64-NEXT: fcvt.l.s a5, fa4 +; RV64-i64-NEXT: fcvt.l.s a6, fa5 +; RV64-i64-NEXT: sd a6, 208(sp) ; RV64-i64-NEXT: flw fa5, 100(sp) -; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: sd a0, 200(sp) -; 
RV64-i64-NEXT: flw fa5, 96(sp) -; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: sd a0, 192(sp) +; RV64-i64-NEXT: vfmv.f.s fa4, v12 +; RV64-i64-NEXT: fcvt.l.s a6, fa4 +; RV64-i64-NEXT: vslidedown.vi v8, v8, 4 +; RV64-i64-NEXT: fcvt.l.s a7, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 -; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-i64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-i64-NEXT: vfmv.f.s fa5, v10 -; RV64-i64-NEXT: fcvt.l.s a1, fa5 -; RV64-i64-NEXT: vslidedown.vi v10, v8, 2 -; RV64-i64-NEXT: vfmv.f.s fa5, v10 -; RV64-i64-NEXT: fcvt.l.s a2, fa5 -; RV64-i64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-i64-NEXT: vfmv.f.s fa5, v10 -; RV64-i64-NEXT: fcvt.l.s a3, fa5 +; RV64-i64-NEXT: sd a7, 200(sp) +; RV64-i64-NEXT: fcvt.l.s a7, fa5 +; RV64-i64-NEXT: flw fa5, 96(sp) ; RV64-i64-NEXT: sd a0, 128(sp) ; RV64-i64-NEXT: sd a3, 136(sp) ; RV64-i64-NEXT: sd a2, 144(sp) ; RV64-i64-NEXT: sd a1, 152(sp) -; RV64-i64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-i64-NEXT: vslidedown.vi v10, v8, 7 -; RV64-i64-NEXT: vfmv.f.s fa5, v10 +; RV64-i64-NEXT: sd a7, 160(sp) +; RV64-i64-NEXT: sd a6, 168(sp) +; RV64-i64-NEXT: sd a5, 176(sp) +; RV64-i64-NEXT: sd a4, 184(sp) ; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: vslidedown.vi v10, v8, 6 -; RV64-i64-NEXT: vfmv.f.s fa5, v10 -; RV64-i64-NEXT: fcvt.l.s a1, fa5 -; RV64-i64-NEXT: vslidedown.vi v10, v8, 5 -; RV64-i64-NEXT: vfmv.f.s fa5, v10 -; RV64-i64-NEXT: fcvt.l.s a2, fa5 -; RV64-i64-NEXT: vslidedown.vi v8, v8, 4 -; RV64-i64-NEXT: vfmv.f.s fa5, v8 -; RV64-i64-NEXT: fcvt.l.s a3, fa5 -; RV64-i64-NEXT: sd a3, 160(sp) -; RV64-i64-NEXT: sd a2, 168(sp) -; RV64-i64-NEXT: sd a1, 176(sp) -; RV64-i64-NEXT: sd a0, 184(sp) +; RV64-i64-NEXT: sd a0, 192(sp) ; RV64-i64-NEXT: addi a0, sp, 128 ; RV64-i64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-i64-NEXT: vle64.v v8, (a0) @@ -653,38 +641,38 @@ define <2 x iXLen> @lrint_v2f64(<2 x double> %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vslidedown.vi v9, v8, 1 -; RV32-NEXT: vfmv.f.s fa5, v9 -; RV32-NEXT: fcvt.w.d a0, fa5 ; RV32-NEXT: vfmv.f.s fa5, v8 +; RV32-NEXT: fcvt.w.d a0, fa5 +; RV32-NEXT: vfmv.f.s fa5, v9 ; RV32-NEXT: fcvt.w.d a1, fa5 ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: ret ; ; RV64-i32-LABEL: lrint_v2f64: ; RV64-i32: # %bb.0: ; RV64-i32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-i32-NEXT: vslidedown.vi v9, v8, 1 -; RV64-i32-NEXT: vfmv.f.s fa5, v9 -; RV64-i32-NEXT: fcvt.l.d a0, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 +; RV64-i32-NEXT: fcvt.l.d a0, fa5 +; RV64-i32-NEXT: vfmv.f.s fa5, v9 ; RV64-i32-NEXT: fcvt.l.d a1, fa5 ; RV64-i32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-i32-NEXT: vmv.v.x v8, a1 -; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 +; RV64-i32-NEXT: vmv.v.x v8, a0 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a1 ; RV64-i32-NEXT: ret ; ; RV64-i64-LABEL: lrint_v2f64: ; RV64-i64: # %bb.0: ; RV64-i64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-i64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-i64-NEXT: vfmv.f.s fa5, v9 -; RV64-i64-NEXT: fcvt.l.d a0, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 +; RV64-i64-NEXT: fcvt.l.d a0, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v9 ; RV64-i64-NEXT: fcvt.l.d a1, fa5 -; RV64-i64-NEXT: vmv.v.x v8, a1 -; RV64-i64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-i64-NEXT: vmv.v.x v8, a0 +; RV64-i64-NEXT: vslide1down.vx v8, v8, a1 ; RV64-i64-NEXT: ret %a = call <2 x iXLen> 
@llvm.lrint.v2iXLen.v2f64(<2 x double> %x) ret <2 x iXLen> %a @@ -696,71 +684,70 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vslidedown.vi v10, v8, 1 -; RV32-NEXT: vfmv.f.s fa5, v10 -; RV32-NEXT: fcvt.w.d a0, fa5 ; RV32-NEXT: vfmv.f.s fa5, v8 -; RV32-NEXT: fcvt.w.d a1, fa5 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV32-NEXT: vslidedown.vi v12, v8, 2 +; RV32-NEXT: vslidedown.vi v8, v8, 3 +; RV32-NEXT: fcvt.w.d a0, fa5 +; RV32-NEXT: vfmv.f.s fa5, v10 +; RV32-NEXT: fcvt.w.d a1, fa5 ; RV32-NEXT: vfmv.f.s fa5, v12 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: fcvt.w.d a0, fa5 -; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vfmv.f.s fa5, v8 -; RV32-NEXT: fcvt.w.d a0, fa5 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32-NEXT: vslide1down.vx v8, v10, a0 +; RV32-NEXT: vslide1down.vx v8, v9, a1 +; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: fcvt.w.d a0, fa5 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-i32-LABEL: lrint_v4f64: ; RV64-i32: # %bb.0: ; RV64-i32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-i32-NEXT: vslidedown.vi v10, v8, 1 -; RV64-i32-NEXT: vfmv.f.s fa5, v10 -; RV64-i32-NEXT: fcvt.l.d a0, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 -; RV64-i32-NEXT: fcvt.l.d a1, fa5 -; RV64-i32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-i32-NEXT: vmv.v.x v10, a1 -; RV64-i32-NEXT: vslide1down.vx v10, v10, a0 -; RV64-i32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-i32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-i32-NEXT: vslidedown.vi v12, v8, 2 +; RV64-i32-NEXT: vslidedown.vi v8, v8, 3 +; RV64-i32-NEXT: fcvt.l.d a0, fa5 +; RV64-i32-NEXT: vfmv.f.s fa5, v10 +; RV64-i32-NEXT: fcvt.l.d a1, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v12 +; RV64-i32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-i32-NEXT: vmv.v.x v9, a0 ; RV64-i32-NEXT: fcvt.l.d a0, fa5 -; RV64-i32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-i32-NEXT: vslide1down.vx v10, v10, a0 ; RV64-i32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-i32-NEXT: vslidedown.vi v8, v8, 3 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 -; RV64-i32-NEXT: fcvt.l.d a0, fa5 ; RV64-i32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-i32-NEXT: vslide1down.vx v8, v10, a0 +; RV64-i32-NEXT: vslide1down.vx v8, v9, a1 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 +; RV64-i32-NEXT: fcvt.l.d a0, fa5 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i32-NEXT: ret ; ; RV64-i64-LABEL: lrint_v4f64: ; RV64-i64: # %bb.0: ; RV64-i64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-i64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-i64-NEXT: vfmv.f.s fa5, v10 -; RV64-i64-NEXT: fcvt.l.d a0, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 -; RV64-i64-NEXT: fcvt.l.d a1, fa5 -; RV64-i64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-i64-NEXT: vmv.v.x v10, a1 -; RV64-i64-NEXT: vslide1down.vx v10, v10, a0 +; RV64-i64-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-i64-NEXT: vslidedown.vi v12, v8, 2 +; RV64-i64-NEXT: vslidedown.vi v8, v8, 3 +; RV64-i64-NEXT: fcvt.l.d a0, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v10 +; RV64-i64-NEXT: fcvt.l.d a1, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v12 +; RV64-i64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; 
RV64-i64-NEXT: vmv.v.x v10, a0 ; RV64-i64-NEXT: fcvt.l.d a0, fa5 -; RV64-i64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-i64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 +; RV64-i64-NEXT: vslide1down.vx v8, v10, a1 +; RV64-i64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i64-NEXT: fcvt.l.d a0, fa5 -; RV64-i64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-i64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i64-NEXT: ret %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double> %x) ret <4 x iXLen> %a @@ -780,21 +767,22 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: mv a0, sp -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 1 -; RV32-NEXT: vfmv.f.s fa5, v10 -; RV32-NEXT: fcvt.w.d a0, fa5 +; RV32-NEXT: vslidedown.vi v12, v8, 1 ; RV32-NEXT: vfmv.f.s fa5, v8 -; RV32-NEXT: fcvt.w.d a1, fa5 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 2 -; RV32-NEXT: vfmv.f.s fa5, v10 -; RV32-NEXT: fcvt.w.d a2, fa5 +; RV32-NEXT: vslidedown.vi v14, v8, 2 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV32-NEXT: vslidedown.vi v8, v8, 3 -; RV32-NEXT: vfmv.f.s fa5, v8 -; RV32-NEXT: fcvt.w.d a3, fa5 +; RV32-NEXT: vfmv.f.s fa4, v12 +; RV32-NEXT: fcvt.w.d a0, fa5 +; RV32-NEXT: vfmv.f.s fa5, v14 +; RV32-NEXT: vfmv.f.s fa3, v8 +; RV32-NEXT: fcvt.w.d a1, fa4 +; RV32-NEXT: fcvt.w.d a2, fa5 +; RV32-NEXT: fcvt.w.d a3, fa3 ; RV32-NEXT: fld fa5, 32(sp) ; RV32-NEXT: fld fa4, 40(sp) ; RV32-NEXT: fld fa3, 48(sp) @@ -803,8 +791,8 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV32-NEXT: fcvt.w.d a5, fa4 ; RV32-NEXT: fcvt.w.d a6, fa3 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a2 ; RV32-NEXT: vslide1down.vx v8, v8, a3 ; RV32-NEXT: vslide1down.vx v8, v8, a4 @@ -834,21 +822,22 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV64-i32-NEXT: .cfi_def_cfa s0, 0 ; RV64-i32-NEXT: andi sp, sp, -64 ; RV64-i32-NEXT: mv a0, sp -; RV64-i32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-i32-NEXT: vse64.v v8, (a0) ; RV64-i32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-i32-NEXT: vslidedown.vi v10, v8, 1 -; RV64-i32-NEXT: vfmv.f.s fa5, v10 -; RV64-i32-NEXT: fcvt.l.d a0, fa5 +; RV64-i32-NEXT: vslidedown.vi v12, v8, 1 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 -; RV64-i32-NEXT: fcvt.l.d a1, fa5 ; RV64-i32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-i32-NEXT: vslidedown.vi v10, v8, 2 -; RV64-i32-NEXT: vfmv.f.s fa5, v10 -; RV64-i32-NEXT: fcvt.l.d a2, fa5 +; RV64-i32-NEXT: vslidedown.vi v14, v8, 2 +; RV64-i32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-i32-NEXT: vse64.v v8, (a0) +; RV64-i32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-i32-NEXT: vslidedown.vi v8, v8, 3 -; RV64-i32-NEXT: vfmv.f.s fa5, v8 -; RV64-i32-NEXT: fcvt.l.d a3, fa5 +; RV64-i32-NEXT: vfmv.f.s fa4, v12 +; RV64-i32-NEXT: fcvt.l.d a0, fa5 +; RV64-i32-NEXT: vfmv.f.s fa5, v14 +; RV64-i32-NEXT: vfmv.f.s fa3, v8 +; RV64-i32-NEXT: fcvt.l.d a1, fa4 +; RV64-i32-NEXT: fcvt.l.d a2, fa5 +; RV64-i32-NEXT: fcvt.l.d a3, fa3 ; RV64-i32-NEXT: fld fa5, 32(sp) ; RV64-i32-NEXT: fld fa4, 40(sp) ; RV64-i32-NEXT: fld fa3, 48(sp) @@ -857,8 +846,8 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; 
RV64-i32-NEXT: fcvt.l.d a5, fa4 ; RV64-i32-NEXT: fcvt.l.d a6, fa3 ; RV64-i32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-i32-NEXT: vmv.v.x v8, a1 -; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 +; RV64-i32-NEXT: vmv.v.x v8, a0 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a1 ; RV64-i32-NEXT: vslide1down.vx v8, v8, a2 ; RV64-i32-NEXT: vslide1down.vx v8, v8, a3 ; RV64-i32-NEXT: vslide1down.vx v8, v8, a4 @@ -891,34 +880,34 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV64-i64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-i64-NEXT: vse64.v v8, (a0) ; RV64-i64-NEXT: fld fa5, 56(sp) -; RV64-i64-NEXT: fcvt.l.d a0, fa5 -; RV64-i64-NEXT: sd a0, 120(sp) -; RV64-i64-NEXT: fld fa5, 48(sp) -; RV64-i64-NEXT: fcvt.l.d a0, fa5 -; RV64-i64-NEXT: sd a0, 112(sp) -; RV64-i64-NEXT: fld fa5, 40(sp) -; RV64-i64-NEXT: fcvt.l.d a0, fa5 -; RV64-i64-NEXT: sd a0, 104(sp) -; RV64-i64-NEXT: fld fa5, 32(sp) -; RV64-i64-NEXT: fcvt.l.d a0, fa5 -; RV64-i64-NEXT: sd a0, 96(sp) -; RV64-i64-NEXT: vfmv.f.s fa5, v8 -; RV64-i64-NEXT: fcvt.l.d a0, fa5 +; RV64-i64-NEXT: vfmv.f.s fa4, v8 ; RV64-i64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-i64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-i64-NEXT: vfmv.f.s fa5, v10 +; RV64-i64-NEXT: fcvt.l.d a0, fa4 ; RV64-i64-NEXT: fcvt.l.d a1, fa5 +; RV64-i64-NEXT: sd a1, 120(sp) +; RV64-i64-NEXT: fld fa5, 48(sp) +; RV64-i64-NEXT: vfmv.f.s fa4, v10 ; RV64-i64-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-i64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-i64-NEXT: vfmv.f.s fa5, v10 +; RV64-i64-NEXT: fcvt.l.d a1, fa4 ; RV64-i64-NEXT: fcvt.l.d a2, fa5 +; RV64-i64-NEXT: sd a2, 112(sp) +; RV64-i64-NEXT: fld fa5, 40(sp) +; RV64-i64-NEXT: vfmv.f.s fa4, v10 +; RV64-i64-NEXT: fcvt.l.d a2, fa4 ; RV64-i64-NEXT: vslidedown.vi v8, v8, 2 +; RV64-i64-NEXT: fcvt.l.d a3, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 +; RV64-i64-NEXT: sd a3, 104(sp) ; RV64-i64-NEXT: fcvt.l.d a3, fa5 +; RV64-i64-NEXT: fld fa5, 32(sp) ; RV64-i64-NEXT: sd a0, 64(sp) ; RV64-i64-NEXT: sd a1, 72(sp) ; RV64-i64-NEXT: sd a3, 80(sp) ; RV64-i64-NEXT: sd a2, 88(sp) +; RV64-i64-NEXT: fcvt.l.d a0, fa5 +; RV64-i64-NEXT: sd a0, 96(sp) ; RV64-i64-NEXT: addi a0, sp, 64 ; RV64-i64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-i64-NEXT: vle64.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll index 5b9af1a3cfe23..c29ccd45528b8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll @@ -282,11 +282,11 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) { ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vslide1down.vx v9, v8, a0 ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: vslide1down.vx v9, v9, a0 -; CHECK-NEXT: vslide1down.vx v9, v9, a1 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslide1down.vx v9, v9, a0 ; CHECK-NEXT: vslide1down.vx v8, v8, zero -; CHECK-NEXT: vmv.v.i v0, 15 +; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 1 @@ -299,11 +299,11 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) { ; ZVE32F-NEXT: vmv.v.x v8, a0 ; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 ; ZVE32F-NEXT: li a0, 1 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a0 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 +; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a0 ; ZVE32F-NEXT: 
vslide1down.vx v8, v8, zero -; ZVE32F-NEXT: vmv.v.i v0, 15 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; ZVE32F-NEXT: vand.vi v8, v8, 1 @@ -327,11 +327,11 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 % ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vslide1down.vx v9, v8, a0 ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: vslide1down.vx v9, v9, a0 -; CHECK-NEXT: vslide1down.vx v9, v9, a1 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslide1down.vx v9, v9, a0 ; CHECK-NEXT: vslide1down.vx v8, v8, zero -; CHECK-NEXT: vmv.v.i v0, 15 +; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 1 @@ -344,11 +344,11 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 % ; ZVE32F-NEXT: vmv.v.x v8, a0 ; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 ; ZVE32F-NEXT: li a0, 1 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a0 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 +; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a0 ; ZVE32F-NEXT: vslide1down.vx v8, v8, zero -; ZVE32F-NEXT: vmv.v.i v0, 15 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; ZVE32F-NEXT: vand.vi v8, v8, 1 @@ -370,12 +370,12 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslide1down.vx v9, v8, a0 -; CHECK-NEXT: vslide1down.vx v9, v9, a1 -; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 -; CHECK-NEXT: vmv.v.i v0, 15 +; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 1 @@ -386,12 +386,12 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize { ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; ZVE32F-NEXT: vmv.v.x v8, a0 +; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; ZVE32F-NEXT: vmv.v.i v0, 15 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 ; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; ZVE32F-NEXT: vand.vi v8, v8, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index bb4589a46bf41..6cc3f7e76797b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -345,14 +345,14 @@ define <2 x i64> @mgather_v2i8_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x ; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: srai a2, a1, 31 -; RV32ZVE32F-NEXT: vmv.x.s a3, v9 -; RV32ZVE32F-NEXT: srai a4, a3, 31 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) 
-; RV32ZVE32F-NEXT: sw a1, 8(a0) -; RV32ZVE32F-NEXT: sw a2, 12(a0) +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: srai a3, a1, 31 +; RV32ZVE32F-NEXT: srai a4, a2, 31 +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i64: @@ -406,15 +406,15 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: sw zero, 12(a0) ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: andi a1, a1, 255 -; RV32ZVE32F-NEXT: vmv.x.s a2, v9 ; RV32ZVE32F-NEXT: andi a2, a2, 255 -; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) ; RV32ZVE32F-NEXT: sw zero, 4(a0) -; RV32ZVE32F-NEXT: sw a1, 8(a0) -; RV32ZVE32F-NEXT: sw zero, 12(a0) +; RV32ZVE32F-NEXT: sw a2, 8(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i64: @@ -439,8 +439,8 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x ; RV64ZVE32F-NEXT: .LBB7_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: andi a0, a0, 255 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: andi a0, a0, 255 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-NEXT: andi a1, a1, 255 ; RV64ZVE32F-NEXT: ret @@ -1038,14 +1038,14 @@ define <2 x i64> @mgather_v2i16_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: srai a2, a1, 31 -; RV32ZVE32F-NEXT: vmv.x.s a3, v9 -; RV32ZVE32F-NEXT: srai a4, a3, 31 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) -; RV32ZVE32F-NEXT: sw a1, 8(a0) -; RV32ZVE32F-NEXT: sw a2, 12(a0) +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: srai a3, a1, 31 +; RV32ZVE32F-NEXT: srai a4, a2, 31 +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i64: @@ -1097,15 +1097,15 @@ define <2 x i64> @mgather_v2i16_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; ; RV32ZVE32F-LABEL: mgather_v2i16_zextload_v2i64: ; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: lui a1, 16 ; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: addi a1, a1, -1 ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: lui a2, 16 -; RV32ZVE32F-NEXT: addi a2, a2, -1 -; RV32ZVE32F-NEXT: and a1, a1, a2 -; RV32ZVE32F-NEXT: vmv.x.s a3, v9 -; RV32ZVE32F-NEXT: and a2, a3, a2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v9 +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: and a2, a2, a1 +; RV32ZVE32F-NEXT: and a1, a3, a1 ; RV32ZVE32F-NEXT: sw a2, 0(a0) ; RV32ZVE32F-NEXT: sw zero, 4(a0) ; RV32ZVE32F-NEXT: sw a1, 8(a0) @@ -1135,10 +1135,10 @@ define <2 x i64> @mgather_v2i16_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a0, 
v8 ; RV64ZVE32F-NEXT: lui a1, 16 -; RV64ZVE32F-NEXT: addiw a1, a1, -1 -; RV64ZVE32F-NEXT: and a0, a0, a1 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: addiw a1, a1, -1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: and a0, a0, a1 ; RV64ZVE32F-NEXT: and a1, a2, a1 ; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) @@ -2096,19 +2096,19 @@ define <2 x i64> @mgather_v2i32_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; ; RV32ZVE32F-LABEL: mgather_v2i32_sextload_v2i64: ; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi a1, a0, 8 ; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: srai a1, a1, 31 ; RV32ZVE32F-NEXT: vmv.x.s a2, v9 -; RV32ZVE32F-NEXT: srai a2, a2, 31 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vse32.v v9, (a0) -; RV32ZVE32F-NEXT: addi a3, a0, 8 -; RV32ZVE32F-NEXT: vse32.v v8, (a3) +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: srai a2, a2, 31 +; RV32ZVE32F-NEXT: vse32.v v8, (a1) +; RV32ZVE32F-NEXT: srai a3, a3, 31 ; RV32ZVE32F-NEXT: sw a2, 4(a0) -; RV32ZVE32F-NEXT: sw a1, 12(a0) +; RV32ZVE32F-NEXT: sw a3, 12(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v2i32_sextload_v2i64: @@ -2160,15 +2160,15 @@ define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; ; RV32ZVE32F-LABEL: mgather_v2i32_zextload_v2i64: ; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi a1, a0, 8 ; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 ; RV32ZVE32F-NEXT: sw zero, 4(a0) ; RV32ZVE32F-NEXT: sw zero, 12(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vse32.v v9, (a0) -; RV32ZVE32F-NEXT: addi a0, a0, 8 -; RV32ZVE32F-NEXT: vse32.v v8, (a0) +; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV32ZVE32F-NEXT: vse32.v v8, (a1) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v2i32_zextload_v2i64: @@ -2193,10 +2193,10 @@ define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; RV64ZVE32F-NEXT: .LBB30_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: slli a0, a0, 32 -; RV64ZVE32F-NEXT: srli a0, a0, 32 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: srli a0, a0, 32 ; RV64ZVE32F-NEXT: slli a1, a1, 32 ; RV64ZVE32F-NEXT: srli a1, a1, 32 ; RV64ZVE32F-NEXT: ret @@ -3776,28 +3776,28 @@ define <4 x i64> @mgather_truemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru) { ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: lw a2, 0(a1) -; RV32ZVE32F-NEXT: lw a1, 4(a1) ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a3, v9 -; RV32ZVE32F-NEXT: lw a4, 0(a3) -; RV32ZVE32F-NEXT: lw a3, 4(a3) +; RV32ZVE32F-NEXT: vmv.x.s a2, v9 ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a5, v9 -; RV32ZVE32F-NEXT: lw a6, 0(a5) -; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vmv.x.s a7, v8 -; RV32ZVE32F-NEXT: lw t0, 0(a7) -; RV32ZVE32F-NEXT: lw a7, 4(a7) -; RV32ZVE32F-NEXT: sw a6, 16(a0) -; RV32ZVE32F-NEXT: sw a5, 20(a0) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; 
RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: vmv.x.s a4, v9 +; RV32ZVE32F-NEXT: vmv.x.s a5, v8 +; RV32ZVE32F-NEXT: lw a6, 0(a2) +; RV32ZVE32F-NEXT: lw a2, 4(a2) +; RV32ZVE32F-NEXT: lw a7, 0(a4) +; RV32ZVE32F-NEXT: lw a4, 4(a4) +; RV32ZVE32F-NEXT: lw t0, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw a4, 20(a0) ; RV32ZVE32F-NEXT: sw t0, 24(a0) -; RV32ZVE32F-NEXT: sw a7, 28(a0) -; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a5, 28(a0) +; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a4, 8(a0) -; RV32ZVE32F-NEXT: sw a3, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 8(a0) +; RV32ZVE32F-NEXT: sw a2, 12(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i64: @@ -4132,11 +4132,11 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: beqz a3, .LBB48_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load @@ -4409,11 +4409,11 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: beqz a3, .LBB49_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load @@ -4688,11 +4688,11 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vzext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: beqz a3, .LBB50_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load @@ -4974,11 +4974,11 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: beqz a3, .LBB51_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load @@ -5252,11 +5252,11 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli 
zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: beqz a3, .LBB52_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load @@ -5532,11 +5532,11 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: beqz a3, .LBB53_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load @@ -6666,6 +6666,9 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: lw a2, 24(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.v.x v8, t0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 @@ -6674,10 +6677,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 ; RV32ZVE32F-NEXT: andi a2, t0, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: beqz a2, .LBB57_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load @@ -6813,89 +6813,89 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV64ZVE32F-LABEL: mgather_baseidx_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi a4, a6, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v0 +; RV64ZVE32F-NEXT: andi a4, a7, 1 ; RV64ZVE32F-NEXT: beqz a4, .LBB57_9 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: ld a4, 0(a2) ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: andi a5, a6, 2 +; RV64ZVE32F-NEXT: andi a5, a7, 2 ; RV64ZVE32F-NEXT: bnez a5, .LBB57_10 ; RV64ZVE32F-NEXT: .LBB57_2: ; RV64ZVE32F-NEXT: ld a5, 8(a3) -; RV64ZVE32F-NEXT: andi a7, a6, 4 -; RV64ZVE32F-NEXT: bnez a7, .LBB57_11 +; RV64ZVE32F-NEXT: andi a6, a7, 4 +; RV64ZVE32F-NEXT: bnez a6, .LBB57_11 ; RV64ZVE32F-NEXT: .LBB57_3: -; RV64ZVE32F-NEXT: ld a7, 16(a3) -; RV64ZVE32F-NEXT: andi t0, a6, 8 +; RV64ZVE32F-NEXT: ld a6, 16(a3) +; RV64ZVE32F-NEXT: andi t0, a7, 8 ; RV64ZVE32F-NEXT: bnez t0, .LBB57_12 ; RV64ZVE32F-NEXT: .LBB57_4: ; RV64ZVE32F-NEXT: ld t0, 24(a3) -; RV64ZVE32F-NEXT: andi t1, a6, 16 +; RV64ZVE32F-NEXT: andi t1, a7, 16 ; RV64ZVE32F-NEXT: bnez t1, .LBB57_13 ; RV64ZVE32F-NEXT: .LBB57_5: ; RV64ZVE32F-NEXT: ld t1, 32(a3) -; RV64ZVE32F-NEXT: andi t2, a6, 32 +; RV64ZVE32F-NEXT: andi t2, a7, 32 ; RV64ZVE32F-NEXT: bnez t2, .LBB57_14 ; RV64ZVE32F-NEXT: .LBB57_6: ; RV64ZVE32F-NEXT: ld t2, 40(a3) -; RV64ZVE32F-NEXT: andi t3, a6, 64 +; 
RV64ZVE32F-NEXT: andi t3, a7, 64 ; RV64ZVE32F-NEXT: bnez t3, .LBB57_15 ; RV64ZVE32F-NEXT: .LBB57_7: ; RV64ZVE32F-NEXT: ld t3, 48(a3) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: bnez a6, .LBB57_16 +; RV64ZVE32F-NEXT: andi a7, a7, -128 +; RV64ZVE32F-NEXT: bnez a7, .LBB57_16 ; RV64ZVE32F-NEXT: .LBB57_8: ; RV64ZVE32F-NEXT: ld a1, 56(a3) ; RV64ZVE32F-NEXT: j .LBB57_17 ; RV64ZVE32F-NEXT: .LBB57_9: ; RV64ZVE32F-NEXT: ld a4, 0(a3) -; RV64ZVE32F-NEXT: andi a5, a6, 2 +; RV64ZVE32F-NEXT: andi a5, a7, 2 ; RV64ZVE32F-NEXT: beqz a5, .LBB57_2 ; RV64ZVE32F-NEXT: .LBB57_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: slli a5, a5, 3 ; RV64ZVE32F-NEXT: add a5, a1, a5 ; RV64ZVE32F-NEXT: ld a5, 0(a5) -; RV64ZVE32F-NEXT: andi a7, a6, 4 -; RV64ZVE32F-NEXT: beqz a7, .LBB57_3 +; RV64ZVE32F-NEXT: andi a6, a7, 4 +; RV64ZVE32F-NEXT: beqz a6, .LBB57_3 ; RV64ZVE32F-NEXT: .LBB57_11: # %cond.load4 -; RV64ZVE32F-NEXT: ld a7, 16(a2) -; RV64ZVE32F-NEXT: slli a7, a7, 3 -; RV64ZVE32F-NEXT: add a7, a1, a7 -; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a6, 8 +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi t0, a7, 8 ; RV64ZVE32F-NEXT: beqz t0, .LBB57_4 ; RV64ZVE32F-NEXT: .LBB57_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld t0, 24(a2) ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a6, 16 +; RV64ZVE32F-NEXT: andi t1, a7, 16 ; RV64ZVE32F-NEXT: beqz t1, .LBB57_5 ; RV64ZVE32F-NEXT: .LBB57_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld t1, 32(a2) ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: andi t2, a6, 32 +; RV64ZVE32F-NEXT: andi t2, a7, 32 ; RV64ZVE32F-NEXT: beqz t2, .LBB57_6 ; RV64ZVE32F-NEXT: .LBB57_14: # %cond.load13 ; RV64ZVE32F-NEXT: ld t2, 40(a2) ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi t3, a6, 64 +; RV64ZVE32F-NEXT: andi t3, a7, 64 ; RV64ZVE32F-NEXT: beqz t3, .LBB57_7 ; RV64ZVE32F-NEXT: .LBB57_15: # %cond.load16 ; RV64ZVE32F-NEXT: ld t3, 48(a2) ; RV64ZVE32F-NEXT: slli t3, t3, 3 ; RV64ZVE32F-NEXT: add t3, a1, t3 ; RV64ZVE32F-NEXT: ld t3, 0(t3) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: beqz a6, .LBB57_8 +; RV64ZVE32F-NEXT: andi a7, a7, -128 +; RV64ZVE32F-NEXT: beqz a7, .LBB57_8 ; RV64ZVE32F-NEXT: .LBB57_16: # %cond.load19 ; RV64ZVE32F-NEXT: ld a2, 56(a2) ; RV64ZVE32F-NEXT: slli a2, a2, 3 @@ -6904,7 +6904,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: .LBB57_17: # %else20 ; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: sd a5, 8(a0) -; RV64ZVE32F-NEXT: sd a7, 16(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd t0, 24(a0) ; RV64ZVE32F-NEXT: sd t1, 32(a0) ; RV64ZVE32F-NEXT: sd t2, 40(a0) @@ -11097,14 +11097,14 @@ define <4 x double> @mgather_truemask_v4f64(<4 x ptr> %ptrs, <4 x double> %passt ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fld fa5, 0(a1) ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: fld fa5, 0(a1) ; RV32ZVE32F-NEXT: vmv.x.s a1, v9 -; RV32ZVE32F-NEXT: fld fa4, 0(a1) ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: fld fa4, 0(a1) ; RV32ZVE32F-NEXT: vmv.x.s a1, v9 ; RV32ZVE32F-NEXT: fld fa3, 
0(a1) -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fld fa2, 0(a1) ; RV32ZVE32F-NEXT: fsd fa5, 0(a0) @@ -11375,11 +11375,11 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez a3, .LBB97_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -11590,11 +11590,11 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez a3, .LBB98_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -11807,11 +11807,11 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vzext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez a3, .LBB99_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -12031,11 +12031,11 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez a3, .LBB100_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -12247,11 +12247,11 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez a3, .LBB101_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -12465,11 +12465,11 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: 
vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez a3, .LBB102_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -13348,21 +13348,21 @@ define <8 x double> @mgather_baseidx_v8f64(ptr %base, <8 x i64> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: lw a7, 0(a2) ; RV32ZVE32F-NEXT: lw t0, 8(a2) ; RV32ZVE32F-NEXT: lw t1, 16(a2) -; RV32ZVE32F-NEXT: lw a2, 24(a2) +; RV32ZVE32F-NEXT: lw t2, 24(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.v.x v8, a7 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vmv.x.s a2, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a2, v0 ; RV32ZVE32F-NEXT: andi a3, a2, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez a3, .LBB106_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -13807,14 +13807,14 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64V-NEXT: vsext.vf8 v16, v8 ; RV64V-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; RV64V-NEXT: vslidedown.vi v12, v10, 16 +; RV64V-NEXT: vslidedown.vi v14, v8, 16 +; RV64V-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64V-NEXT: vslidedown.vi v8, v0, 2 ; RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64V-NEXT: vluxei64.v v10, (a0), v16, v0.t -; RV64V-NEXT: vsetivli zero, 16, e8, m2, ta, ma -; RV64V-NEXT: vslidedown.vi v8, v8, 16 -; RV64V-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64V-NEXT: vslidedown.vi v0, v0, 2 -; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64V-NEXT: vsext.vf8 v16, v14 +; RV64V-NEXT: vmv1r.v v0, v8 ; RV64V-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t ; RV64V-NEXT: li a0, 32 @@ -14384,65 +14384,65 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vadd.vx v8, v8, a0 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: lbu a1, 0(a0) -; RV32-NEXT: lbu a0, 1(a0) ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32-NEXT: vslidedown.vi v10, v8, 1 -; RV32-NEXT: vmv.x.s a2, v10 -; RV32-NEXT: lbu a3, 1(a2) -; RV32-NEXT: lbu a2, 0(a2) -; RV32-NEXT: slli a0, a0, 8 -; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: slli a3, a3, 8 -; RV32-NEXT: vslidedown.vi v10, v8, 2 +; RV32-NEXT: vslidedown.vi v11, v8, 2 ; RV32-NEXT: vmv.x.s a1, v10 -; RV32-NEXT: lbu a4, 0(a1) -; RV32-NEXT: lbu a1, 1(a1) ; RV32-NEXT: vslidedown.vi v10, v8, 3 -; RV32-NEXT: vmv.x.s a5, v10 -; RV32-NEXT: lbu a6, 0(a5) -; RV32-NEXT: lbu a5, 1(a5) -; RV32-NEXT: or a2, a3, a2 -; RV32-NEXT: slli a1, a1, 8 -; RV32-NEXT: or a1, a1, a4 -; RV32-NEXT: slli a5, a5, 8 +; RV32-NEXT: vmv.x.s a2, v11 +; RV32-NEXT: vmv.x.s a3, v10 ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32-NEXT: vslidedown.vi v10, v8, 4 -; RV32-NEXT: vmv.x.s a3, v10 
-; RV32-NEXT: lbu a4, 0(a3) -; RV32-NEXT: lbu a3, 1(a3) +; RV32-NEXT: vmv.x.s a4, v10 ; RV32-NEXT: vslidedown.vi v10, v8, 5 -; RV32-NEXT: vmv.x.s a7, v10 -; RV32-NEXT: lbu t0, 0(a7) -; RV32-NEXT: lbu a7, 1(a7) -; RV32-NEXT: or a5, a5, a6 -; RV32-NEXT: slli a3, a3, 8 -; RV32-NEXT: or a3, a3, a4 -; RV32-NEXT: slli a7, a7, 8 +; RV32-NEXT: vmv.x.s a5, v10 ; RV32-NEXT: vslidedown.vi v10, v8, 6 -; RV32-NEXT: vmv.x.s a4, v10 -; RV32-NEXT: lbu a6, 0(a4) -; RV32-NEXT: lbu a4, 1(a4) ; RV32-NEXT: vslidedown.vi v8, v8, 7 -; RV32-NEXT: vmv.x.s t1, v8 -; RV32-NEXT: lbu t2, 0(t1) -; RV32-NEXT: lbu t1, 1(t1) -; RV32-NEXT: or a7, a7, t0 +; RV32-NEXT: lbu a6, 0(a0) +; RV32-NEXT: lbu a0, 1(a0) +; RV32-NEXT: vmv.x.s a7, v10 +; RV32-NEXT: vmv.x.s t0, v8 +; RV32-NEXT: lbu t1, 0(a1) +; RV32-NEXT: lbu a1, 1(a1) +; RV32-NEXT: lbu t2, 0(a2) +; RV32-NEXT: lbu a2, 1(a2) +; RV32-NEXT: slli a0, a0, 8 +; RV32-NEXT: or a0, a0, a6 +; RV32-NEXT: lbu a6, 0(a3) +; RV32-NEXT: lbu a3, 1(a3) +; RV32-NEXT: slli a1, a1, 8 +; RV32-NEXT: or a1, a1, t1 +; RV32-NEXT: lbu t1, 0(a4) +; RV32-NEXT: lbu a4, 1(a4) +; RV32-NEXT: slli a2, a2, 8 +; RV32-NEXT: or a2, a2, t2 +; RV32-NEXT: lbu t2, 0(a5) +; RV32-NEXT: lbu a5, 1(a5) +; RV32-NEXT: slli a3, a3, 8 +; RV32-NEXT: or a3, a3, a6 +; RV32-NEXT: lbu a6, 0(a7) +; RV32-NEXT: lbu a7, 1(a7) ; RV32-NEXT: slli a4, a4, 8 -; RV32-NEXT: or a4, a4, a6 -; RV32-NEXT: slli t1, t1, 8 -; RV32-NEXT: or a6, t1, t2 +; RV32-NEXT: or a4, a4, t1 +; RV32-NEXT: lbu t1, 0(t0) +; RV32-NEXT: lbu t0, 1(t0) +; RV32-NEXT: slli a5, a5, 8 +; RV32-NEXT: or a5, a5, t2 +; RV32-NEXT: slli a7, a7, 8 +; RV32-NEXT: or a6, a7, a6 +; RV32-NEXT: slli t0, t0, 8 +; RV32-NEXT: or a7, t0, t1 ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a2 ; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v9, v8, a5 -; RV32-NEXT: vmv.v.x v8, a3 -; RV32-NEXT: vslide1down.vx v8, v8, a7 -; RV32-NEXT: vslide1down.vx v8, v8, a4 +; RV32-NEXT: vmv.v.x v9, a4 +; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: vslide1down.vx v9, v9, a5 +; RV32-NEXT: vslide1down.vx v10, v8, a3 +; RV32-NEXT: vslide1down.vx v8, v9, a6 ; RV32-NEXT: vmv.v.i v0, 15 -; RV32-NEXT: vslide1down.vx v8, v8, a6 -; RV32-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV32-NEXT: vslide1down.vx v8, v8, a7 +; RV32-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_strided_unaligned: @@ -14458,65 +14458,65 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; RV64V-NEXT: andi sp, sp, -64 ; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64V-NEXT: vid.v v8 +; RV64V-NEXT: mv a1, sp ; RV64V-NEXT: vsll.vi v8, v8, 2 ; RV64V-NEXT: vadd.vx v8, v8, a0 ; RV64V-NEXT: vmv.x.s a0, v8 -; RV64V-NEXT: lbu a1, 0(a0) -; RV64V-NEXT: lbu a0, 1(a0) ; RV64V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64V-NEXT: vslidedown.vi v12, v8, 1 ; RV64V-NEXT: vmv.x.s a2, v12 -; RV64V-NEXT: lbu a3, 1(a2) -; RV64V-NEXT: lbu a2, 0(a2) -; RV64V-NEXT: slli a0, a0, 8 -; RV64V-NEXT: or a0, a0, a1 -; RV64V-NEXT: slli a3, a3, 8 ; RV64V-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64V-NEXT: vslidedown.vi v12, v8, 2 -; RV64V-NEXT: vmv.x.s a1, v12 -; RV64V-NEXT: lbu a4, 0(a1) -; RV64V-NEXT: lbu a1, 1(a1) +; RV64V-NEXT: vmv.x.s a3, v12 ; RV64V-NEXT: vslidedown.vi v12, v8, 3 +; RV64V-NEXT: lbu a4, 0(a0) +; RV64V-NEXT: lbu a0, 1(a0) ; RV64V-NEXT: vmv.x.s a5, v12 -; RV64V-NEXT: lbu a6, 0(a5) +; RV64V-NEXT: lbu a6, 0(a2) +; RV64V-NEXT: lbu a2, 1(a2) +; RV64V-NEXT: lbu a7, 0(a3) +; RV64V-NEXT: lbu a3, 1(a3) +; RV64V-NEXT: lbu t0, 
0(a5) ; RV64V-NEXT: lbu a5, 1(a5) -; RV64V-NEXT: or a2, a3, a2 -; RV64V-NEXT: slli a1, a1, 8 -; RV64V-NEXT: or a1, a1, a4 -; RV64V-NEXT: slli a5, a5, 8 -; RV64V-NEXT: mv a3, sp ; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64V-NEXT: vse64.v v8, (a3) +; RV64V-NEXT: vse64.v v8, (a1) +; RV64V-NEXT: slli a0, a0, 8 +; RV64V-NEXT: or a0, a0, a4 +; RV64V-NEXT: slli a2, a2, 8 +; RV64V-NEXT: slli a3, a3, 8 +; RV64V-NEXT: or a1, a2, a6 +; RV64V-NEXT: or a2, a3, a7 ; RV64V-NEXT: ld a3, 32(sp) ; RV64V-NEXT: ld a4, 40(sp) -; RV64V-NEXT: ld a7, 48(sp) -; RV64V-NEXT: ld t0, 56(sp) -; RV64V-NEXT: lbu t1, 0(a3) +; RV64V-NEXT: ld a6, 48(sp) +; RV64V-NEXT: ld a7, 56(sp) +; RV64V-NEXT: slli a5, a5, 8 +; RV64V-NEXT: or a5, a5, t0 +; RV64V-NEXT: lbu t0, 0(a3) ; RV64V-NEXT: lbu a3, 1(a3) -; RV64V-NEXT: lbu t2, 0(a4) +; RV64V-NEXT: vmv.v.x v8, a0 +; RV64V-NEXT: lbu a0, 0(a4) ; RV64V-NEXT: lbu a4, 1(a4) -; RV64V-NEXT: or a5, a5, a6 +; RV64V-NEXT: vslide1down.vx v8, v8, a1 +; RV64V-NEXT: lbu a1, 0(a6) +; RV64V-NEXT: lbu a6, 1(a6) +; RV64V-NEXT: vslide1down.vx v8, v8, a2 +; RV64V-NEXT: lbu a2, 0(a7) +; RV64V-NEXT: lbu a7, 1(a7) +; RV64V-NEXT: vslide1down.vx v9, v8, a5 ; RV64V-NEXT: slli a3, a3, 8 -; RV64V-NEXT: or a3, a3, t1 ; RV64V-NEXT: slli a4, a4, 8 -; RV64V-NEXT: lbu a6, 0(a7) -; RV64V-NEXT: lbu a7, 1(a7) -; RV64V-NEXT: lbu t1, 0(t0) -; RV64V-NEXT: lbu t0, 1(t0) -; RV64V-NEXT: or a4, a4, t2 +; RV64V-NEXT: slli a6, a6, 8 ; RV64V-NEXT: slli a7, a7, 8 -; RV64V-NEXT: or a6, a7, a6 -; RV64V-NEXT: slli t0, t0, 8 -; RV64V-NEXT: or a7, t0, t1 -; RV64V-NEXT: vmv.v.x v8, a0 -; RV64V-NEXT: vslide1down.vx v8, v8, a2 -; RV64V-NEXT: vslide1down.vx v8, v8, a1 -; RV64V-NEXT: vslide1down.vx v9, v8, a5 +; RV64V-NEXT: or a3, a3, t0 +; RV64V-NEXT: or a0, a4, a0 +; RV64V-NEXT: or a1, a6, a1 +; RV64V-NEXT: or a2, a7, a2 ; RV64V-NEXT: vmv.v.x v8, a3 -; RV64V-NEXT: vslide1down.vx v8, v8, a4 -; RV64V-NEXT: vslide1down.vx v8, v8, a6 +; RV64V-NEXT: vslide1down.vx v8, v8, a0 +; RV64V-NEXT: vslide1down.vx v8, v8, a1 ; RV64V-NEXT: vmv.v.i v0, 15 -; RV64V-NEXT: vslide1down.vx v8, v8, a7 +; RV64V-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64V-NEXT: addi sp, s0, -128 ; RV64V-NEXT: .cfi_def_cfa sp, 128 @@ -14530,49 +14530,49 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_strided_unaligned: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lbu a1, 1(a0) -; RV64ZVE32F-NEXT: lbu a2, 0(a0) -; RV64ZVE32F-NEXT: lbu a3, 5(a0) -; RV64ZVE32F-NEXT: lbu a4, 4(a0) -; RV64ZVE32F-NEXT: slli a1, a1, 8 -; RV64ZVE32F-NEXT: or a1, a1, a2 -; RV64ZVE32F-NEXT: slli a3, a3, 8 -; RV64ZVE32F-NEXT: lbu a2, 8(a0) -; RV64ZVE32F-NEXT: lbu a5, 9(a0) -; RV64ZVE32F-NEXT: lbu a6, 12(a0) -; RV64ZVE32F-NEXT: lbu a7, 13(a0) -; RV64ZVE32F-NEXT: or a3, a3, a4 -; RV64ZVE32F-NEXT: slli a5, a5, 8 -; RV64ZVE32F-NEXT: or a2, a5, a2 -; RV64ZVE32F-NEXT: slli a7, a7, 8 -; RV64ZVE32F-NEXT: lbu a4, 16(a0) -; RV64ZVE32F-NEXT: lbu a5, 17(a0) -; RV64ZVE32F-NEXT: lbu t0, 20(a0) -; RV64ZVE32F-NEXT: lbu t1, 21(a0) -; RV64ZVE32F-NEXT: or a6, a7, a6 -; RV64ZVE32F-NEXT: slli a5, a5, 8 -; RV64ZVE32F-NEXT: or a4, a5, a4 -; RV64ZVE32F-NEXT: slli t1, t1, 8 -; RV64ZVE32F-NEXT: lbu a5, 24(a0) +; RV64ZVE32F-NEXT: lbu a1, 0(a0) +; RV64ZVE32F-NEXT: lbu a2, 1(a0) +; RV64ZVE32F-NEXT: lbu a3, 4(a0) +; RV64ZVE32F-NEXT: lbu a4, 5(a0) +; RV64ZVE32F-NEXT: lbu a5, 8(a0) +; RV64ZVE32F-NEXT: lbu a6, 9(a0) +; RV64ZVE32F-NEXT: lbu a7, 12(a0) +; RV64ZVE32F-NEXT: lbu t0, 13(a0) +; RV64ZVE32F-NEXT: slli a2, a2, 8 +; RV64ZVE32F-NEXT: slli a4, 
a4, 8 +; RV64ZVE32F-NEXT: or a1, a2, a1 +; RV64ZVE32F-NEXT: or a3, a4, a3 +; RV64ZVE32F-NEXT: lbu a2, 16(a0) +; RV64ZVE32F-NEXT: lbu a4, 17(a0) +; RV64ZVE32F-NEXT: lbu t1, 20(a0) +; RV64ZVE32F-NEXT: lbu t2, 21(a0) +; RV64ZVE32F-NEXT: slli a6, a6, 8 +; RV64ZVE32F-NEXT: or a5, a6, a5 +; RV64ZVE32F-NEXT: slli t0, t0, 8 +; RV64ZVE32F-NEXT: slli a4, a4, 8 +; RV64ZVE32F-NEXT: slli t2, t2, 8 +; RV64ZVE32F-NEXT: or a6, t0, a7 +; RV64ZVE32F-NEXT: or a2, a4, a2 +; RV64ZVE32F-NEXT: lbu a4, 24(a0) ; RV64ZVE32F-NEXT: lbu a7, 25(a0) -; RV64ZVE32F-NEXT: lbu t2, 28(a0) +; RV64ZVE32F-NEXT: or t0, t2, t1 +; RV64ZVE32F-NEXT: lbu t1, 28(a0) ; RV64ZVE32F-NEXT: lbu a0, 29(a0) -; RV64ZVE32F-NEXT: or t0, t1, t0 ; RV64ZVE32F-NEXT: slli a7, a7, 8 -; RV64ZVE32F-NEXT: or a5, a7, a5 -; RV64ZVE32F-NEXT: slli a0, a0, 8 -; RV64ZVE32F-NEXT: or a0, a0, t2 +; RV64ZVE32F-NEXT: or a4, a7, a4 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 +; RV64ZVE32F-NEXT: slli a0, a0, 8 +; RV64ZVE32F-NEXT: or a0, a0, t1 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.v.x v9, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a6 -; RV64ZVE32F-NEXT: vmv.v.x v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, t0 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14606,16 +14606,16 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 24(a0) ; RV64ZVE32F-NEXT: lh a0, 26(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 -; RV64ZVE32F-NEXT: vmv.v.x v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14651,16 +14651,16 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 28(a0) ; RV64ZVE32F-NEXT: lh a0, 30(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 -; 
RV64ZVE32F-NEXT: vmv.v.x v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14696,16 +14696,16 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 20(a0) ; RV64ZVE32F-NEXT: lh a0, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a3 +; RV64ZVE32F-NEXT: vmv.v.x v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a0 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a2 -; RV64ZVE32F-NEXT: vmv.v.x v8, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14741,16 +14741,16 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 12(a0) ; RV64ZVE32F-NEXT: lh a0, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a3 +; RV64ZVE32F-NEXT: vmv.v.x v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a0 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a2 -; RV64ZVE32F-NEXT: vmv.v.x v8, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14785,16 +14785,16 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 4(a0) ; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vmv.v.x v9, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 -; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32F-NEXT: 
vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14832,16 +14832,16 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 4(a0) ; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vmv.v.x v9, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 -; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14871,24 +14871,24 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned2(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_2xSEW_unaligned2: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 10(a0) -; RV64ZVE32F-NEXT: lh a2, 18(a0) -; RV64ZVE32F-NEXT: lh a3, 20(a0) -; RV64ZVE32F-NEXT: lh a4, 2(a0) -; RV64ZVE32F-NEXT: lh a5, 4(a0) -; RV64ZVE32F-NEXT: lh a6, 6(a0) -; RV64ZVE32F-NEXT: lh a0, 8(a0) +; RV64ZVE32F-NEXT: lh a1, 2(a0) +; RV64ZVE32F-NEXT: lh a2, 4(a0) +; RV64ZVE32F-NEXT: lh a3, 6(a0) +; RV64ZVE32F-NEXT: lh a4, 8(a0) +; RV64ZVE32F-NEXT: lh a5, 10(a0) +; RV64ZVE32F-NEXT: lh a6, 18(a0) +; RV64ZVE32F-NEXT: lh a0, 20(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.x v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a3 -; RV64ZVE32F-NEXT: vmv.v.x v8, a0 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.v.x v9, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a3 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14930,16 +14930,16 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 20(a0) ; RV64ZVE32F-NEXT: lh a0, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; 
RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 -; RV64ZVE32F-NEXT: vmv.v.x v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14978,16 +14978,16 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 20(a0) ; RV64ZVE32F-NEXT: lh a0, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 -; RV64ZVE32F-NEXT: vmv.v.x v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -15035,16 +15035,16 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 4(a0) ; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a4 -; RV64ZVE32F-NEXT: vmv.v.x v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -15083,16 +15083,16 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 12(a0) ; RV64ZVE32F-NEXT: lh a0, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx 
v9, v9, a6 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v8, a2 -; RV64ZVE32F-NEXT: vmv.v.x v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -15152,258 +15152,258 @@ define <32 x i64> @mgather_strided_split(ptr %base) { ; RV32ZVE32F-NEXT: .cfi_def_cfa s0, 0 ; RV32ZVE32F-NEXT: andi sp, sp, -128 ; RV32ZVE32F-NEXT: li a2, 32 +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: sw a3, 236(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a3, 4(a1) +; RV32ZVE32F-NEXT: sw a3, 232(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: addi a3, sp, 256 ; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32ZVE32F-NEXT: vid.v v8 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 4 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: lw a3, 0(a1) -; RV32ZVE32F-NEXT: sw a3, 252(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a1, 4(a1) -; RV32ZVE32F-NEXT: sw a1, 248(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 1 +; RV32ZVE32F-NEXT: vslidedown.vi v17, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v16 -; RV32ZVE32F-NEXT: lw a3, 0(a1) -; RV32ZVE32F-NEXT: sw a3, 244(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a1, 4(a1) -; RV32ZVE32F-NEXT: sw a1, 240(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a1, v16 -; RV32ZVE32F-NEXT: lw a3, 0(a1) -; RV32ZVE32F-NEXT: sw a3, 220(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a1, 4(a1) -; RV32ZVE32F-NEXT: sw a1, 216(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 3 -; RV32ZVE32F-NEXT: vmv.x.s a1, v16 -; RV32ZVE32F-NEXT: lw a3, 0(a1) -; RV32ZVE32F-NEXT: sw a3, 212(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a1, 4(a1) -; RV32ZVE32F-NEXT: sw a1, 208(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vmv.x.s a4, v17 +; RV32ZVE32F-NEXT: vmv.x.s a5, v16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 4 -; RV32ZVE32F-NEXT: vmv.x.s a1, v16 -; RV32ZVE32F-NEXT: lw a3, 0(a1) -; RV32ZVE32F-NEXT: sw a3, 236(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a1, 4(a1) -; RV32ZVE32F-NEXT: sw a1, 232(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vmv.x.s a6, v16 ; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 5 -; RV32ZVE32F-NEXT: vmv.x.s a1, v16 -; RV32ZVE32F-NEXT: lw a3, 0(a1) -; RV32ZVE32F-NEXT: sw a3, 228(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a1, 4(a1) -; RV32ZVE32F-NEXT: sw a1, 224(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vmv.x.s a7, v16 ; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 6 -; RV32ZVE32F-NEXT: vmv.x.s a1, v16 -; RV32ZVE32F-NEXT: lw a3, 0(a1) -; RV32ZVE32F-NEXT: sw a3, 204(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a1, 4(a1) -; RV32ZVE32F-NEXT: sw a1, 200(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: vmv.x.s t0, v16 ; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v16 -; RV32ZVE32F-NEXT: lw a3, 0(a1) -; RV32ZVE32F-NEXT: sw a3, 196(sp) # 4-byte Folded Spill +; 
RV32ZVE32F-NEXT: vmv.x.s t1, v16 +; RV32ZVE32F-NEXT: lw t2, 0(a1) +; RV32ZVE32F-NEXT: sw t2, 196(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 4(a1) ; RV32ZVE32F-NEXT: sw a1, 192(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: addi a1, sp, 256 +; RV32ZVE32F-NEXT: lw ra, 0(a4) +; RV32ZVE32F-NEXT: lw a1, 4(a4) +; RV32ZVE32F-NEXT: sw a1, 172(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 0(a5) +; RV32ZVE32F-NEXT: sw a1, 168(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a5) +; RV32ZVE32F-NEXT: sw a1, 164(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 0(a6) +; RV32ZVE32F-NEXT: sw a1, 252(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a6) +; RV32ZVE32F-NEXT: sw a1, 248(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 0(a7) +; RV32ZVE32F-NEXT: sw a1, 244(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a7) +; RV32ZVE32F-NEXT: sw a1, 240(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 0(t0) +; RV32ZVE32F-NEXT: sw a1, 188(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(t0) +; RV32ZVE32F-NEXT: sw a1, 184(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 0(t1) +; RV32ZVE32F-NEXT: sw a1, 180(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(t1) +; RV32ZVE32F-NEXT: sw a1, 176(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32ZVE32F-NEXT: vse32.v v8, (a1) +; RV32ZVE32F-NEXT: vse32.v v8, (a3) ; RV32ZVE32F-NEXT: lw a1, 288(sp) ; RV32ZVE32F-NEXT: lw a2, 292(sp) ; RV32ZVE32F-NEXT: lw a3, 296(sp) ; RV32ZVE32F-NEXT: lw a4, 300(sp) ; RV32ZVE32F-NEXT: lw a5, 0(a1) -; RV32ZVE32F-NEXT: sw a5, 188(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a5, 228(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 4(a1) -; RV32ZVE32F-NEXT: sw a1, 184(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 224(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 0(a2) -; RV32ZVE32F-NEXT: sw a1, 180(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 220(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: sw a1, 176(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 216(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 0(a3) -; RV32ZVE32F-NEXT: sw a1, 172(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 212(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: sw a1, 168(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 208(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 0(a4) -; RV32ZVE32F-NEXT: sw a1, 164(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 204(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 4(a4) -; RV32ZVE32F-NEXT: sw a1, 160(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 200(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 304(sp) ; RV32ZVE32F-NEXT: lw a2, 308(sp) ; RV32ZVE32F-NEXT: lw a3, 312(sp) ; RV32ZVE32F-NEXT: lw a4, 316(sp) ; RV32ZVE32F-NEXT: lw a5, 0(a1) -; RV32ZVE32F-NEXT: sw a5, 156(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a5, 160(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 4(a1) -; RV32ZVE32F-NEXT: sw a1, 152(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 156(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 0(a2) -; RV32ZVE32F-NEXT: sw a1, 148(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 152(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: sw a1, 144(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 148(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 0(a3) -; RV32ZVE32F-NEXT: sw a1, 140(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 144(sp) # 4-byte 
Folded Spill ; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: sw a1, 136(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 140(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 0(a4) -; RV32ZVE32F-NEXT: sw a1, 132(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 136(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 4(a4) -; RV32ZVE32F-NEXT: sw a1, 128(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw a1, 132(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 320(sp) ; RV32ZVE32F-NEXT: lw a2, 324(sp) ; RV32ZVE32F-NEXT: lw a3, 328(sp) ; RV32ZVE32F-NEXT: lw a4, 332(sp) -; RV32ZVE32F-NEXT: lw s8, 0(a1) -; RV32ZVE32F-NEXT: lw s9, 4(a1) -; RV32ZVE32F-NEXT: lw s10, 0(a2) -; RV32ZVE32F-NEXT: lw s11, 4(a2) +; RV32ZVE32F-NEXT: lw a5, 0(a1) +; RV32ZVE32F-NEXT: sw a5, 128(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 124(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: sw a1, 120(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: sw a1, 116(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw s8, 0(a3) +; RV32ZVE32F-NEXT: lw s9, 4(a3) +; RV32ZVE32F-NEXT: lw s10, 0(a4) +; RV32ZVE32F-NEXT: lw s11, 4(a4) +; RV32ZVE32F-NEXT: lw a1, 336(sp) +; RV32ZVE32F-NEXT: lw a2, 340(sp) +; RV32ZVE32F-NEXT: lw a3, 344(sp) +; RV32ZVE32F-NEXT: lw a4, 348(sp) +; RV32ZVE32F-NEXT: lw t5, 0(a1) +; RV32ZVE32F-NEXT: lw t6, 4(a1) +; RV32ZVE32F-NEXT: lw s2, 0(a2) +; RV32ZVE32F-NEXT: lw s3, 4(a2) +; RV32ZVE32F-NEXT: lw a5, 0(a3) +; RV32ZVE32F-NEXT: lw a6, 4(a3) +; RV32ZVE32F-NEXT: lw a7, 0(a4) +; RV32ZVE32F-NEXT: lw t0, 4(a4) +; RV32ZVE32F-NEXT: lw a1, 352(sp) +; RV32ZVE32F-NEXT: lw a2, 356(sp) +; RV32ZVE32F-NEXT: lw a3, 360(sp) +; RV32ZVE32F-NEXT: lw a4, 364(sp) +; RV32ZVE32F-NEXT: lw t1, 0(a1) +; RV32ZVE32F-NEXT: sw t1, 112(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a1) +; RV32ZVE32F-NEXT: sw a1, 108(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: sw a1, 104(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: sw a1, 100(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw s4, 0(a3) ; RV32ZVE32F-NEXT: lw s5, 4(a3) ; RV32ZVE32F-NEXT: lw s6, 0(a4) ; RV32ZVE32F-NEXT: lw s7, 4(a4) -; RV32ZVE32F-NEXT: lw a2, 336(sp) -; RV32ZVE32F-NEXT: lw a4, 340(sp) -; RV32ZVE32F-NEXT: lw a5, 344(sp) -; RV32ZVE32F-NEXT: lw a6, 348(sp) -; RV32ZVE32F-NEXT: lw a7, 0(a2) -; RV32ZVE32F-NEXT: lw t0, 4(a2) -; RV32ZVE32F-NEXT: lw t1, 0(a4) -; RV32ZVE32F-NEXT: lw t2, 4(a4) -; RV32ZVE32F-NEXT: lw a1, 0(a5) -; RV32ZVE32F-NEXT: lw a2, 4(a5) -; RV32ZVE32F-NEXT: lw a3, 0(a6) -; RV32ZVE32F-NEXT: lw a4, 4(a6) -; RV32ZVE32F-NEXT: lw a5, 352(sp) -; RV32ZVE32F-NEXT: lw a6, 356(sp) -; RV32ZVE32F-NEXT: lw t3, 360(sp) -; RV32ZVE32F-NEXT: lw t4, 364(sp) -; RV32ZVE32F-NEXT: lw t5, 0(a5) -; RV32ZVE32F-NEXT: sw t5, 116(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a5, 4(a5) -; RV32ZVE32F-NEXT: sw a5, 112(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a5, 0(a6) -; RV32ZVE32F-NEXT: sw a5, 124(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a5, 4(a6) -; RV32ZVE32F-NEXT: sw a5, 120(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw ra, 0(t3) -; RV32ZVE32F-NEXT: lw a5, 4(t3) -; RV32ZVE32F-NEXT: sw a5, 108(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a5, 0(t4) -; RV32ZVE32F-NEXT: sw a5, 104(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a5, 4(t4) -; RV32ZVE32F-NEXT: sw a5, 100(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: lw a5, 368(sp) -; RV32ZVE32F-NEXT: lw a6, 372(sp) -; RV32ZVE32F-NEXT: lw t3, 
376(sp) -; RV32ZVE32F-NEXT: lw t4, 380(sp) -; RV32ZVE32F-NEXT: lw t5, 0(a5) -; RV32ZVE32F-NEXT: lw t6, 4(a5) -; RV32ZVE32F-NEXT: lw s2, 0(a6) -; RV32ZVE32F-NEXT: lw s3, 4(a6) -; RV32ZVE32F-NEXT: lw a5, 0(t3) -; RV32ZVE32F-NEXT: lw a6, 4(t3) -; RV32ZVE32F-NEXT: lw t3, 0(t4) -; RV32ZVE32F-NEXT: lw t4, 4(t4) -; RV32ZVE32F-NEXT: sw a1, 176(a0) -; RV32ZVE32F-NEXT: sw a2, 180(a0) -; RV32ZVE32F-NEXT: sw a3, 184(a0) -; RV32ZVE32F-NEXT: sw a4, 188(a0) -; RV32ZVE32F-NEXT: sw a7, 160(a0) -; RV32ZVE32F-NEXT: sw t0, 164(a0) -; RV32ZVE32F-NEXT: sw t1, 168(a0) -; RV32ZVE32F-NEXT: sw t2, 172(a0) -; RV32ZVE32F-NEXT: sw s4, 144(a0) -; RV32ZVE32F-NEXT: sw s5, 148(a0) -; RV32ZVE32F-NEXT: sw s6, 152(a0) -; RV32ZVE32F-NEXT: sw s7, 156(a0) -; RV32ZVE32F-NEXT: sw s8, 128(a0) -; RV32ZVE32F-NEXT: sw s9, 132(a0) -; RV32ZVE32F-NEXT: sw s10, 136(a0) -; RV32ZVE32F-NEXT: sw s11, 140(a0) -; RV32ZVE32F-NEXT: lw a1, 140(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 112(a0) -; RV32ZVE32F-NEXT: lw a1, 136(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 116(a0) -; RV32ZVE32F-NEXT: lw a1, 132(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 120(a0) -; RV32ZVE32F-NEXT: lw a1, 128(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 124(a0) -; RV32ZVE32F-NEXT: lw a1, 156(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 96(a0) -; RV32ZVE32F-NEXT: lw a1, 152(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 100(a0) -; RV32ZVE32F-NEXT: lw a1, 148(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 104(a0) -; RV32ZVE32F-NEXT: lw a1, 144(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 108(a0) -; RV32ZVE32F-NEXT: lw a1, 172(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 80(a0) -; RV32ZVE32F-NEXT: lw a1, 168(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 84(a0) -; RV32ZVE32F-NEXT: lw a1, 164(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 88(a0) -; RV32ZVE32F-NEXT: lw a1, 160(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 92(a0) -; RV32ZVE32F-NEXT: lw a1, 188(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 64(a0) -; RV32ZVE32F-NEXT: lw a1, 184(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 68(a0) -; RV32ZVE32F-NEXT: lw a1, 180(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 72(a0) -; RV32ZVE32F-NEXT: lw a1, 176(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 76(a0) -; RV32ZVE32F-NEXT: sw a5, 240(a0) -; RV32ZVE32F-NEXT: sw a6, 244(a0) -; RV32ZVE32F-NEXT: sw t3, 248(a0) -; RV32ZVE32F-NEXT: sw t4, 252(a0) -; RV32ZVE32F-NEXT: sw t5, 224(a0) -; RV32ZVE32F-NEXT: sw t6, 228(a0) -; RV32ZVE32F-NEXT: sw s2, 232(a0) -; RV32ZVE32F-NEXT: sw s3, 236(a0) -; RV32ZVE32F-NEXT: sw ra, 208(a0) -; RV32ZVE32F-NEXT: lw a1, 108(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 212(a0) -; RV32ZVE32F-NEXT: lw a1, 104(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 216(a0) -; RV32ZVE32F-NEXT: lw a1, 100(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 220(a0) -; RV32ZVE32F-NEXT: lw a1, 116(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 192(a0) +; RV32ZVE32F-NEXT: lw a1, 368(sp) +; RV32ZVE32F-NEXT: lw a2, 372(sp) +; RV32ZVE32F-NEXT: lw a3, 376(sp) +; RV32ZVE32F-NEXT: lw a4, 380(sp) +; RV32ZVE32F-NEXT: lw t1, 0(a1) +; RV32ZVE32F-NEXT: lw t2, 4(a1) +; RV32ZVE32F-NEXT: lw t3, 0(a2) +; RV32ZVE32F-NEXT: lw t4, 4(a2) +; RV32ZVE32F-NEXT: lw a1, 0(a3) +; RV32ZVE32F-NEXT: lw a2, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a4) +; RV32ZVE32F-NEXT: lw a4, 4(a4) +; RV32ZVE32F-NEXT: sw ra, 16(a0) +; RV32ZVE32F-NEXT: lw ra, 172(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw 
ra, 20(a0) +; RV32ZVE32F-NEXT: lw ra, 168(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw ra, 24(a0) +; RV32ZVE32F-NEXT: lw ra, 164(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw ra, 28(a0) +; RV32ZVE32F-NEXT: lw ra, 236(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw ra, 0(a0) +; RV32ZVE32F-NEXT: lw ra, 232(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw ra, 4(a0) +; RV32ZVE32F-NEXT: lw ra, 196(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw ra, 8(a0) +; RV32ZVE32F-NEXT: lw ra, 192(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw ra, 12(a0) +; RV32ZVE32F-NEXT: lw ra, 188(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw ra, 48(a0) +; RV32ZVE32F-NEXT: lw ra, 184(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw ra, 52(a0) +; RV32ZVE32F-NEXT: lw ra, 180(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw ra, 56(a0) +; RV32ZVE32F-NEXT: lw ra, 176(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw ra, 60(a0) +; RV32ZVE32F-NEXT: sw a5, 176(a0) +; RV32ZVE32F-NEXT: sw a6, 180(a0) +; RV32ZVE32F-NEXT: sw a7, 184(a0) +; RV32ZVE32F-NEXT: sw t0, 188(a0) +; RV32ZVE32F-NEXT: sw t5, 160(a0) +; RV32ZVE32F-NEXT: sw t6, 164(a0) +; RV32ZVE32F-NEXT: sw s2, 168(a0) +; RV32ZVE32F-NEXT: sw s3, 172(a0) +; RV32ZVE32F-NEXT: sw s8, 144(a0) +; RV32ZVE32F-NEXT: sw s9, 148(a0) +; RV32ZVE32F-NEXT: sw s10, 152(a0) +; RV32ZVE32F-NEXT: sw s11, 156(a0) +; RV32ZVE32F-NEXT: lw a5, 128(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 128(a0) +; RV32ZVE32F-NEXT: lw a5, 124(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 132(a0) +; RV32ZVE32F-NEXT: lw a5, 120(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 136(a0) +; RV32ZVE32F-NEXT: lw a5, 116(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 140(a0) +; RV32ZVE32F-NEXT: lw a5, 144(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 112(a0) +; RV32ZVE32F-NEXT: lw a5, 140(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 116(a0) +; RV32ZVE32F-NEXT: lw a5, 136(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 120(a0) +; RV32ZVE32F-NEXT: lw a5, 132(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 124(a0) +; RV32ZVE32F-NEXT: lw a5, 160(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 96(a0) +; RV32ZVE32F-NEXT: lw a5, 156(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 100(a0) +; RV32ZVE32F-NEXT: lw a5, 152(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 104(a0) +; RV32ZVE32F-NEXT: lw a5, 148(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 108(a0) +; RV32ZVE32F-NEXT: lw a5, 212(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 80(a0) +; RV32ZVE32F-NEXT: lw a5, 208(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 84(a0) +; RV32ZVE32F-NEXT: lw a5, 204(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 88(a0) +; RV32ZVE32F-NEXT: lw a5, 200(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 92(a0) +; RV32ZVE32F-NEXT: lw a5, 228(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 64(a0) +; RV32ZVE32F-NEXT: lw a5, 224(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 68(a0) +; RV32ZVE32F-NEXT: lw a5, 220(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 72(a0) +; RV32ZVE32F-NEXT: lw a5, 216(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a5, 76(a0) +; RV32ZVE32F-NEXT: sw a1, 240(a0) +; RV32ZVE32F-NEXT: sw a2, 244(a0) +; RV32ZVE32F-NEXT: sw a3, 248(a0) +; RV32ZVE32F-NEXT: sw a4, 252(a0) +; RV32ZVE32F-NEXT: sw t1, 224(a0) +; RV32ZVE32F-NEXT: sw t2, 228(a0) +; RV32ZVE32F-NEXT: sw t3, 232(a0) +; RV32ZVE32F-NEXT: sw t4, 236(a0) +; RV32ZVE32F-NEXT: sw s4, 208(a0) +; RV32ZVE32F-NEXT: sw s5, 212(a0) +; 
RV32ZVE32F-NEXT: sw s6, 216(a0) +; RV32ZVE32F-NEXT: sw s7, 220(a0) ; RV32ZVE32F-NEXT: lw a1, 112(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: sw a1, 192(a0) +; RV32ZVE32F-NEXT: lw a1, 108(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: sw a1, 196(a0) -; RV32ZVE32F-NEXT: lw a1, 124(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw a1, 104(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: sw a1, 200(a0) -; RV32ZVE32F-NEXT: lw a1, 120(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw a1, 100(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: sw a1, 204(a0) -; RV32ZVE32F-NEXT: lw a1, 220(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 16(a0) -; RV32ZVE32F-NEXT: lw a1, 216(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 20(a0) -; RV32ZVE32F-NEXT: lw a1, 212(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 24(a0) -; RV32ZVE32F-NEXT: lw a1, 208(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 28(a0) ; RV32ZVE32F-NEXT: lw a1, 252(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 0(a0) -; RV32ZVE32F-NEXT: lw a1, 248(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: lw a1, 244(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 8(a0) -; RV32ZVE32F-NEXT: lw a1, 240(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 12(a0) -; RV32ZVE32F-NEXT: lw a1, 204(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 48(a0) -; RV32ZVE32F-NEXT: lw a1, 200(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 52(a0) -; RV32ZVE32F-NEXT: lw a1, 196(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 56(a0) -; RV32ZVE32F-NEXT: lw a1, 192(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: sw a1, 60(a0) -; RV32ZVE32F-NEXT: lw a1, 236(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: sw a1, 32(a0) -; RV32ZVE32F-NEXT: lw a1, 232(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw a1, 248(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: sw a1, 36(a0) -; RV32ZVE32F-NEXT: lw a1, 228(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw a1, 244(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: sw a1, 40(a0) -; RV32ZVE32F-NEXT: lw a1, 224(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw a1, 240(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: sw a1, 44(a0) ; RV32ZVE32F-NEXT: addi sp, s0, -512 ; RV32ZVE32F-NEXT: .cfi_def_cfa sp, 512 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll index dbbec96445e3e..f72b08a405246 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll @@ -278,12 +278,12 @@ define <64 x float> @masked_load_v64f32(ptr %a, <64 x i1> %mask) { ; CHECK-LABEL: masked_load_v64f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v0, 4 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 4 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vle32.v v16, (a0), v0.t ; CHECK-NEXT: ret %load = call <64 x float> @llvm.masked.load.v64f32(ptr %a, i32 8, <64 x i1> %mask, <64 x float> undef) @@ -294,12 +294,12 @@ define <128 x bfloat> @masked_load_v128bf16(ptr %a, <128 x i1> %mask) { ; CHECK-LABEL: masked_load_v128bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v0, 8 ; CHECK-NEXT: 
vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 8 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vle16.v v16, (a0), v0.t ; CHECK-NEXT: ret %load = call <128 x bfloat> @llvm.masked.load.v128bf16(ptr %a, i32 8, <128 x i1> %mask, <128 x bfloat> undef) @@ -310,12 +310,12 @@ define <128 x half> @masked_load_v128f16(ptr %a, <128 x i1> %mask) { ; CHECK-LABEL: masked_load_v128f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v0, 8 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 8 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vle16.v v16, (a0), v0.t ; CHECK-NEXT: ret %load = call <128 x half> @llvm.masked.load.v128f16(ptr %a, i32 8, <128 x i1> %mask, <128 x half> undef) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll index 4f3313f3760be..e0cf39c75da24 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll @@ -276,12 +276,12 @@ define <64 x i32> @masked_load_v64i32(ptr %a, <64 x i1> %mask) { ; CHECK-LABEL: masked_load_v64i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v0, 4 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 4 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vle32.v v16, (a0), v0.t ; CHECK-NEXT: ret %load = call <64 x i32> @llvm.masked.load.v64i32(ptr %a, i32 8, <64 x i1> %mask, <64 x i32> undef) @@ -303,12 +303,12 @@ define <128 x i16> @masked_load_v128i16(ptr %a, <128 x i1> %mask) { ; CHECK-LABEL: masked_load_v128i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v0, 8 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 8 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vle16.v v16, (a0), v0.t ; CHECK-NEXT: ret %load = call <128 x i16> @llvm.masked.load.v128i16(ptr %a, i32 8, <128 x i1> %mask, <128 x i16> undef) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 476d023b9ad6f..575a757149ebb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -349,12 +349,12 @@ define void @mscatter_truemask_v4i8(<4 x i8> %val, <4 x ptr> %ptrs) { ; RV64ZVE32F-NEXT: ld a3, 16(a0) ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vse8.v v8, (a1) ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v8, (a1) ; RV64ZVE32F-NEXT: vse8.v v9, (a2) ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; 
RV64ZVE32F-NEXT: vse8.v v9, (a3) ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v9, (a3) ; RV64ZVE32F-NEXT: vse8.v v8, (a0) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 1)) @@ -867,12 +867,12 @@ define void @mscatter_truemask_v4i16(<4 x i16> %val, <4 x ptr> %ptrs) { ; RV64ZVE32F-NEXT: ld a3, 16(a0) ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vse16.v v8, (a1) ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v8, (a1) ; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v9, (a3) ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a3) ; RV64ZVE32F-NEXT: vse16.v v8, (a0) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %val, <4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1)) @@ -1744,12 +1744,12 @@ define void @mscatter_truemask_v4i32(<4 x i32> %val, <4 x ptr> %ptrs) { ; RV64ZVE32F-NEXT: ld a3, 16(a0) ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v8, (a1) ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v8, (a1) ; RV64ZVE32F-NEXT: vse32.v v9, (a2) ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v9, (a3) ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v9, (a3) ; RV64ZVE32F-NEXT: vse32.v v8, (a0) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 1)) @@ -3097,20 +3097,20 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) { ; RV32ZVE32F-NEXT: lw a0, 12(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v8 -; RV32ZVE32F-NEXT: sw a5, 0(t0) -; RV32ZVE32F-NEXT: sw a6, 4(t0) ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v9 -; RV32ZVE32F-NEXT: sw a7, 0(a5) -; RV32ZVE32F-NEXT: sw a0, 4(a5) +; RV32ZVE32F-NEXT: vmv.x.s t1, v9 ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a0, v9 -; RV32ZVE32F-NEXT: sw a1, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 0(t0) +; RV32ZVE32F-NEXT: sw a6, 4(t0) +; RV32ZVE32F-NEXT: vmv.x.s a5, v9 +; RV32ZVE32F-NEXT: vmv.x.s a6, v8 +; RV32ZVE32F-NEXT: sw a7, 0(t1) +; RV32ZVE32F-NEXT: sw a0, 4(t1) +; RV32ZVE32F-NEXT: sw a1, 0(a5) +; RV32ZVE32F-NEXT: sw a2, 4(a5) +; RV32ZVE32F-NEXT: sw a3, 0(a6) +; RV32ZVE32F-NEXT: sw a4, 4(a6) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4i64: @@ -5693,6 +5693,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: sw s6, 20(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: sw s7, 16(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: sw s8, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s9, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 @@ -5702,6 +5703,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: .cfi_offset s6, -28 ; RV32ZVE32F-NEXT: .cfi_offset s7, -32 ; RV32ZVE32F-NEXT: .cfi_offset s8, -36 +; RV32ZVE32F-NEXT: .cfi_offset s9, -40 ; RV32ZVE32F-NEXT: .cfi_remember_state ; 
RV32ZVE32F-NEXT: lw a3, 56(a0) ; RV32ZVE32F-NEXT: lw a4, 60(a0) @@ -5724,21 +5726,21 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: lw s6, 0(a2) ; RV32ZVE32F-NEXT: lw s7, 8(a2) ; RV32ZVE32F-NEXT: lw s8, 16(a2) -; RV32ZVE32F-NEXT: lw a2, 24(a2) +; RV32ZVE32F-NEXT: lw s9, 24(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.v.x v8, s6 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vmv.x.s a2, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s9 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a2, v0 ; RV32ZVE32F-NEXT: andi s2, a2, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez s2, .LBB51_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -5778,6 +5780,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: lw s6, 20(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s7, 16(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s8, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s9, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: .cfi_restore s0 ; RV32ZVE32F-NEXT: .cfi_restore s1 ; RV32ZVE32F-NEXT: .cfi_restore s2 @@ -5787,6 +5790,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: .cfi_restore s6 ; RV32ZVE32F-NEXT: .cfi_restore s7 ; RV32ZVE32F-NEXT: .cfi_restore s8 +; RV32ZVE32F-NEXT: .cfi_restore s9 ; RV32ZVE32F-NEXT: addi sp, sp, 48 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 0 ; RV32ZVE32F-NEXT: ret @@ -6146,19 +6150,19 @@ define void @mscatter_truemask_v4bf16(<4 x bfloat> %val, <4 x ptr> %ptrs) { ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a1) -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 -; RV64ZVE32F-NEXT: fmv.h.x fa5, a1 +; RV64ZVE32F-NEXT: fmv.h.x fa5, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a1 ; RV64ZVE32F-NEXT: fsh fa5, 0(a3) -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 -; RV64ZVE32F-NEXT: fmv.h.x fa5, a1 +; RV64ZVE32F-NEXT: fmv.h.x fa5, a4 ; RV64ZVE32F-NEXT: fsh fa5, 0(a0) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> %val, <4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1)) @@ -6318,10 +6322,10 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: beqz a2, .LBB58_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; 
RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB58_2: # %else @@ -6331,11 +6335,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB58_4: # %else2 @@ -6358,11 +6362,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB58_9: # %else10 @@ -6377,11 +6381,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB58_12: # %cond.store3 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -6390,11 +6394,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 @@ -6402,11 +6406,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: .LBB58_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 @@ -6414,11 +6418,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: j .LBB58_9 ; RV64ZVE32F-NEXT: .LBB58_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; 
RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -6426,11 +6430,13 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: .LBB58_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a1 ; RV64ZVE32F-NEXT: fsh fa5, 0(a0) @@ -6467,10 +6473,10 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: beqz a2, .LBB59_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB59_2: # %else @@ -6480,11 +6486,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB59_4: # %else2 @@ -6507,11 +6513,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB59_9: # %else10 @@ -6526,11 +6532,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB59_12: # %cond.store3 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -6539,11 +6545,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: 
vslidedown.vi v9, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 @@ -6551,11 +6557,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: .LBB59_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 @@ -6563,11 +6569,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: j .LBB59_9 ; RV64ZVE32F-NEXT: .LBB59_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -6575,11 +6581,13 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: .LBB59_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a1 ; RV64ZVE32F-NEXT: fsh fa5, 0(a0) @@ -6615,11 +6623,11 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: beqz a2, .LBB60_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB60_2: # %else @@ -6629,12 +6637,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; 
RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB60_4: # %else2 @@ -6657,12 +6665,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB60_9: # %else10 @@ -6677,12 +6685,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB60_12: # %cond.store3 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -6691,12 +6699,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 @@ -6704,12 +6712,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: .LBB60_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 @@ -6717,12 +6725,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: j .LBB60_9 ; RV64ZVE32F-NEXT: .LBB60_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -6730,12 +6738,14 @@ define void 
@mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: .LBB60_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: andi a1, a1, 255 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a1 ; RV64ZVE32F-NEXT: fsh fa5, 0(a0) @@ -6772,9 +6782,9 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB61_2: # %else @@ -6784,11 +6794,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB61_4: # %else2 @@ -6811,11 +6821,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB61_9: # %else10 @@ -6830,11 +6840,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB61_12: # %cond.store3 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -6843,11 +6853,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, 
a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 @@ -6855,10 +6865,10 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: .LBB61_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 @@ -6866,11 +6876,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: j .LBB61_9 ; RV64ZVE32F-NEXT: .LBB61_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -6878,11 +6888,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: .LBB61_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a1 ; RV64ZVE32F-NEXT: fsh fa5, 0(a0) @@ -7146,12 +7156,12 @@ define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x ptr> %ptrs) { ; RV64ZVE32F-ZVFH-NEXT: ld a3, 16(a0) ; RV64ZVE32F-ZVFH-NEXT: ld a0, 24(a0) ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vse16.v v8, (a1) ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-ZVFH-NEXT: vse16.v v8, (a1) ; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a3) ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a3) ; RV64ZVE32F-ZVFH-NEXT: vse16.v v8, (a0) ; RV64ZVE32F-ZVFH-NEXT: ret ; @@ -7163,19 +7173,19 @@ define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x ptr> %ptrs) { ; RV64ZVE32F-ZVFHMIN-NEXT: ld a0, 24(a0) ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a1) -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a4, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a1 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a3) -; 
RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v8 -; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a4 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a0) ; RV64ZVE32F-ZVFHMIN-NEXT: ret call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> %val, <4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1)) @@ -7529,10 +7539,10 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB68_2 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_2: # %else @@ -7542,11 +7552,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_4: # %else2 @@ -7569,11 +7579,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_9: # %else10 @@ -7588,11 +7598,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: ret ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_12: # %cond.store3 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -7601,11 +7611,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; 
RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 3 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 @@ -7613,11 +7623,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_14: # %cond.store7 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 @@ -7625,11 +7635,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB68_9 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_15: # %cond.store11 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 @@ -7637,11 +7647,13 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_16: # %cond.store13 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a1, a1, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a0, a0, a1 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a1 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a0) @@ -7788,10 +7800,10 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB69_2 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_2: # %else @@ -7801,11 +7813,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; 
RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_4: # %else2 @@ -7828,11 +7840,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_9: # %else10 @@ -7847,11 +7859,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: ret ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_12: # %cond.store3 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -7860,11 +7872,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 3 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 @@ -7872,11 +7884,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_14: # %cond.store7 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 @@ -7884,11 +7896,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB69_9 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_15: # %cond.store11 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: 
add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 @@ -7896,11 +7908,13 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_16: # %cond.store13 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a1, a1, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a0, a0, a1 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a1 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a0) @@ -8054,11 +8068,11 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB70_2 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_2: # %else @@ -8068,12 +8082,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_4: # %else2 @@ -8096,12 +8110,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; 
RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_9: # %else10 @@ -8116,12 +8130,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: ret ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_12: # %cond.store3 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -8130,12 +8144,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 3 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 @@ -8143,12 +8157,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_14: # %cond.store7 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 @@ -8156,12 +8170,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB70_9 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_15: # %cond.store11 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a2, 255 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 @@ -8169,12 +8183,14 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_16: # %cond.store13 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; 
RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, 255 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a1, a1, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a0, a0, a1 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a1 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a0) @@ -8320,9 +8336,9 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_2: # %else @@ -8332,11 +8348,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_4: # %else2 @@ -8359,11 +8375,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_9: # %else10 @@ -8378,11 +8394,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: ret ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_12: # %cond.store3 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -8391,11 +8407,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; 
RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 3 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 @@ -8403,10 +8419,10 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_14: # %cond.store7 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 @@ -8414,11 +8430,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB71_9 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_15: # %cond.store11 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 @@ -8426,11 +8442,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_16: # %cond.store13 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a1, a1, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a0, a0, a1 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a1 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a0) @@ -8603,12 +8619,12 @@ define void @mscatter_truemask_v4f32(<4 x float> %val, <4 x ptr> %ptrs) { ; RV64ZVE32F-NEXT: ld a3, 16(a0) ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v8, (a1) ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v8, (a1) ; RV64ZVE32F-NEXT: vse32.v v9, (a2) ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v9, (a3) ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v9, (a3) ; RV64ZVE32F-NEXT: vse32.v v8, (a0) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %val, <4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 1)) @@ -9925,14 +9941,14 @@ define void @mscatter_truemask_v4f64(<4 x double> %val, <4 x ptr> %ptrs) { ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: fsd fa0, 0(a0) ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; 
RV32ZVE32F-NEXT: fsd fa0, 0(a0) ; RV32ZVE32F-NEXT: vmv.x.s a0, v9 -; RV32ZVE32F-NEXT: fsd fa1, 0(a0) ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: fsd fa1, 0(a0) ; RV32ZVE32F-NEXT: vmv.x.s a0, v9 ; RV32ZVE32F-NEXT: fsd fa2, 0(a0) -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: fsd fa3, 0(a0) ; RV32ZVE32F-NEXT: ret @@ -10153,11 +10169,11 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8> ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: bnez a2, .LBB91_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -10353,11 +10369,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: bnez a2, .LBB92_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -10555,11 +10571,11 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vzext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: bnez a2, .LBB93_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -10764,11 +10780,11 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16 ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: bnez a2, .LBB94_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -10965,11 +10981,11 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: bnez a2, .LBB95_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -11168,11 
+11184,11 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: bnez a2, .LBB96_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -11991,21 +12007,21 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, ptr %base, <8 x i64> %idx ; RV32ZVE32F-NEXT: lw a6, 0(a1) ; RV32ZVE32F-NEXT: lw a7, 8(a1) ; RV32ZVE32F-NEXT: lw t0, 16(a1) -; RV32ZVE32F-NEXT: lw a1, 24(a1) +; RV32ZVE32F-NEXT: lw t1, 24(a1) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.v.x v8, a6 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a1, v0 ; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: bnez a2, .LBB100_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -12902,8 +12918,8 @@ define void @mscatter_shuffle_rotate(<8 x i16> %val, ptr %base) { ; RV64ZVE32F-NEXT: addi a6, a0, 10 ; RV64ZVE32F-NEXT: addi a7, a0, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vse16.v v8, (a7) ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v8, (a7) ; RV64ZVE32F-NEXT: vse16.v v9, (a6) ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v9, (a5) @@ -12914,8 +12930,8 @@ define void @mscatter_shuffle_rotate(<8 x i16> %val, ptr %base) { ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vse16.v v9, (a3) ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 6 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: vse16.v v8, (a1) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll index f7e311d06c03a..ed6ec4d5659b1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll @@ -278,12 +278,12 @@ define void @masked_store_v64f32(<64 x float> %val, ptr %a, <64 x i1> %mask) { ; CHECK-LABEL: masked_store_v64f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 4 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vse32.v v8, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 4 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v 
v0, v24 ; CHECK-NEXT: vse32.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.masked.store.v64f32.p0(<64 x float> %val, ptr %a, i32 8, <64 x i1> %mask) @@ -294,12 +294,12 @@ define void @masked_store_v128bf16(<128 x bfloat> %val, ptr %a, <128 x i1> %mask ; CHECK-LABEL: masked_store_v128bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 8 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vse16.v v8, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 8 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vse16.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.masked.store.v128bf16.p0(<128 x bfloat> %val, ptr %a, i32 8, <128 x i1> %mask) @@ -310,12 +310,12 @@ define void @masked_store_v128f16(<128 x half> %val, ptr %a, <128 x i1> %mask) { ; CHECK-LABEL: masked_store_v128f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 8 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vse16.v v8, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 8 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vse16.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.masked.store.v128f16.p0(<128 x half> %val, ptr %a, i32 8, <128 x i1> %mask) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll index 0c9bf9a09fd6d..c3b10db115bae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll @@ -276,12 +276,12 @@ define void @masked_store_v64i32(<64 x i32> %val, ptr %a, <64 x i1> %mask) { ; CHECK-LABEL: masked_store_v64i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 4 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vse32.v v8, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 4 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vse32.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.masked.store.v64i32.p0(<64 x i32> %val, ptr %a, i32 8, <64 x i1> %mask) @@ -303,12 +303,12 @@ define void @masked_store_v128i16(<128 x i16> %val, ptr %a, <128 x i1> %mask) { ; CHECK-LABEL: masked_store_v128i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 8 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vse16.v v8, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 8 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vse16.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.masked.store.v128i16.p0(<128 x i16> %val, ptr %a, i32 8, <128 x i1> %mask) @@ -321,10 +321,10 @@ define void @masked_store_v256i8(<256 x i8> %val, ptr %a, <256 x i1> %mask) { ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v24, (a1) +; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vse8.v v8, (a0), v0.t -; CHECK-NEXT: 
addi a0, a0, 128 ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vse8.v v16, (a0), v0.t +; CHECK-NEXT: vse8.v v16, (a1), v0.t ; CHECK-NEXT: ret call void @llvm.masked.store.v256i8.p0(<256 x i8> %val, ptr %a, i32 8, <256 x i1> %mask) ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll index 3fab9ce636786..46c2033d28b38 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll @@ -135,9 +135,9 @@ declare <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half>, <16 x i1>, i32) define <16 x half> @vp_nearbyint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v16f16: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI6_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI6_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -393,9 +393,9 @@ declare <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_nearbyint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -437,9 +437,9 @@ declare <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_nearbyint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -481,9 +481,9 @@ declare <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_nearbyint_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v15f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -525,9 +525,9 @@ declare <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_nearbyint_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v16f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -578,9 +578,15 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 
0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) -; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -589,14 +595,18 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: addi a2, a0, -16 +; CHECK-NEXT: sltu a0, a0, a2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: fsflags a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t @@ -610,6 +620,12 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl) ret <32 x double> %v @@ -624,30 +640,33 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: lui a2, %hi(.LCPI27_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; CHECK-NEXT: addi a2, a0, -16 +; CHECK-NEXT: sltu a0, a0, a2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: frflags a2 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: frflags a1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v7, v24, fa5 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a1 +; CHECK-NEXT: fsflags a2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: frflags a1 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: fsflags a1 ; CHECK-NEXT: ret %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x double> %v diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll index a8798474d669a..4f0f5dd78c94b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll @@ -437,8 +437,8 @@ define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) { ; RV32-NEXT: vslidedown.vi v9, v8, 1 ; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: vslidedown.vi v9, v8, 2 -; RV32-NEXT: vmv.x.s a2, v9 ; RV32-NEXT: vslidedown.vi v8, v8, 3 +; RV32-NEXT: vmv.x.s a2, v9 ; RV32-NEXT: vmv.x.s a3, v8 ; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: add a0, a0, a3 @@ -452,8 +452,8 @@ define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) { ; RV64-NEXT: vslidedown.vi v9, v8, 1 ; RV64-NEXT: vmv.x.s a1, v9 ; RV64-NEXT: vslidedown.vi v9, v8, 2 -; RV64-NEXT: vmv.x.s a2, v9 ; RV64-NEXT: vslidedown.vi v8, v8, 3 +; RV64-NEXT: vmv.x.s a2, v9 ; RV64-NEXT: vmv.x.s a3, v8 ; RV64-NEXT: add a1, a1, a2 ; RV64-NEXT: add a0, a0, a3 @@ -856,13 +856,13 @@ define float @reduce_fadd_4xi32_non_associative(ptr %p) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vslidedown.vi v9, v8, 3 -; CHECK-NEXT: vfmv.f.s fa5, v9 ; CHECK-NEXT: lui a0, 524288 ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa4, v8 +; CHECK-NEXT: vfredusum.vs v9, v8, v9 +; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: vfmv.f.s fa4, v9 ; CHECK-NEXT: fadd.s fa0, fa4, fa5 ; CHECK-NEXT: ret %v = load <4 x float>, ptr %p, align 256 @@ -887,8 +887,8 @@ define float @reduce_fadd_4xi32_non_associative2(ptr %p) { ; CHECK-NEXT: vslidedown.vi v9, v8, 1 ; CHECK-NEXT: vfmv.f.s fa4, v9 ; CHECK-NEXT: vslidedown.vi v9, v8, 2 -; CHECK-NEXT: vfmv.f.s fa3, v9 ; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vfmv.f.s fa3, v9 ; CHECK-NEXT: vfmv.f.s fa2, v8 ; CHECK-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-NEXT: fadd.s fa4, fa3, fa2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index 487234674befe..8bf30f8f0d072 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -797,13 +797,13 @@ define float @vreduce_ord_fwadd_v64f32(ptr %x, float %s) { ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfmv.s.f v12, fa0 +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v12 -; CHECK-NEXT: vfwredosum.vs v8, v16, v8 +; CHECK-NEXT: vfwredosum.vs v8, v8, v16 +; CHECK-NEXT: vfwredosum.vs v8, v24, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1194,13 +1194,13 @@ define double @vreduce_ord_fwadd_v32f64(ptr %x, double %s) { ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e64, m1, ta, ma -; CHECK-NEXT: vfmv.s.f v12, fa0 +; CHECK-NEXT: 
vfmv.s.f v16, fa0 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v12 -; CHECK-NEXT: vfwredosum.vs v8, v16, v8 +; CHECK-NEXT: vfwredosum.vs v8, v8, v16 +; CHECK-NEXT: vfwredosum.vs v8, v24, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1364,17 +1364,17 @@ define float @vreduce_fmin_v128f32(ptr %x) { ; CHECK-LABEL: vreduce_fmin_v128f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: addi a2, a0, 384 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vle32.v v8, (a2) ; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v24, (a0) ; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmin.vv v16, v24, v16 -; CHECK-NEXT: vfmin.vv v8, v8, v0 -; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vfmin.vv v8, v24, v8 +; CHECK-NEXT: vfmin.vv v16, v16, v0 +; CHECK-NEXT: vfmin.vv v8, v16, v8 ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1611,17 +1611,17 @@ define float @vreduce_fmax_v128f32(ptr %x) { ; CHECK-LABEL: vreduce_fmax_v128f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: addi a2, a0, 384 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vle32.v v8, (a2) ; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v24, (a0) ; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmax.vv v16, v24, v16 -; CHECK-NEXT: vfmax.vv v8, v8, v0 -; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vfmax.vv v8, v24, v8 +; CHECK-NEXT: vfmax.vv v16, v16, v0 +; CHECK-NEXT: vfmax.vv v8, v16, v8 ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2013,80 +2013,61 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: addi a2, a0, 128 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a2) +; CHECK-NEXT: vle32.v v8, (a2) ; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vle32.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v 
v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v8, v8, v16 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v8, v16, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vfmin.vv v16, v8, v16 +; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v24, v24, v8 -; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 @@ -2101,10 +2082,7 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: .LBB121_3: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -2119,17 +2097,17 @@ define float @vreduce_fminimum_v128f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v128f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: addi a2, a0, 384 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vle32.v v8, (a2) ; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v24, (a0) ; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmin.vv v16, v24, v16 -; CHECK-NEXT: vfmin.vv v8, v8, v0 -; CHECK-NEXT: vfmin.vv v8, v8, v16 +; 
CHECK-NEXT: vfmin.vv v8, v24, v8 +; CHECK-NEXT: vfmin.vv v16, v16, v0 +; CHECK-NEXT: vfmin.vv v8, v16, v8 ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2353,79 +2331,60 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: addi a1, a0, 384 +; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v8, v8, v16 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v8, v16, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vfmin.vv v16, v8, v16 +; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v24, v24, v8 -; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; 
CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 @@ -2440,10 +2399,7 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: .LBB133_3: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -2459,9 +2415,9 @@ define double @vreduce_fminimum_v64f64_nonans(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a1, a0, 384 +; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: addi a2, a0, 384 -; CHECK-NEXT: vle64.v v16, (a2) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle64.v v24, (a0) ; CHECK-NEXT: vle64.v v0, (a1) @@ -2771,80 +2727,61 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: addi a2, a0, 128 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a2) +; CHECK-NEXT: vle32.v v8, (a2) ; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vle32.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v8, v8, v16 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v8, 
v16, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vfmax.vv v16, v8, v16 +; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v24, v24, v8 -; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 @@ -2859,10 +2796,7 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: .LBB149_3: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -2877,17 +2811,17 @@ define float @vreduce_fmaximum_v128f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v128f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: addi a2, a0, 384 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vle32.v v8, (a2) ; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v24, (a0) ; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmax.vv v16, v24, v16 -; CHECK-NEXT: vfmax.vv v8, v8, v0 -; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vfmax.vv v8, v24, v8 +; CHECK-NEXT: vfmax.vv v16, v16, v0 +; CHECK-NEXT: vfmax.vv v8, v16, v8 ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -3111,79 +3045,60 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 
0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: addi a1, a0, 384 +; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v8, v8, v16 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v8, v16, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vfmax.vv v16, v8, v16 +; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v24, v24, v8 -; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 @@ 
-3198,10 +3113,7 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: .LBB161_3: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -3217,9 +3129,9 @@ define double @vreduce_fmaximum_v64f64_nonans(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a1, a0, 384 +; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: addi a2, a0, 384 -; CHECK-NEXT: vle64.v v16, (a2) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle64.v v24, (a0) ; CHECK-NEXT: vle64.v v0, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll index 0a3c4874c5e8b..f920e39e7d295 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll @@ -811,9 +811,9 @@ define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1> ; CHECK-NEXT: .LBB49_2: ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmv.s.x v25, a0 +; CHECK-NEXT: addi a0, a1, -32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t -; CHECK-NEXT: addi a0, a1, -32 ; CHECK-NEXT: sltu a1, a1, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 @@ -838,10 +838,10 @@ define signext i64 @vpreduce_add_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -873,10 +873,10 @@ define signext i64 @vpreduce_umax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> % ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -908,10 +908,10 @@ define signext i64 @vpreduce_smax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> % ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -943,10 +943,10 @@ define signext i64 @vpreduce_umin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> % ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -978,10 +978,10 
@@ define signext i64 @vpreduce_smin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> % ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1013,10 +1013,10 @@ define signext i64 @vpreduce_and_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredand.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1048,10 +1048,10 @@ define signext i64 @vpreduce_or_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredor.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1083,10 +1083,10 @@ define signext i64 @vpreduce_xor_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1118,10 +1118,10 @@ define signext i64 @vpreduce_add_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredsum.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1153,10 +1153,10 @@ define signext i64 @vpreduce_umax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> % ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredmaxu.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1188,10 +1188,10 @@ define signext i64 @vpreduce_smax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> % ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredmax.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1223,10 +1223,10 @@ define signext i64 @vpreduce_umin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> % ; RV32-NEXT: addi a0, sp, 8 ; 
RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredminu.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1258,10 +1258,10 @@ define signext i64 @vpreduce_smin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> % ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredmin.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1293,10 +1293,10 @@ define signext i64 @vpreduce_and_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredand.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1328,10 +1328,10 @@ define signext i64 @vpreduce_or_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredor.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1363,10 +1363,10 @@ define signext i64 @vpreduce_xor_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredxor.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1789,24 +1789,24 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m, ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: li a3, 32 -; RV32-NEXT: lui a2, %hi(.LCPI72_0) -; RV32-NEXT: addi a2, a2, %lo(.LCPI72_0) -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vle8.v v12, (a2) ; RV32-NEXT: mv a2, a0 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: lui a3, %hi(.LCPI72_0) +; RV32-NEXT: addi a3, a3, %lo(.LCPI72_0) +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vle8.v v12, (a3) ; RV32-NEXT: vid.v v16 ; RV32-NEXT: vmsltu.vx v14, v16, a1 +; RV32-NEXT: li a3, 64 ; RV32-NEXT: vsext.vf4 v16, v12 ; RV32-NEXT: vmsltu.vx v12, v16, a1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vslideup.vi v14, v12, 4 -; RV32-NEXT: li a0, 64 -; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV32-NEXT: vsetvli zero, a3, e8, m4, ta, ma ; RV32-NEXT: vmand.mm v0, v14, v0 ; RV32-NEXT: vmv.v.i v12, 1 ; RV32-NEXT: vmerge.vvm v8, v12, v8, v0 -; RV32-NEXT: vslidedown.vx v12, v8, a3 +; RV32-NEXT: vslidedown.vx v12, v8, a0 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 16 ; 
RV32-NEXT: vmul.vv v8, v8, v12 @@ -1835,24 +1835,24 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m, ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: li a3, 32 -; RV64-NEXT: lui a2, %hi(.LCPI72_0) -; RV64-NEXT: addi a2, a2, %lo(.LCPI72_0) -; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV64-NEXT: vle8.v v12, (a2) ; RV64-NEXT: mv a2, a0 +; RV64-NEXT: li a0, 32 +; RV64-NEXT: lui a3, %hi(.LCPI72_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI72_0) +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vle8.v v12, (a3) ; RV64-NEXT: vid.v v16 ; RV64-NEXT: vmsltu.vx v14, v16, a1 +; RV64-NEXT: li a3, 64 ; RV64-NEXT: vsext.vf4 v16, v12 ; RV64-NEXT: vmsltu.vx v12, v16, a1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vslideup.vi v14, v12, 4 -; RV64-NEXT: li a0, 64 -; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV64-NEXT: vsetvli zero, a3, e8, m4, ta, ma ; RV64-NEXT: vmand.mm v0, v14, v0 ; RV64-NEXT: vmv.v.i v12, 1 ; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 -; RV64-NEXT: vslidedown.vx v12, v8, a3 +; RV64-NEXT: vslidedown.vx v12, v8, a0 ; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vmul.vv v8, v8, v12 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index a4a104abd2ef8..2ea618bf8a226 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -967,8 +967,8 @@ define i64 @vwreduce_add_v1i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vsext.vf2 v9, v8 ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsext.vf2 v9, v8 ; RV32-NEXT: vsrl.vx v8, v9, a0 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: vmv.x.s a0, v9 @@ -992,8 +992,8 @@ define i64 @vwreduce_uadd_v1i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vzext.vf2 v9, v8 ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vzext.vf2 v9, v8 ; RV32-NEXT: vsrl.vx v8, v9, a0 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: vmv.x.s a0, v9 @@ -1020,9 +1020,9 @@ define i64 @vreduce_add_v2i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredsum.vs v8, v8, v9 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1047,11 +1047,11 @@ define i64 @vwreduce_add_v2i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV32-NEXT: vwredsum.vs v8, v8, v9 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1078,11 +1078,11 @@ define i64 @vwreduce_uadd_v2i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV32-NEXT: vwredsumu.vs v8, v8, v9 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ 
-1111,9 +1111,9 @@ define i64 @vreduce_add_v4i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredsum.vs v8, v8, v10 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1138,11 +1138,11 @@ define i64 @vwreduce_add_v4i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32-NEXT: vwredsum.vs v8, v8, v9 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1169,11 +1169,11 @@ define i64 @vwreduce_uadd_v4i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32-NEXT: vwredsumu.vs v8, v8, v9 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1202,9 +1202,9 @@ define i64 @vreduce_add_v8i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vmv.s.x v12, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredsum.vs v8, v8, v12 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1229,11 +1229,11 @@ define i64 @vwreduce_add_v8i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32-NEXT: vwredsum.vs v8, v8, v10 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1260,11 +1260,11 @@ define i64 @vwreduce_uadd_v8i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32-NEXT: vwredsumu.vs v8, v8, v10 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1293,9 +1293,9 @@ define i64 @vreduce_add_v16i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1320,11 +1320,11 @@ define i64 @vwreduce_add_v16i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: vmv.s.x v12, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-NEXT: vwredsum.vs v8, v8, v12 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1351,11 +1351,11 @@ define i64 @vwreduce_uadd_v16i64(ptr %x) { ; RV32-NEXT: vsetivli 
zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: vmv.s.x v12, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-NEXT: vwredsumu.vs v8, v8, v12 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1387,9 +1387,9 @@ define i64 @vreduce_add_v32i64(ptr %x) { ; RV32-NEXT: vle64.v v16, (a0) ; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1494,21 +1494,21 @@ declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>) define i64 @vreduce_add_v64i64(ptr %x) nounwind { ; RV32-LABEL: vreduce_add_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) -; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v24, (a0) +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: vadd.vv v8, v8, v0 +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: addi a0, a0, 256 +; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: vadd.vv v24, v0, v24 +; RV32-NEXT: vmv.s.x v7, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredsum.vs v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: vredsum.vs v8, v8, v7 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1539,51 +1539,27 @@ define i64 @vreduce_add_v64i64(ptr %x) nounwind { define i64 @vwreduce_add_v64i64(ptr %x) { ; RV32-LABEL: vwreduce_add_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vle32.v v16, (a0) +; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v8, 16 ; RV32-NEXT: vslidedown.vi v0, v16, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwadd.vv v8, v24, v0 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vwadd.vv v0, v8, v16 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwadd.vv v24, v16, v8 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vwadd.vv v16, 
v0, v8 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vadd.vv v8, v24, v16 ; RV32-NEXT: vmv.s.x v16, zero ; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: vmv.x.s a1, v8 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add sp, sp, a2 -; RV32-NEXT: .cfi_def_cfa sp, 16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vwreduce_add_v64i64: @@ -1591,41 +1567,30 @@ define i64 @vwreduce_add_v64i64(ptr %x) { ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: li a2, 32 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vle32.v v16, (a1) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vslidedown.vi v0, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwadd.vv v8, v24, v0 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vwadd.vv v0, v8, v16 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwadd.vv v24, v8, v16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwadd.vv v8, v16, v0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vadd.vv v8, v0, v8 +; RV64-NEXT: vadd.vv v8, v24, v8 ; RV64-NEXT: vmv.s.x v16, zero ; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add sp, sp, a1 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 @@ -1640,51 +1605,27 @@ define i64 @vwreduce_add_v64i64(ptr %x) { define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV32-LABEL: vwreduce_uadd_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vle32.v v16, (a0) +; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v8, 16 ; RV32-NEXT: vslidedown.vi v0, v16, 16 ; RV32-NEXT: vsetivli zero, 
16, e32, m4, ta, ma -; RV32-NEXT: vwaddu.vv v8, v24, v0 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vwaddu.vv v0, v8, v16 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwaddu.vv v24, v16, v8 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vwaddu.vv v16, v0, v8 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vadd.vv v8, v24, v16 ; RV32-NEXT: vmv.s.x v16, zero ; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: vmv.x.s a1, v8 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add sp, sp, a2 -; RV32-NEXT: .cfi_def_cfa sp, 16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vwreduce_uadd_v64i64: @@ -1692,41 +1633,30 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: li a2, 32 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vle32.v v16, (a1) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vslidedown.vi v0, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwaddu.vv v8, v24, v0 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vwaddu.vv v0, v8, v16 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwaddu.vv v24, v8, v16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwaddu.vv v8, v16, v0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vadd.vv v8, v0, v8 +; RV64-NEXT: vadd.vv v8, v24, v8 ; RV64-NEXT: vmv.s.x v16, zero ; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add sp, sp, a1 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 @@ -2162,8 +2092,8 @@ define i64 @vreduce_and_v2i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vredand.vs v8, 
v8, v8 ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vredand.vs v8, v8, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 @@ -2189,9 +2119,9 @@ define i64 @vreduce_and_v4i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredand.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2216,9 +2146,9 @@ define i64 @vreduce_and_v8i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredand.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2243,9 +2173,9 @@ define i64 @vreduce_and_v16i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredand.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2272,10 +2202,10 @@ define i64 @vreduce_and_v32i64(ptr %x) { ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vredand.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2307,14 +2237,14 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind { ; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v24, (a0) -; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vand.vv v16, v24, v16 -; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vredand.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2324,9 +2254,9 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi a1, a0, 384 +; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 -; RV64-NEXT: addi a2, a0, 384 -; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle64.v v24, (a0) ; RV64-NEXT: vle64.v v0, (a1) @@ -2763,8 +2693,8 @@ define i64 @vreduce_or_v2i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vredor.vs v8, v8, v8 ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vredor.vs v8, v8, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 @@ -2790,9 +2720,9 @@ define i64 @vreduce_or_v4i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredor.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2817,9 +2747,9 @@ define 
i64 @vreduce_or_v8i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredor.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2844,9 +2774,9 @@ define i64 @vreduce_or_v16i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredor.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2873,10 +2803,10 @@ define i64 @vreduce_or_v32i64(ptr %x) { ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vredor.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2908,14 +2838,14 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind { ; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v24, (a0) -; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vor.vv v16, v0, v16 +; RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vredor.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2925,9 +2855,9 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi a1, a0, 384 +; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 -; RV64-NEXT: addi a2, a0, 384 -; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle64.v v24, (a0) ; RV64-NEXT: vle64.v v0, (a1) @@ -3386,9 +3316,9 @@ define i64 @vreduce_xor_v2i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredxor.vs v8, v8, v9 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -3415,9 +3345,9 @@ define i64 @vreduce_xor_v4i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredxor.vs v8, v8, v10 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -3444,9 +3374,9 @@ define i64 @vreduce_xor_v8i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vmv.s.x v12, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredxor.vs v8, v8, v12 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -3473,9 +3403,9 @@ define i64 @vreduce_xor_v16i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vmv.s.x v16, zero +; 
RV32-NEXT: li a1, 32 ; RV32-NEXT: vredxor.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -3505,9 +3435,9 @@ define i64 @vreduce_xor_v32i64(ptr %x) { ; RV32-NEXT: vle64.v v16, (a0) ; RV32-NEXT: vxor.vv v8, v8, v16 ; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredxor.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -3534,21 +3464,21 @@ declare i64 @llvm.vector.reduce.xor.v64i64(<64 x i64>) define i64 @vreduce_xor_v64i64(ptr %x) nounwind { ; RV32-LABEL: vreduce_xor_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) -; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v24, (a0) +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vxor.vv v16, v24, v16 -; RV32-NEXT: vxor.vv v8, v8, v0 +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: addi a0, a0, 256 +; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: vxor.vv v24, v0, v24 +; RV32-NEXT: vmv.s.x v7, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vxor.vv v8, v8, v16 -; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredxor.vs v8, v8, v16 +; RV32-NEXT: vxor.vv v8, v8, v24 +; RV32-NEXT: vredxor.vs v8, v8, v7 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -3999,8 +3929,8 @@ define i64 @vreduce_smin_v2i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vredmin.vs v8, v8, v8 ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vredmin.vs v8, v8, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 @@ -4026,9 +3956,9 @@ define i64 @vreduce_smin_v4i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredmin.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -4053,9 +3983,9 @@ define i64 @vreduce_smin_v8i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredmin.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -4080,9 +4010,9 @@ define i64 @vreduce_smin_v16i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredmin.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -4109,10 +4039,10 @@ define i64 @vreduce_smin_v32i64(ptr %x) { ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vmin.vv v8, v8, v16 ; RV32-NEXT: vredmin.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, 
e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -4144,14 +4074,14 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind { ; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v24, (a0) -; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vmin.vv v16, v24, v16 -; RV32-NEXT: vmin.vv v8, v8, v0 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmin.vv v16, v0, v16 +; RV32-NEXT: vmin.vv v8, v8, v24 ; RV32-NEXT: vmin.vv v8, v8, v16 ; RV32-NEXT: vredmin.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -4161,9 +4091,9 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi a1, a0, 384 +; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 -; RV64-NEXT: addi a2, a0, 384 -; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle64.v v24, (a0) ; RV64-NEXT: vle64.v v0, (a1) @@ -4601,8 +4531,8 @@ define i64 @vreduce_smax_v2i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vredmax.vs v8, v8, v8 ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vredmax.vs v8, v8, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 @@ -4628,9 +4558,9 @@ define i64 @vreduce_smax_v4i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredmax.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -4655,9 +4585,9 @@ define i64 @vreduce_smax_v8i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredmax.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -4682,9 +4612,9 @@ define i64 @vreduce_smax_v16i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredmax.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -4711,10 +4641,10 @@ define i64 @vreduce_smax_v32i64(ptr %x) { ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vmax.vv v8, v8, v16 ; RV32-NEXT: vredmax.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -4746,14 +4676,14 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind { ; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v24, (a0) -; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vmax.vv v16, v24, v16 -; RV32-NEXT: vmax.vv v8, v8, v0 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmax.vv v16, v0, v16 +; RV32-NEXT: vmax.vv v8, v8, v24 ; RV32-NEXT: vmax.vv v8, v8, v16 ; 
RV32-NEXT: vredmax.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -4763,9 +4693,9 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi a1, a0, 384 +; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 -; RV64-NEXT: addi a2, a0, 384 -; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle64.v v24, (a0) ; RV64-NEXT: vle64.v v0, (a1) @@ -5203,8 +5133,8 @@ define i64 @vreduce_umin_v2i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vredminu.vs v8, v8, v8 ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vredminu.vs v8, v8, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 @@ -5230,9 +5160,9 @@ define i64 @vreduce_umin_v4i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredminu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -5257,9 +5187,9 @@ define i64 @vreduce_umin_v8i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredminu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -5284,9 +5214,9 @@ define i64 @vreduce_umin_v16i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredminu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -5313,10 +5243,10 @@ define i64 @vreduce_umin_v32i64(ptr %x) { ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vminu.vv v8, v8, v16 ; RV32-NEXT: vredminu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -5348,14 +5278,14 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind { ; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v24, (a0) -; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vminu.vv v16, v24, v16 -; RV32-NEXT: vminu.vv v8, v8, v0 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vminu.vv v16, v0, v16 +; RV32-NEXT: vminu.vv v8, v8, v24 ; RV32-NEXT: vminu.vv v8, v8, v16 ; RV32-NEXT: vredminu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -5365,9 +5295,9 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi a1, a0, 384 +; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 -; RV64-NEXT: addi a2, a0, 384 -; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: addi a0, a0, 128 ; 
RV64-NEXT: vle64.v v24, (a0) ; RV64-NEXT: vle64.v v0, (a1) @@ -5804,8 +5734,8 @@ define i64 @vreduce_umax_v2i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: li a0, 32 +; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v9, v8, a0 ; RV32-NEXT: vmv.x.s a1, v9 @@ -5831,9 +5761,9 @@ define i64 @vreduce_umax_v4i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -5858,9 +5788,9 @@ define i64 @vreduce_umax_v8i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -5885,9 +5815,9 @@ define i64 @vreduce_umax_v16i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -5914,10 +5844,10 @@ define i64 @vreduce_umax_v32i64(ptr %x) { ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vmaxu.vv v8, v8, v16 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -5949,14 +5879,14 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind { ; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v24, (a0) -; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vmaxu.vv v16, v24, v16 -; RV32-NEXT: vmaxu.vv v8, v8, v0 +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmaxu.vv v16, v0, v16 +; RV32-NEXT: vmaxu.vv v8, v8, v24 ; RV32-NEXT: vmaxu.vv v8, v8, v16 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -5966,9 +5896,9 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi a1, a0, 384 +; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 -; RV64-NEXT: addi a2, a0, 384 -; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle64.v v24, (a0) ; RV64-NEXT: vle64.v v0, (a1) @@ -6191,8 +6121,8 @@ define i8 @vreduce_mul_v256i8(ptr %x) { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: li a0, 32 @@ -6565,9 +6495,9 @@ define i64 @vreduce_mul_v2i64(ptr %x) { ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, a0, 8 ; RV32-NEXT: vlse64.v v9, 
(a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -6593,12 +6523,12 @@ define i64 @vreduce_mul_v4i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vslidedown.vi v10, v8, 2 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: vrgather.vi v10, v8, 1 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -6626,6 +6556,7 @@ define i64 @vreduce_mul_v8i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vslidedown.vi v12, v8, 4 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 2 @@ -6633,7 +6564,6 @@ define i64 @vreduce_mul_v8i64(ptr %x) { ; RV32-NEXT: vrgather.vi v12, v8, 1 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -6663,6 +6593,7 @@ define i64 @vreduce_mul_v16i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vslidedown.vi v16, v8, 8 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 4 @@ -6672,7 +6603,6 @@ define i64 @vreduce_mul_v16i64(ptr %x) { ; RV32-NEXT: vrgather.vi v16, v8, 1 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll index dc0f4e7430555..ad358d7320240 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll @@ -231,16 +231,18 @@ declare i1 @llvm.vp.reduce.and.v256i1(i1, <256 x i1>, <256 x i1>, i32) define zeroext i1 @vpreduce_and_v256i1(i1 zeroext %s, <256 x i1> %v, <256 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_v256i1: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v11, v9 +; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: bltu a1, a3, .LBB14_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: vmv1r.v v0, v11 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vmnot.m v11, v0 -; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vcpop.m a2, v11, v0.t +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vcpop.m a2, v9, v0.t ; CHECK-NEXT: seqz a2, a2 ; CHECK-NEXT: and a0, a2, a0 ; CHECK-NEXT: addi a2, a1, -128 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll index b27492b43cfbb..b8617fda3aa7e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll @@ -123,9 +123,9 @@ declare <16 x half> @llvm.vp.rint.v16f16(<16 x half>, <16 x i1>, i32) define <16 x half> @vp_rint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v16f16: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, 
%hi(.LCPI6_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI6_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -357,9 +357,9 @@ declare <4 x double> @llvm.vp.rint.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_rint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -397,9 +397,9 @@ declare <8 x double> @llvm.vp.rint.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_rint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -437,9 +437,9 @@ declare <15 x double> @llvm.vp.rint.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_rint_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v15f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -477,9 +477,9 @@ declare <16 x double> @llvm.vp.rint.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_rint_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v16f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -532,26 +532,27 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: lui a1, %hi(.LCPI26_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t +; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; 
CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t @@ -583,15 +584,15 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: lui a2, %hi(.LCPI27_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: addi a2, a0, -16 ; CHECK-NEXT: sltu a0, a0, a2 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v7, v24, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll index 315fe257626fd..820a05e3d6042 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll @@ -32,10 +32,10 @@ define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -72,10 +72,10 @@ define <2 x half> @vp_round_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -114,10 +114,10 @@ define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -154,10 +154,10 @@ define <4 x half> @vp_round_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -197,10 +197,10 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vmv1r.v v9, v0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v9, v12, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -238,10 +238,10 @@ define <8 x half> @vp_round_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -261,9 +261,9 @@ declare <16 x half> @llvm.vp.round.v16f16(<16 x half>, <16 x i1>, i32) define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -283,10 +283,10 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vmv1r.v v10, v0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v10, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -324,10 +324,10 @@ define <16 x half> @vp_round_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -561,9 +561,9 @@ declare <4 x double> @llvm.vp.round.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_round_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -605,9 +605,9 @@ declare <8 x double> @llvm.vp.round.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_round_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -649,9 +649,9 @@ declare <15 x double> 
@llvm.vp.round.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_round_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v15f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -693,9 +693,9 @@ declare <16 x double> @llvm.vp.round.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_round_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v16f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.round.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v6, v0 +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: lui a1, %hi(.LCPI26_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a1, 4 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; 
CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -808,27 +818,30 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: lui a2, %hi(.LCPI27_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; CHECK-NEXT: addi a2, a0, -16 +; CHECK-NEXT: sltu a0, a0, a2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: fsrmi a2, 4 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a1, 4 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v7, v24, fa5 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: fsrm a2 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll index b510532408cb8..8391c7939180a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll @@ -32,10 +32,10 @@ define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext % ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v 
v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -72,10 +72,10 @@ define <2 x half> @vp_roundeven_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -114,10 +114,10 @@ define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext % ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -154,10 +154,10 @@ define <4 x half> @vp_roundeven_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -197,10 +197,10 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vmv1r.v v9, v0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v9, v12, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -238,10 +238,10 @@ define <8 x half> @vp_roundeven_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -261,9 +261,9 @@ declare <16 x half> @llvm.vp.roundeven.v16f16(<16 x half>, <16 x i1>, i32) define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ 
-283,10 +283,10 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe ; ZVFHMIN-NEXT: vmv1r.v v10, v0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v10, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -324,10 +324,10 @@ define <16 x half> @vp_roundeven_v16f16_unmasked(<16 x half> %va, i32 zeroext %e ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -561,9 +561,9 @@ declare <4 x double> @llvm.vp.roundeven.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_roundeven_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -605,9 +605,9 @@ declare <8 x double> @llvm.vp.roundeven.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_roundeven_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -649,9 +649,9 @@ declare <15 x double> @llvm.vp.roundeven.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_roundeven_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v15f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -693,9 +693,9 @@ declare <16 x double> @llvm.vp.roundeven.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_roundeven_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v16f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.roundeven.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v6, v0 +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; 
CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: lui a1, %hi(.LCPI26_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a1, 0 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: 
slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -808,27 +818,30 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: lui a2, %hi(.LCPI27_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; CHECK-NEXT: addi a2, a0, -16 +; CHECK-NEXT: sltu a0, a0, a2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: fsrmi a2, 0 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v7, v24, fa5 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: fsrm a2 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: fsrmi a1, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll index d0a41a2bb968c..8c38d24460265 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll @@ -32,10 +32,10 @@ define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 @@ -72,10 +72,10 @@ define <2 x half> @vp_roundtozero_v2f16_unmasked(<2 x half> %va, i32 zeroext %ev ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -114,10 +114,10 @@ define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, 
m1, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 @@ -154,10 +154,10 @@ define <4 x half> @vp_roundtozero_v4f16_unmasked(<4 x half> %va, i32 zeroext %ev ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -197,10 +197,10 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext ; ZVFHMIN-NEXT: vmv1r.v v9, v0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v9, v12, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 @@ -238,10 +238,10 @@ define <8 x half> @vp_roundtozero_v8f16_unmasked(<8 x half> %va, i32 zeroext %ev ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -261,9 +261,9 @@ declare <16 x half> @llvm.vp.roundtozero.v16f16(<16 x half>, <16 x i1>, i32) define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -283,10 +283,10 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer ; ZVFHMIN-NEXT: vmv1r.v v10, v0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v10, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 @@ -324,10 +324,10 @@ define <16 x half> @vp_roundtozero_v16f16_unmasked(<16 x half> %va, i32 zeroext ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -561,9 +561,9 @@ declare <4 x double> @llvm.vp.roundtozero.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_roundtozero_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v4f64: ; CHECK: # %bb.0: 
+; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -605,9 +605,9 @@ declare <8 x double> @llvm.vp.roundtozero.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_roundtozero_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -649,9 +649,9 @@ declare <15 x double> @llvm.vp.roundtozero.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_roundtozero_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v15f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -693,9 +693,9 @@ declare <16 x double> @llvm.vp.roundtozero.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_roundtozero_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v16f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.roundtozero.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v6, v0 +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: lui a1, %hi(.LCPI26_0) +; CHECK-NEXT: fld 
fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a1, 1 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -808,27 +818,30 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: lui a2, %hi(.LCPI27_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; CHECK-NEXT: addi a2, a0, -16 +; CHECK-NEXT: sltu a0, a0, a2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: fsrmi a2, 1 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a1, 1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v7, v24, fa5 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: fsrm a2 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 +; 
CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll index 80561be0ca2f5..8da605d35270d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll @@ -102,37 +102,37 @@ define signext i32 @sad_2block_16xi8_as_i32(ptr %a, ptr %b, i32 signext %stridea ; CHECK-NEXT: add a1, a1, a3 ; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vminu.vv v12, v8, v9 -; CHECK-NEXT: vmaxu.vv v8, v8, v9 -; CHECK-NEXT: vsub.vv v8, v8, v12 -; CHECK-NEXT: vminu.vv v9, v10, v11 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: add a1, a1, a3 ; CHECK-NEXT: vle8.v v12, (a0) ; CHECK-NEXT: vle8.v v13, (a1) -; CHECK-NEXT: vmaxu.vv v10, v10, v11 -; CHECK-NEXT: vsub.vv v9, v10, v9 -; CHECK-NEXT: vwaddu.vv v10, v9, v8 -; CHECK-NEXT: vminu.vv v8, v12, v13 -; CHECK-NEXT: vmaxu.vv v9, v12, v13 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: add a1, a1, a3 -; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vle8.v v13, (a1) -; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vminu.vv v14, v8, v9 +; CHECK-NEXT: vmaxu.vv v8, v8, v9 +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsub.vv v8, v8, v14 +; CHECK-NEXT: vminu.vv v14, v10, v11 +; CHECK-NEXT: vmaxu.vv v10, v10, v11 +; CHECK-NEXT: vle8.v v11, (a1) +; CHECK-NEXT: vsub.vv v10, v10, v14 +; CHECK-NEXT: vminu.vv v14, v12, v13 +; CHECK-NEXT: vmaxu.vv v12, v12, v13 +; CHECK-NEXT: vwaddu.vv v16, v10, v8 +; CHECK-NEXT: vsub.vv v8, v12, v14 +; CHECK-NEXT: vminu.vv v10, v9, v11 +; CHECK-NEXT: vmaxu.vv v9, v9, v11 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vzext.vf2 v14, v8 -; CHECK-NEXT: vwaddu.vv v16, v14, v10 +; CHECK-NEXT: vzext.vf2 v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vminu.vv v8, v12, v13 -; CHECK-NEXT: vmaxu.vv v9, v12, v13 -; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsub.vv v8, v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vwaddu.vv v20, v12, v16 ; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vwaddu.wv v16, v16, v10 +; CHECK-NEXT: vwaddu.wv v20, v20, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vredsum.vs v8, v16, v8 +; CHECK-NEXT: vredsum.vs v8, v20, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll index 4621f339ca882..6b81b781a898f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll @@ -14,33 +14,31 @@ define <8 x float> @fpext_v8bf16(<8 x bfloat> %x) { ; CHECK-NEXT: fmv.x.w a6, fa6 ; CHECK-NEXT: fmv.x.w a7, fa7 ; CHECK-NEXT: slli a7, a7, 16 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v8, a7 ; CHECK-NEXT: slli a6, a6, 16 -; CHECK-NEXT: vmv.s.x v9, a6 -; CHECK-NEXT: vslideup.vi v9, v8, 1 ; CHECK-NEXT: slli a5, a5, 16 -; CHECK-NEXT: vmv.s.x v8, a5 ; CHECK-NEXT: slli a4, a4, 16 -; CHECK-NEXT: vmv.s.x v10, a4 -; CHECK-NEXT: vslideup.vi v10, v8, 1 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, 
ma -; CHECK-NEXT: vslideup.vi v10, v9, 2 ; CHECK-NEXT: slli a3, a3, 16 -; CHECK-NEXT: vmv.s.x v8, a3 ; CHECK-NEXT: slli a2, a2, 16 -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v9, v8, 1 ; CHECK-NEXT: slli a1, a1, 16 -; CHECK-NEXT: vmv.s.x v11, a1 ; CHECK-NEXT: slli a0, a0, 16 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vmv.s.x v8, a7 +; CHECK-NEXT: vmv.s.x v9, a6 +; CHECK-NEXT: vmv.s.x v10, a5 +; CHECK-NEXT: vmv.s.x v12, a4 +; CHECK-NEXT: vmv.s.x v11, a3 +; CHECK-NEXT: vmv.s.x v13, a2 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv.s.x v14, a1 +; CHECK-NEXT: vslideup.vi v12, v10, 1 +; CHECK-NEXT: vslideup.vi v13, v11, 1 ; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: vslideup.vi v8, v11, 1 +; CHECK-NEXT: vslideup.vi v8, v14, 1 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v12, v9, 2 +; CHECK-NEXT: vslideup.vi v8, v13, 2 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vslideup.vi v8, v12, 4 ; CHECK-NEXT: ret %y = fpext <8 x bfloat> %x to <8 x float> ret <8 x float> %y @@ -58,33 +56,31 @@ define <8 x float> @fpext_v8f16(<8 x bfloat> %x) { ; CHECK-NEXT: fmv.x.w a6, fa6 ; CHECK-NEXT: fmv.x.w a7, fa7 ; CHECK-NEXT: slli a7, a7, 16 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v8, a7 ; CHECK-NEXT: slli a6, a6, 16 -; CHECK-NEXT: vmv.s.x v9, a6 -; CHECK-NEXT: vslideup.vi v9, v8, 1 ; CHECK-NEXT: slli a5, a5, 16 -; CHECK-NEXT: vmv.s.x v8, a5 ; CHECK-NEXT: slli a4, a4, 16 -; CHECK-NEXT: vmv.s.x v10, a4 -; CHECK-NEXT: vslideup.vi v10, v8, 1 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v10, v9, 2 ; CHECK-NEXT: slli a3, a3, 16 -; CHECK-NEXT: vmv.s.x v8, a3 ; CHECK-NEXT: slli a2, a2, 16 -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v9, v8, 1 ; CHECK-NEXT: slli a1, a1, 16 -; CHECK-NEXT: vmv.s.x v11, a1 ; CHECK-NEXT: slli a0, a0, 16 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vmv.s.x v8, a7 +; CHECK-NEXT: vmv.s.x v9, a6 +; CHECK-NEXT: vmv.s.x v10, a5 +; CHECK-NEXT: vmv.s.x v12, a4 +; CHECK-NEXT: vmv.s.x v11, a3 +; CHECK-NEXT: vmv.s.x v13, a2 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv.s.x v14, a1 +; CHECK-NEXT: vslideup.vi v12, v10, 1 +; CHECK-NEXT: vslideup.vi v13, v11, 1 ; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: vslideup.vi v8, v11, 1 +; CHECK-NEXT: vslideup.vi v8, v14, 1 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v12, v9, 2 +; CHECK-NEXT: vslideup.vi v8, v13, 2 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vslideup.vi v8, v12, 4 ; CHECK-NEXT: ret %y = fpext <8 x bfloat> %x to <8 x float> ret <8 x float> %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll index 5c8be062649f5..03d5762b4903e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -411,9 +411,9 @@ define <8 x i1> @fcmp_one_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vmv.v.x v8, 
a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v10, v12, v0.t ; ZVFHMIN-NEXT: vmflt.vv v9, v12, v10, v0.t @@ -438,9 +438,9 @@ define <8 x i1> @fcmp_one_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v12, v10, v0.t ; ZVFHMIN-NEXT: vmflt.vv v9, v10, v12, v0.t @@ -492,15 +492,15 @@ define <8 x i1> @fcmp_ord_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10, v0.t -; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmand.mm v0, v9, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -523,15 +523,15 @@ define <8 x i1> @fcmp_ord_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10, v0.t -; ZVFHMIN-NEXT: vmand.mm v0, v9, v8 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -575,9 +575,9 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v10, v12, v0.t ; ZVFHMIN-NEXT: vmflt.vv v9, v12, v10, v0.t @@ -602,9 +602,9 @@ define <8 x i1> @fcmp_ueq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; 
ZVFHMIN-NEXT: vmflt.vv v8, v12, v10, v0.t ; ZVFHMIN-NEXT: vmflt.vv v9, v10, v12, v0.t @@ -1008,15 +1008,15 @@ define <8 x i1> @fcmp_uno_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10, v0.t -; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -1039,15 +1039,15 @@ define <8 x i1> @fcmp_uno_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10, v0.t -; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -1066,38 +1066,38 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFH-NEXT: slli a1, a1, 4 ; ZVFH-NEXT: sub sp, sp, a1 ; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; ZVFH-NEXT: addi a1, a0, 128 -; ZVFH-NEXT: li a3, 64 -; ZVFH-NEXT: vsetvli zero, a3, e16, m8, ta, ma -; ZVFH-NEXT: vle16.v v24, (a1) ; ZVFH-NEXT: csrr a1, vlenb ; ZVFH-NEXT: slli a1, a1, 3 ; ZVFH-NEXT: add a1, sp, a1 ; ZVFH-NEXT: addi a1, a1, 16 -; ZVFH-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFH-NEXT: vle16.v v24, (a0) -; ZVFH-NEXT: addi a0, sp, 16 -; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; ZVFH-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; ZVFH-NEXT: vslidedown.vi v6, v0, 8 +; ZVFH-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFH-NEXT: addi a1, a0, 128 +; ZVFH-NEXT: li a3, 64 +; ZVFH-NEXT: vsetvli zero, a3, e16, m8, ta, ma +; ZVFH-NEXT: vle16.v v16, (a1) +; ZVFH-NEXT: addi a1, sp, 16 +; ZVFH-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFH-NEXT: vle16.v v16, (a0) ; ZVFH-NEXT: mv a0, a2 +; ZVFH-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVFH-NEXT: vslidedown.vi v24, v0, 8 ; ZVFH-NEXT: bltu a2, a3, .LBB43_2 ; ZVFH-NEXT: # %bb.1: ; ZVFH-NEXT: li a0, 64 ; ZVFH-NEXT: .LBB43_2: -; ZVFH-NEXT: addi a1, sp, 16 -; ZVFH-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFH-NEXT: vmfeq.vv v7, v8, v24, v0.t +; ZVFH-NEXT: vmfeq.vv v7, v8, v16, v0.t ; 
ZVFH-NEXT: addi a0, a2, -64 ; ZVFH-NEXT: sltu a1, a2, a0 ; ZVFH-NEXT: addi a1, a1, -1 ; ZVFH-NEXT: and a0, a1, a0 -; ZVFH-NEXT: vmv1r.v v0, v6 +; ZVFH-NEXT: vmv1r.v v0, v24 ; ZVFH-NEXT: csrr a1, vlenb ; ZVFH-NEXT: slli a1, a1, 3 ; ZVFH-NEXT: add a1, sp, a1 ; ZVFH-NEXT: addi a1, a1, 16 +; ZVFH-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVFH-NEXT: addi a1, sp, 16 ; ZVFH-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vmfeq.vv v8, v16, v24, v0.t @@ -1114,1757 +1114,2269 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ; ZVFHMIN32-LABEL: fcmp_oeq_vv_v128f16: ; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -768 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 768 -; ZVFHMIN32-NEXT: sw ra, 764(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s0, 760(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s2, 756(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s3, 752(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: addi sp, sp, -896 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 896 +; ZVFHMIN32-NEXT: sw ra, 892(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s0, 888(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s2, 884(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s3, 880(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s4, 876(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s5, 872(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s6, 868(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s7, 864(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s8, 860(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s9, 856(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s10, 852(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s11, 848(sp) # 4-byte Folded Spill ; ZVFHMIN32-NEXT: .cfi_offset ra, -4 ; ZVFHMIN32-NEXT: .cfi_offset s0, -8 ; ZVFHMIN32-NEXT: .cfi_offset s2, -12 ; ZVFHMIN32-NEXT: .cfi_offset s3, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 768 +; ZVFHMIN32-NEXT: .cfi_offset s4, -20 +; ZVFHMIN32-NEXT: .cfi_offset s5, -24 +; ZVFHMIN32-NEXT: .cfi_offset s6, -28 +; ZVFHMIN32-NEXT: .cfi_offset s7, -32 +; ZVFHMIN32-NEXT: .cfi_offset s8, -36 +; ZVFHMIN32-NEXT: .cfi_offset s9, -40 +; ZVFHMIN32-NEXT: .cfi_offset s10, -44 +; ZVFHMIN32-NEXT: .cfi_offset s11, -48 +; ZVFHMIN32-NEXT: addi s0, sp, 896 ; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: li a2, 30 +; ZVFHMIN32-NEXT: mul a1, a1, a2 +; ZVFHMIN32-NEXT: sub sp, sp, a1 ; ZVFHMIN32-NEXT: andi sp, sp, -128 ; ZVFHMIN32-NEXT: addi a1, a0, 128 ; ZVFHMIN32-NEXT: li a2, 64 +; ZVFHMIN32-NEXT: addi a3, sp, 640 +; ZVFHMIN32-NEXT: addi a4, sp, 384 +; ZVFHMIN32-NEXT: addi a5, sp, 512 ; ZVFHMIN32-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; ZVFHMIN32-NEXT: vle16.v v24, (a1) ; ZVFHMIN32-NEXT: vle16.v v0, (a0) -; ZVFHMIN32-NEXT: addi a0, sp, 512 -; ZVFHMIN32-NEXT: vse16.v v8, (a0) ; ZVFHMIN32-NEXT: addi a0, sp, 256 -; ZVFHMIN32-NEXT: vse16.v v0, (a0) -; ZVFHMIN32-NEXT: addi a0, sp, 384 -; ZVFHMIN32-NEXT: vse16.v v16, (a0) -; ZVFHMIN32-NEXT: addi a0, sp, 128 +; ZVFHMIN32-NEXT: vle16.v v24, (a1) +; ZVFHMIN32-NEXT: vse16.v v8, (a3) +; ZVFHMIN32-NEXT: vse16.v v0, (a4) +; ZVFHMIN32-NEXT: vse16.v v16, (a5) ; ZVFHMIN32-NEXT: vse16.v v24, (a0) +; ZVFHMIN32-NEXT: lh a0, 704(sp) +; ZVFHMIN32-NEXT: lh a1, 448(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 160(sp) +; ZVFHMIN32-NEXT: lh a0, 702(sp) +; ZVFHMIN32-NEXT: lh a1, 446(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: 
fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 159(sp) +; ZVFHMIN32-NEXT: lh a0, 700(sp) +; ZVFHMIN32-NEXT: lh a1, 444(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 158(sp) +; ZVFHMIN32-NEXT: lh a0, 698(sp) +; ZVFHMIN32-NEXT: lh a1, 442(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 157(sp) +; ZVFHMIN32-NEXT: lh a0, 696(sp) +; ZVFHMIN32-NEXT: lh a1, 440(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 156(sp) +; ZVFHMIN32-NEXT: lh a0, 694(sp) +; ZVFHMIN32-NEXT: lh a1, 438(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 155(sp) +; ZVFHMIN32-NEXT: lh a0, 692(sp) +; ZVFHMIN32-NEXT: lh a1, 436(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 154(sp) +; ZVFHMIN32-NEXT: lh a0, 690(sp) +; ZVFHMIN32-NEXT: lh a1, 434(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 153(sp) +; ZVFHMIN32-NEXT: lh a0, 688(sp) +; ZVFHMIN32-NEXT: lh a1, 432(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 152(sp) +; ZVFHMIN32-NEXT: lh a0, 686(sp) +; ZVFHMIN32-NEXT: lh a1, 430(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 151(sp) +; ZVFHMIN32-NEXT: lh a0, 684(sp) +; ZVFHMIN32-NEXT: lh a1, 428(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 150(sp) +; ZVFHMIN32-NEXT: lh a0, 682(sp) +; ZVFHMIN32-NEXT: lh a1, 426(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 149(sp) +; ZVFHMIN32-NEXT: lh a0, 680(sp) +; ZVFHMIN32-NEXT: lh a1, 424(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 148(sp) +; ZVFHMIN32-NEXT: lh a0, 678(sp) +; ZVFHMIN32-NEXT: lh a1, 422(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 147(sp) +; ZVFHMIN32-NEXT: lh a0, 676(sp) +; ZVFHMIN32-NEXT: lh a1, 420(sp) +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 146(sp) +; ZVFHMIN32-NEXT: lh a0, 674(sp) +; ZVFHMIN32-NEXT: lh a1, 418(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: vmv.x.s a2, v0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN32-NEXT: sb a0, 145(sp) +; ZVFHMIN32-NEXT: lh a0, 672(sp) +; ZVFHMIN32-NEXT: lh a1, 416(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a2, 128(sp) +; ZVFHMIN32-NEXT: sb a0, 144(sp) ; ZVFHMIN32-NEXT: lh a0, 576(sp) ; ZVFHMIN32-NEXT: lh a1, 320(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; 
ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 32(sp) +; ZVFHMIN32-NEXT: sb a0, 224(sp) ; ZVFHMIN32-NEXT: lh a0, 574(sp) ; ZVFHMIN32-NEXT: lh a1, 318(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 31(sp) +; ZVFHMIN32-NEXT: sb a0, 223(sp) ; ZVFHMIN32-NEXT: lh a0, 572(sp) ; ZVFHMIN32-NEXT: lh a1, 316(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 30(sp) +; ZVFHMIN32-NEXT: sb a0, 222(sp) ; ZVFHMIN32-NEXT: lh a0, 570(sp) ; ZVFHMIN32-NEXT: lh a1, 314(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 29(sp) +; ZVFHMIN32-NEXT: sb a0, 221(sp) ; ZVFHMIN32-NEXT: lh a0, 568(sp) ; ZVFHMIN32-NEXT: lh a1, 312(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 28(sp) +; ZVFHMIN32-NEXT: sb a0, 220(sp) ; ZVFHMIN32-NEXT: lh a0, 566(sp) ; ZVFHMIN32-NEXT: lh a1, 310(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 27(sp) +; ZVFHMIN32-NEXT: sb a0, 219(sp) ; ZVFHMIN32-NEXT: lh a0, 564(sp) ; ZVFHMIN32-NEXT: lh a1, 308(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 26(sp) +; ZVFHMIN32-NEXT: sb a0, 218(sp) ; ZVFHMIN32-NEXT: lh a0, 562(sp) ; ZVFHMIN32-NEXT: lh a1, 306(sp) +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 7 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 29 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 6 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 28 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 5 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 27 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 4 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 26 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 3 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 25 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 24 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 1 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 23 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; 
ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v26, v8, 15 +; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 14 +; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 13 +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 12 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 1 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v4, v8, 11 +; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 10 +; ZVFHMIN32-NEXT: vslidedown.vi v30, v8, 9 +; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 8 +; ZVFHMIN32-NEXT: vmv.x.s a4, v16 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 25(sp) +; ZVFHMIN32-NEXT: sb a0, 217(sp) ; ZVFHMIN32-NEXT: lh a0, 560(sp) ; ZVFHMIN32-NEXT: lh a1, 304(sp) +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 7 +; ZVFHMIN32-NEXT: vslidedown.vi v31, v16, 6 +; ZVFHMIN32-NEXT: vslidedown.vi v5, v16, 5 +; ZVFHMIN32-NEXT: vslidedown.vi v23, v16, 4 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 3 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 21 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 20 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 1 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 22 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v18, v16, 15 +; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 14 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 13 +; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 12 +; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 11 +; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 10 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 18 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 9 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 14 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 24(sp) +; ZVFHMIN32-NEXT: sb a0, 216(sp) ; ZVFHMIN32-NEXT: lh a0, 558(sp) ; ZVFHMIN32-NEXT: lh a1, 302(sp) +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v13, v0, 7 +; ZVFHMIN32-NEXT: vslidedown.vi v29, v0, 6 +; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 5 +; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 4 +; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 3 +; ZVFHMIN32-NEXT: vslidedown.vi v21, v0, 2 +; ZVFHMIN32-NEXT: vslidedown.vi v27, v0, 1 +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN32-NEXT: 
vslidedown.vi v16, v0, 15 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 2 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 14 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 13 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 6 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 12 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 12 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 11 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 10 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 10 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 4 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 9 +; ZVFHMIN32-NEXT: vslidedown.vi v0, v0, 8 +; ZVFHMIN32-NEXT: addi a2, sp, 848 +; ZVFHMIN32-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s t4, v26 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 23(sp) +; ZVFHMIN32-NEXT: sb a0, 215(sp) ; ZVFHMIN32-NEXT: lh a0, 556(sp) ; ZVFHMIN32-NEXT: lh a1, 300(sp) +; ZVFHMIN32-NEXT: vmv.x.s t3, v20 +; ZVFHMIN32-NEXT: vmv.x.s t1, v28 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 22(sp) +; ZVFHMIN32-NEXT: sb a0, 214(sp) ; ZVFHMIN32-NEXT: lh a0, 554(sp) ; ZVFHMIN32-NEXT: lh a1, 298(sp) +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 1 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t2, v0 +; ZVFHMIN32-NEXT: vmv.x.s t0, v4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 21(sp) +; ZVFHMIN32-NEXT: sb a0, 213(sp) ; ZVFHMIN32-NEXT: lh a0, 552(sp) ; ZVFHMIN32-NEXT: lh a1, 296(sp) +; ZVFHMIN32-NEXT: vmv.x.s a7, v2 +; ZVFHMIN32-NEXT: vmv.x.s a6, v30 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 20(sp) +; ZVFHMIN32-NEXT: sb a0, 212(sp) ; ZVFHMIN32-NEXT: lh a0, 550(sp) ; ZVFHMIN32-NEXT: lh a1, 294(sp) +; ZVFHMIN32-NEXT: vmv.x.s a5, v22 +; ZVFHMIN32-NEXT: vmv.x.s a2, v18 +; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 19(sp) -; ZVFHMIN32-NEXT: lh a0, 548(sp) -; ZVFHMIN32-NEXT: lh a1, 292(sp) +; ZVFHMIN32-NEXT: sb a0, 211(sp) +; ZVFHMIN32-NEXT: lh a1, 548(sp) +; ZVFHMIN32-NEXT: lh 
t5, 292(sp) +; ZVFHMIN32-NEXT: vmv.x.s a0, v14 +; ZVFHMIN32-NEXT: sw a0, 116(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a0, v8 +; ZVFHMIN32-NEXT: sw a0, 124(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t5 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 210(sp) +; ZVFHMIN32-NEXT: lh a1, 546(sp) +; ZVFHMIN32-NEXT: lh t5, 290(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN32-NEXT: vmv.x.s a4, v24 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa3, t5 +; ZVFHMIN32-NEXT: feq.h a1, fa4, fa3 +; ZVFHMIN32-NEXT: sb a1, 209(sp) +; ZVFHMIN32-NEXT: lh a1, 544(sp) +; ZVFHMIN32-NEXT: lh t5, 288(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t5 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a4, 192(sp) +; ZVFHMIN32-NEXT: sb a1, 208(sp) +; ZVFHMIN32-NEXT: lh t5, 738(sp) +; ZVFHMIN32-NEXT: lh t6, 482(sp) +; ZVFHMIN32-NEXT: vmv.x.s a0, v12 +; ZVFHMIN32-NEXT: sw a0, 108(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s a0, v10 +; ZVFHMIN32-NEXT: sw a0, 120(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 +; ZVFHMIN32-NEXT: sb t5, 177(sp) +; ZVFHMIN32-NEXT: lh t5, 736(sp) +; ZVFHMIN32-NEXT: lh t6, 480(sp) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a1, 29 +; ZVFHMIN32-NEXT: mul a0, a0, a1 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: lh s5, 848(a0) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a1, 28 +; ZVFHMIN32-NEXT: mul a0, a0, a1 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: lh s6, 848(a0) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 +; ZVFHMIN32-NEXT: sb t5, 176(sp) +; ZVFHMIN32-NEXT: lh t5, 734(sp) +; ZVFHMIN32-NEXT: lh t6, 478(sp) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a1, 27 +; ZVFHMIN32-NEXT: mul a0, a0, a1 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: lh s7, 848(a0) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a1, 26 +; ZVFHMIN32-NEXT: mul a0, a0, a1 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: lh s8, 848(a0) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 +; ZVFHMIN32-NEXT: sb t5, 175(sp) +; ZVFHMIN32-NEXT: lh t5, 732(sp) +; ZVFHMIN32-NEXT: lh t6, 476(sp) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a1, 25 +; ZVFHMIN32-NEXT: mul a0, a0, a1 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: lh s4, 848(a0) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a1, 24 +; ZVFHMIN32-NEXT: mul a0, a0, a1 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: lh s3, 848(a0) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 +; ZVFHMIN32-NEXT: sb t5, 174(sp) +; ZVFHMIN32-NEXT: lh t6, 730(sp) +; ZVFHMIN32-NEXT: lh s9, 474(sp) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a1, 23 +; ZVFHMIN32-NEXT: mul a0, a0, a1 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: lh s2, 848(a0) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t5, v3 +; ZVFHMIN32-NEXT: fmv.h.x fa5, t6 +; ZVFHMIN32-NEXT: fmv.h.x fa4, s9 +; ZVFHMIN32-NEXT: feq.h t6, fa5, fa4 +; ZVFHMIN32-NEXT: sb t6, 173(sp) +; ZVFHMIN32-NEXT: lh s9, 728(sp) +; 
ZVFHMIN32-NEXT: lh s10, 472(sp) +; ZVFHMIN32-NEXT: vmv.x.s t6, v31 +; ZVFHMIN32-NEXT: vmv.x.s ra, v13 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN32-NEXT: feq.h s9, fa5, fa4 +; ZVFHMIN32-NEXT: sb s9, 172(sp) +; ZVFHMIN32-NEXT: lh s9, 726(sp) +; ZVFHMIN32-NEXT: lh s10, 470(sp) +; ZVFHMIN32-NEXT: vmv.x.s a2, v29 +; ZVFHMIN32-NEXT: vmv.x.s a3, v11 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN32-NEXT: feq.h s9, fa5, fa4 +; ZVFHMIN32-NEXT: sb s9, 171(sp) +; ZVFHMIN32-NEXT: lh s10, 724(sp) +; ZVFHMIN32-NEXT: lh s11, 468(sp) +; ZVFHMIN32-NEXT: vmv.x.s a4, v7 +; ZVFHMIN32-NEXT: vmv.x.s s9, v9 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s10 +; ZVFHMIN32-NEXT: fmv.h.x fa4, s11 +; ZVFHMIN32-NEXT: feq.h s10, fa5, fa4 +; ZVFHMIN32-NEXT: sb s10, 170(sp) +; ZVFHMIN32-NEXT: lh a0, 722(sp) +; ZVFHMIN32-NEXT: lh a1, 466(sp) +; ZVFHMIN32-NEXT: vmv.x.s s10, v21 +; ZVFHMIN32-NEXT: vmv.x.s s11, v27 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 18(sp) -; ZVFHMIN32-NEXT: lh a0, 546(sp) -; ZVFHMIN32-NEXT: lh a1, 290(sp) +; ZVFHMIN32-NEXT: sb a0, 169(sp) +; ZVFHMIN32-NEXT: lh a0, 720(sp) +; ZVFHMIN32-NEXT: lh a1, 464(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN32-NEXT: fmv.h.x fa4, s6 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: sb a0, 168(sp) +; ZVFHMIN32-NEXT: lh a0, 718(sp) +; ZVFHMIN32-NEXT: lh a1, 462(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa3, s7 +; ZVFHMIN32-NEXT: fmv.h.x fa2, s8 +; ZVFHMIN32-NEXT: fmv.h.x fa1, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa0, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa1, fa0 +; ZVFHMIN32-NEXT: fmv.h.x fa1, ra +; ZVFHMIN32-NEXT: sb a0, 167(sp) +; ZVFHMIN32-NEXT: lh a0, 716(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa0, a2 +; ZVFHMIN32-NEXT: lh a1, 460(sp) +; ZVFHMIN32-NEXT: feq.h s5, fa5, fa1 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: feq.h a0, fa4, fa0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s4 +; ZVFHMIN32-NEXT: sb a1, 166(sp) +; ZVFHMIN32-NEXT: lh a1, 714(sp) +; ZVFHMIN32-NEXT: lh a2, 458(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h a3, fa3, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa4, fa3 +; ZVFHMIN32-NEXT: fmv.h.x fa4, s3 +; ZVFHMIN32-NEXT: sb a1, 165(sp) +; ZVFHMIN32-NEXT: lh a1, 712(sp) +; ZVFHMIN32-NEXT: lh a2, 456(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN32-NEXT: feq.h a4, fa2, fa3 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa3, fa2 +; ZVFHMIN32-NEXT: fmv.h.x fa3, s2 +; ZVFHMIN32-NEXT: sb a1, 164(sp) +; ZVFHMIN32-NEXT: lh a1, 710(sp) +; ZVFHMIN32-NEXT: lh a2, 454(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa2, s9 +; ZVFHMIN32-NEXT: feq.h s2, fa5, fa2 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa2 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s10 +; ZVFHMIN32-NEXT: fmv.h.x fa2, s11 +; ZVFHMIN32-NEXT: sb a1, 163(sp) +; ZVFHMIN32-NEXT: lh a1, 708(sp) +; ZVFHMIN32-NEXT: lh a2, 452(sp) +; ZVFHMIN32-NEXT: feq.h s3, fa4, fa5 +; ZVFHMIN32-NEXT: feq.h s4, fa3, fa2 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 162(sp) +; ZVFHMIN32-NEXT: lh a1, 706(sp) +; ZVFHMIN32-NEXT: lh a2, 450(sp) +; ZVFHMIN32-NEXT: sb s4, 129(sp) +; ZVFHMIN32-NEXT: sb s3, 130(sp) 
+; ZVFHMIN32-NEXT: sb s2, 131(sp) +; ZVFHMIN32-NEXT: sb a4, 132(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a3, 133(sp) +; ZVFHMIN32-NEXT: sb a0, 134(sp) +; ZVFHMIN32-NEXT: sb s5, 135(sp) +; ZVFHMIN32-NEXT: sb a1, 161(sp) +; ZVFHMIN32-NEXT: lh a0, 610(sp) +; ZVFHMIN32-NEXT: lh a1, 354(sp) +; ZVFHMIN32-NEXT: vmv.x.s s6, v5 +; ZVFHMIN32-NEXT: vmv.x.s s5, v23 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 17(sp) -; ZVFHMIN32-NEXT: lh a0, 544(sp) -; ZVFHMIN32-NEXT: lh a1, 288(sp) +; ZVFHMIN32-NEXT: sb a0, 241(sp) +; ZVFHMIN32-NEXT: lh a0, 608(sp) +; ZVFHMIN32-NEXT: lh a1, 352(sp) +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 21 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 20 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: vmv.x.s a1, v0 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: sb a0, 240(sp) +; ZVFHMIN32-NEXT: lh a0, 606(sp) +; ZVFHMIN32-NEXT: lh a1, 350(sp) +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 22 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN32-NEXT: sb a0, 239(sp) +; ZVFHMIN32-NEXT: lh a0, 604(sp) +; ZVFHMIN32-NEXT: lh a1, 348(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: sb a0, 238(sp) +; ZVFHMIN32-NEXT: lh a0, 602(sp) +; ZVFHMIN32-NEXT: lh a1, 346(sp) +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: sb a0, 237(sp) +; ZVFHMIN32-NEXT: lh a0, 600(sp) +; ZVFHMIN32-NEXT: lh a1, 344(sp) +; ZVFHMIN32-NEXT: vmv.x.s a3, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: sb a0, 236(sp) +; ZVFHMIN32-NEXT: lh a0, 598(sp) +; ZVFHMIN32-NEXT: lh a1, 342(sp) +; ZVFHMIN32-NEXT: vmv.x.s a4, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: sb a0, 235(sp) +; ZVFHMIN32-NEXT: lh a0, 596(sp) +; ZVFHMIN32-NEXT: lh a1, 340(sp) +; ZVFHMIN32-NEXT: vmv.x.s s8, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: sb a0, 234(sp) +; ZVFHMIN32-NEXT: lh a0, 594(sp) +; ZVFHMIN32-NEXT: lh a1, 338(sp) +; ZVFHMIN32-NEXT: vmv.x.s s9, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: sb a0, 
233(sp) +; ZVFHMIN32-NEXT: lh a0, 592(sp) ; ZVFHMIN32-NEXT: vmv.x.s a1, v8 +; ZVFHMIN32-NEXT: lh t5, 336(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN32-NEXT: vmv.x.s s7, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa2, t5 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a2 +; ZVFHMIN32-NEXT: sb a0, 232(sp) +; ZVFHMIN32-NEXT: lh a0, 590(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa2, a3 +; ZVFHMIN32-NEXT: lh a2, 334(sp) +; ZVFHMIN32-NEXT: feq.h t5, fa5, fa3 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: feq.h t6, fa4, fa2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s6 +; ZVFHMIN32-NEXT: sb a0, 231(sp) +; ZVFHMIN32-NEXT: lh a0, 588(sp) +; ZVFHMIN32-NEXT: lh a2, 332(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN32-NEXT: sb a0, 230(sp) +; ZVFHMIN32-NEXT: lh a0, 586(sp) +; ZVFHMIN32-NEXT: lh a2, 330(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s8 +; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s4 +; ZVFHMIN32-NEXT: sb a0, 229(sp) +; ZVFHMIN32-NEXT: lh a0, 584(sp) +; ZVFHMIN32-NEXT: lh a2, 328(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s9 +; ZVFHMIN32-NEXT: feq.h s4, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s3 +; ZVFHMIN32-NEXT: sb a0, 228(sp) +; ZVFHMIN32-NEXT: lh a0, 582(sp) +; ZVFHMIN32-NEXT: lh a2, 326(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN32-NEXT: sb a1, 0(sp) -; ZVFHMIN32-NEXT: sb a0, 16(sp) -; ZVFHMIN32-NEXT: lh a0, 448(sp) -; ZVFHMIN32-NEXT: lh a1, 192(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s2 +; ZVFHMIN32-NEXT: sb a0, 227(sp) +; ZVFHMIN32-NEXT: lh a0, 580(sp) +; ZVFHMIN32-NEXT: lh a2, 324(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s7 +; ZVFHMIN32-NEXT: feq.h s2, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 226(sp) +; ZVFHMIN32-NEXT: lh a0, 578(sp) +; ZVFHMIN32-NEXT: lh a2, 322(sp) +; ZVFHMIN32-NEXT: sb s2, 193(sp) +; ZVFHMIN32-NEXT: sb a1, 194(sp) +; ZVFHMIN32-NEXT: sb s4, 195(sp) +; ZVFHMIN32-NEXT: sb a4, 196(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a3, 197(sp) +; ZVFHMIN32-NEXT: sb t6, 198(sp) +; ZVFHMIN32-NEXT: sb t5, 199(sp) +; ZVFHMIN32-NEXT: sb a0, 225(sp) +; ZVFHMIN32-NEXT: lh a0, 766(sp) +; ZVFHMIN32-NEXT: lh a1, 510(sp) +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 18 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s2, v8 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a3, 14 +; ZVFHMIN32-NEXT: mul a2, a2, a3 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s t6, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x 
fa4, a1 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 191(sp) +; ZVFHMIN32-NEXT: lh a0, 764(sp) +; ZVFHMIN32-NEXT: lh a1, 508(sp) +; ZVFHMIN32-NEXT: vmv.x.s t5, v6 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 2 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: addi a2, a2, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 96(sp) -; ZVFHMIN32-NEXT: lh a0, 446(sp) -; ZVFHMIN32-NEXT: lh a1, 190(sp) +; ZVFHMIN32-NEXT: sb a0, 190(sp) +; ZVFHMIN32-NEXT: lh a0, 762(sp) +; ZVFHMIN32-NEXT: lh a1, 506(sp) +; ZVFHMIN32-NEXT: csrr a3, vlenb +; ZVFHMIN32-NEXT: slli a3, a3, 3 +; ZVFHMIN32-NEXT: add a3, sp, a3 +; ZVFHMIN32-NEXT: addi a3, a3, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a3, v8 +; ZVFHMIN32-NEXT: csrr a4, vlenb +; ZVFHMIN32-NEXT: li s3, 6 +; ZVFHMIN32-NEXT: mul a4, a4, s3 +; ZVFHMIN32-NEXT: add a4, sp, a4 +; ZVFHMIN32-NEXT: addi a4, a4, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a4, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 95(sp) -; ZVFHMIN32-NEXT: lh a0, 444(sp) -; ZVFHMIN32-NEXT: lh a1, 188(sp) +; ZVFHMIN32-NEXT: sb a0, 189(sp) +; ZVFHMIN32-NEXT: lh a0, 760(sp) +; ZVFHMIN32-NEXT: lh a1, 504(sp) +; ZVFHMIN32-NEXT: csrr s3, vlenb +; ZVFHMIN32-NEXT: li s4, 12 +; ZVFHMIN32-NEXT: mul s3, s3, s4 +; ZVFHMIN32-NEXT: add s3, sp, s3 +; ZVFHMIN32-NEXT: addi s3, s3, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s6, v8 +; ZVFHMIN32-NEXT: csrr s3, vlenb +; ZVFHMIN32-NEXT: li s4, 10 +; ZVFHMIN32-NEXT: mul s3, s3, s4 +; ZVFHMIN32-NEXT: add s3, sp, s3 +; ZVFHMIN32-NEXT: addi s3, s3, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s4, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 94(sp) -; ZVFHMIN32-NEXT: lh a0, 442(sp) -; ZVFHMIN32-NEXT: lh a1, 186(sp) +; ZVFHMIN32-NEXT: sb a0, 188(sp) +; ZVFHMIN32-NEXT: lh a0, 758(sp) +; ZVFHMIN32-NEXT: lh a1, 502(sp) +; ZVFHMIN32-NEXT: csrr s3, vlenb +; ZVFHMIN32-NEXT: slli s3, s3, 4 +; ZVFHMIN32-NEXT: add s3, sp, s3 +; ZVFHMIN32-NEXT: addi s3, s3, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s5, v8 +; ZVFHMIN32-NEXT: vmv.x.s s3, v16 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 93(sp) -; ZVFHMIN32-NEXT: lh a0, 440(sp) -; ZVFHMIN32-NEXT: lh a1, 184(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 +; ZVFHMIN32-NEXT: sb a0, 187(sp) +; ZVFHMIN32-NEXT: lh a0, 756(sp) +; ZVFHMIN32-NEXT: lh a1, 500(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: feq.h t4, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 92(sp) -; ZVFHMIN32-NEXT: lh a0, 438(sp) -; ZVFHMIN32-NEXT: lh a1, 182(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, t3 +; ZVFHMIN32-NEXT: sb a0, 186(sp) +; ZVFHMIN32-NEXT: lh a0, 754(sp) +; ZVFHMIN32-NEXT: lh a1, 498(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: feq.h t3, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; 
ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 91(sp) -; ZVFHMIN32-NEXT: lh a0, 436(sp) -; ZVFHMIN32-NEXT: lh a1, 180(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, t1 +; ZVFHMIN32-NEXT: sb a0, 185(sp) +; ZVFHMIN32-NEXT: lh a0, 752(sp) +; ZVFHMIN32-NEXT: lh a1, 496(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN32-NEXT: feq.h t1, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 90(sp) -; ZVFHMIN32-NEXT: lh a0, 434(sp) -; ZVFHMIN32-NEXT: lh a1, 178(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 +; ZVFHMIN32-NEXT: sb a0, 184(sp) +; ZVFHMIN32-NEXT: lh a0, 750(sp) +; ZVFHMIN32-NEXT: lh a1, 494(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s6 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 89(sp) -; ZVFHMIN32-NEXT: lh a0, 432(sp) -; ZVFHMIN32-NEXT: lh a1, 176(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, t0 +; ZVFHMIN32-NEXT: sb a0, 183(sp) +; ZVFHMIN32-NEXT: lh a0, 748(sp) +; ZVFHMIN32-NEXT: lh a1, 492(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s4 +; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 88(sp) -; ZVFHMIN32-NEXT: lh a0, 430(sp) -; ZVFHMIN32-NEXT: lh a1, 174(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a7 +; ZVFHMIN32-NEXT: sb a0, 182(sp) +; ZVFHMIN32-NEXT: lh a0, 746(sp) +; ZVFHMIN32-NEXT: lh a1, 490(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s5 +; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 87(sp) -; ZVFHMIN32-NEXT: lh a0, 428(sp) -; ZVFHMIN32-NEXT: lh a1, 172(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a6 +; ZVFHMIN32-NEXT: sb a0, 181(sp) +; ZVFHMIN32-NEXT: lh a0, 744(sp) +; ZVFHMIN32-NEXT: lh a1, 488(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, s3 +; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 86(sp) -; ZVFHMIN32-NEXT: lh a0, 426(sp) -; ZVFHMIN32-NEXT: lh a1, 170(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a5 +; ZVFHMIN32-NEXT: addi a1, sp, 848 +; ZVFHMIN32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a1, v8 +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 15 +; ZVFHMIN32-NEXT: vmv.x.s a5, v8 +; ZVFHMIN32-NEXT: sb a0, 180(sp) +; ZVFHMIN32-NEXT: lh a0, 742(sp) +; ZVFHMIN32-NEXT: lh a7, 486(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 179(sp) +; ZVFHMIN32-NEXT: lh a0, 740(sp) +; ZVFHMIN32-NEXT: lh a7, 484(sp) +; ZVFHMIN32-NEXT: sb a2, 140(sp) +; ZVFHMIN32-NEXT: sb t1, 141(sp) +; ZVFHMIN32-NEXT: sb t3, 142(sp) +; ZVFHMIN32-NEXT: sb t4, 143(sp) +; ZVFHMIN32-NEXT: sb a1, 136(sp) +; ZVFHMIN32-NEXT: sb a6, 137(sp) +; ZVFHMIN32-NEXT: sb a4, 138(sp) +; ZVFHMIN32-NEXT: sb a3, 139(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 178(sp) +; ZVFHMIN32-NEXT: lh a0, 638(sp) +; ZVFHMIN32-NEXT: lh a1, 382(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 14 +; ZVFHMIN32-NEXT: vmv.x.s t3, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h 
a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 85(sp) -; ZVFHMIN32-NEXT: lh a0, 424(sp) -; ZVFHMIN32-NEXT: lh a1, 168(sp) +; ZVFHMIN32-NEXT: sb a0, 255(sp) +; ZVFHMIN32-NEXT: lh a0, 636(sp) +; ZVFHMIN32-NEXT: lh a1, 380(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 13 +; ZVFHMIN32-NEXT: vmv.x.s t2, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 84(sp) -; ZVFHMIN32-NEXT: lh a0, 422(sp) -; ZVFHMIN32-NEXT: lh a1, 166(sp) +; ZVFHMIN32-NEXT: sb a0, 254(sp) +; ZVFHMIN32-NEXT: lh a0, 634(sp) +; ZVFHMIN32-NEXT: lh a1, 378(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 12 +; ZVFHMIN32-NEXT: vmv.x.s t1, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 83(sp) -; ZVFHMIN32-NEXT: lh a0, 420(sp) -; ZVFHMIN32-NEXT: lh a1, 164(sp) +; ZVFHMIN32-NEXT: sb a0, 253(sp) +; ZVFHMIN32-NEXT: lh a0, 632(sp) +; ZVFHMIN32-NEXT: lh a1, 376(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 11 +; ZVFHMIN32-NEXT: vmv.x.s t0, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 82(sp) -; ZVFHMIN32-NEXT: lh a0, 418(sp) -; ZVFHMIN32-NEXT: lh a1, 162(sp) +; ZVFHMIN32-NEXT: sb a0, 252(sp) +; ZVFHMIN32-NEXT: lh a0, 630(sp) +; ZVFHMIN32-NEXT: lh a1, 374(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 10 +; ZVFHMIN32-NEXT: vmv.x.s a7, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 81(sp) -; ZVFHMIN32-NEXT: lh a0, 416(sp) -; ZVFHMIN32-NEXT: lh a1, 160(sp) +; ZVFHMIN32-NEXT: sb a0, 251(sp) +; ZVFHMIN32-NEXT: lh a0, 628(sp) +; ZVFHMIN32-NEXT: lh a1, 372(sp) +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 9 +; ZVFHMIN32-NEXT: vmv.x.s a6, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: vmv.x.s a1, v24 +; ZVFHMIN32-NEXT: lw a1, 112(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: vmv.x.s a1, v16 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN32-NEXT: sb a1, 64(sp) -; ZVFHMIN32-NEXT: sb a0, 80(sp) -; ZVFHMIN32-NEXT: lh a0, 610(sp) -; ZVFHMIN32-NEXT: lh a1, 354(sp) +; ZVFHMIN32-NEXT: sb a0, 250(sp) +; ZVFHMIN32-NEXT: lh a0, 626(sp) +; ZVFHMIN32-NEXT: lh a1, 370(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 49(sp) -; ZVFHMIN32-NEXT: lh a0, 608(sp) -; ZVFHMIN32-NEXT: lh a1, 352(sp) +; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: sb a0, 249(sp) +; ZVFHMIN32-NEXT: lh a0, 624(sp) +; ZVFHMIN32-NEXT: lh a1, 368(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 +; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 48(sp) -; ZVFHMIN32-NEXT: lh a0, 606(sp) -; ZVFHMIN32-NEXT: lh a1, 350(sp) +; ZVFHMIN32-NEXT: lw a1, 124(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: sb a0, 248(sp) +; ZVFHMIN32-NEXT: lh a0, 622(sp) +; ZVFHMIN32-NEXT: lh a1, 366(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; 
ZVFHMIN32-NEXT: sb a0, 47(sp) -; ZVFHMIN32-NEXT: lh a1, 604(sp) -; ZVFHMIN32-NEXT: lh a2, 348(sp) -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 7 -; ZVFHMIN32-NEXT: vmv.x.s a0, v10 +; ZVFHMIN32-NEXT: lw a1, 108(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 46(sp) -; ZVFHMIN32-NEXT: lh a2, 602(sp) -; ZVFHMIN32-NEXT: lh a3, 346(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 7 -; ZVFHMIN32-NEXT: vmv.x.s a1, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: sb a2, 45(sp) -; ZVFHMIN32-NEXT: lh a3, 600(sp) -; ZVFHMIN32-NEXT: lh a4, 344(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 6 -; ZVFHMIN32-NEXT: vmv.x.s a2, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN32-NEXT: sb a3, 44(sp) -; ZVFHMIN32-NEXT: lh a4, 598(sp) -; ZVFHMIN32-NEXT: lh a5, 342(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 6 -; ZVFHMIN32-NEXT: vmv.x.s a3, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: sb a4, 43(sp) -; ZVFHMIN32-NEXT: lh a5, 596(sp) -; ZVFHMIN32-NEXT: lh a6, 340(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 5 -; ZVFHMIN32-NEXT: vmv.x.s a4, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a6 -; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 -; ZVFHMIN32-NEXT: sb a5, 42(sp) -; ZVFHMIN32-NEXT: lh a6, 594(sp) -; ZVFHMIN32-NEXT: lh a7, 338(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 5 -; ZVFHMIN32-NEXT: vmv.x.s a5, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN32-NEXT: sb a6, 41(sp) -; ZVFHMIN32-NEXT: lh a7, 592(sp) -; ZVFHMIN32-NEXT: lh t0, 336(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 4 -; ZVFHMIN32-NEXT: vmv.x.s a6, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t0 -; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN32-NEXT: sb a7, 40(sp) -; ZVFHMIN32-NEXT: lh t0, 590(sp) -; ZVFHMIN32-NEXT: lh t1, 334(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 4 -; ZVFHMIN32-NEXT: vmv.x.s a7, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4 -; ZVFHMIN32-NEXT: sb t0, 39(sp) -; ZVFHMIN32-NEXT: lh t1, 588(sp) -; ZVFHMIN32-NEXT: lh t2, 332(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 3 -; ZVFHMIN32-NEXT: vmv.x.s t0, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN32-NEXT: feq.h t1, fa5, fa4 -; ZVFHMIN32-NEXT: sb t1, 38(sp) -; ZVFHMIN32-NEXT: lh t2, 586(sp) -; ZVFHMIN32-NEXT: lh t3, 330(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 3 -; ZVFHMIN32-NEXT: vmv.x.s t1, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN32-NEXT: sb t2, 37(sp) -; ZVFHMIN32-NEXT: lh t2, 584(sp) -; ZVFHMIN32-NEXT: lh t3, 328(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 2 -; ZVFHMIN32-NEXT: vmv.x.s t4, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN32-NEXT: sb t2, 36(sp) -; ZVFHMIN32-NEXT: lh t2, 582(sp) -; ZVFHMIN32-NEXT: lh t3, 326(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 2 -; ZVFHMIN32-NEXT: vmv.x.s t5, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h t2, 
fa5, fa4 -; ZVFHMIN32-NEXT: sb t2, 35(sp) -; ZVFHMIN32-NEXT: lh t2, 580(sp) -; ZVFHMIN32-NEXT: lh t3, 324(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 1 -; ZVFHMIN32-NEXT: vmv.x.s t6, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN32-NEXT: sb t2, 34(sp) -; ZVFHMIN32-NEXT: lh t2, 578(sp) -; ZVFHMIN32-NEXT: lh t3, 322(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 1 -; ZVFHMIN32-NEXT: vmv.x.s s2, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN32-NEXT: sb a2, 5(sp) -; ZVFHMIN32-NEXT: sb a1, 6(sp) -; ZVFHMIN32-NEXT: sb a0, 7(sp) -; ZVFHMIN32-NEXT: sb t2, 33(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s2 -; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN32-NEXT: sb a3, 1(sp) -; ZVFHMIN32-NEXT: sb a2, 2(sp) -; ZVFHMIN32-NEXT: sb a1, 3(sp) -; ZVFHMIN32-NEXT: sb a0, 4(sp) -; ZVFHMIN32-NEXT: lh a0, 482(sp) -; ZVFHMIN32-NEXT: lh a1, 226(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 113(sp) -; ZVFHMIN32-NEXT: lh a0, 480(sp) -; ZVFHMIN32-NEXT: lh a1, 224(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 112(sp) -; ZVFHMIN32-NEXT: lh a0, 478(sp) -; ZVFHMIN32-NEXT: lh a1, 222(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 111(sp) -; ZVFHMIN32-NEXT: lh a1, 476(sp) -; ZVFHMIN32-NEXT: lh a2, 220(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 7 -; ZVFHMIN32-NEXT: vmv.x.s a0, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a1, 110(sp) -; ZVFHMIN32-NEXT: lh a2, 474(sp) -; ZVFHMIN32-NEXT: lh a3, 218(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 7 -; ZVFHMIN32-NEXT: vmv.x.s a1, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: sb a2, 109(sp) -; ZVFHMIN32-NEXT: lh a3, 472(sp) -; ZVFHMIN32-NEXT: lh a4, 216(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 6 -; ZVFHMIN32-NEXT: vmv.x.s a2, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN32-NEXT: sb a3, 108(sp) -; ZVFHMIN32-NEXT: lh a4, 470(sp) -; ZVFHMIN32-NEXT: lh a5, 214(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 6 -; ZVFHMIN32-NEXT: vmv.x.s a3, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: sb a4, 107(sp) -; ZVFHMIN32-NEXT: lh a5, 468(sp) -; ZVFHMIN32-NEXT: lh a6, 212(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 5 -; ZVFHMIN32-NEXT: vmv.x.s a4, v10 -; ZVFHMIN32-NEXT: 
fmv.h.x fa5, a5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a6 -; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 -; ZVFHMIN32-NEXT: sb a5, 106(sp) -; ZVFHMIN32-NEXT: lh a6, 466(sp) -; ZVFHMIN32-NEXT: lh a7, 210(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 5 -; ZVFHMIN32-NEXT: vmv.x.s a5, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN32-NEXT: sb a6, 105(sp) -; ZVFHMIN32-NEXT: lh a7, 464(sp) -; ZVFHMIN32-NEXT: lh t0, 208(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 4 -; ZVFHMIN32-NEXT: vmv.x.s a6, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t0 -; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN32-NEXT: sb a7, 104(sp) -; ZVFHMIN32-NEXT: lh t0, 462(sp) -; ZVFHMIN32-NEXT: lh t1, 206(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 4 -; ZVFHMIN32-NEXT: vmv.x.s a7, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4 -; ZVFHMIN32-NEXT: sb t0, 103(sp) -; ZVFHMIN32-NEXT: lh t1, 460(sp) -; ZVFHMIN32-NEXT: lh t2, 204(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 3 -; ZVFHMIN32-NEXT: vmv.x.s t0, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN32-NEXT: feq.h t1, fa5, fa4 -; ZVFHMIN32-NEXT: sb t1, 102(sp) -; ZVFHMIN32-NEXT: lh t2, 458(sp) -; ZVFHMIN32-NEXT: lh t3, 202(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 3 -; ZVFHMIN32-NEXT: vmv.x.s t1, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN32-NEXT: sb t2, 101(sp) -; ZVFHMIN32-NEXT: lh t2, 456(sp) -; ZVFHMIN32-NEXT: lh t3, 200(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 2 -; ZVFHMIN32-NEXT: vmv.x.s t4, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN32-NEXT: sb t2, 100(sp) -; ZVFHMIN32-NEXT: lh t2, 454(sp) -; ZVFHMIN32-NEXT: lh t3, 198(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 2 -; ZVFHMIN32-NEXT: vmv.x.s t5, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN32-NEXT: sb t2, 99(sp) -; ZVFHMIN32-NEXT: lh t2, 452(sp) -; ZVFHMIN32-NEXT: lh t3, 196(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 1 -; ZVFHMIN32-NEXT: vmv.x.s t6, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN32-NEXT: sb t2, 98(sp) -; ZVFHMIN32-NEXT: lh t2, 450(sp) -; ZVFHMIN32-NEXT: lh t3, 194(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 1 -; ZVFHMIN32-NEXT: vmv.x.s s2, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN32-NEXT: sb a2, 69(sp) -; ZVFHMIN32-NEXT: sb a1, 70(sp) -; ZVFHMIN32-NEXT: sb a0, 71(sp) -; ZVFHMIN32-NEXT: sb t2, 97(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s2 -; 
ZVFHMIN32-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN32-NEXT: sb a3, 65(sp) -; ZVFHMIN32-NEXT: sb a2, 66(sp) -; ZVFHMIN32-NEXT: sb a1, 67(sp) -; ZVFHMIN32-NEXT: sb a0, 68(sp) -; ZVFHMIN32-NEXT: lh a0, 638(sp) -; ZVFHMIN32-NEXT: lh a1, 382(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 63(sp) -; ZVFHMIN32-NEXT: lh a0, 636(sp) -; ZVFHMIN32-NEXT: lh a1, 380(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 62(sp) -; ZVFHMIN32-NEXT: lh a0, 634(sp) -; ZVFHMIN32-NEXT: lh a1, 378(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 61(sp) -; ZVFHMIN32-NEXT: lh a0, 632(sp) -; ZVFHMIN32-NEXT: lh a1, 376(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 60(sp) -; ZVFHMIN32-NEXT: lh a0, 630(sp) -; ZVFHMIN32-NEXT: lh a1, 374(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 59(sp) -; ZVFHMIN32-NEXT: lh a0, 628(sp) -; ZVFHMIN32-NEXT: lh a1, 372(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 58(sp) -; ZVFHMIN32-NEXT: lh a0, 626(sp) -; ZVFHMIN32-NEXT: lh a1, 370(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 57(sp) -; ZVFHMIN32-NEXT: lh a0, 624(sp) -; ZVFHMIN32-NEXT: lh a1, 368(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 56(sp) -; ZVFHMIN32-NEXT: lh a0, 622(sp) -; ZVFHMIN32-NEXT: lh a1, 366(sp) -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v20, v0, 15 -; ZVFHMIN32-NEXT: vslidedown.vi v22, v0, 14 -; ZVFHMIN32-NEXT: vslidedown.vi v26, v0, 13 -; ZVFHMIN32-NEXT: vslidedown.vi v28, v0, 12 -; ZVFHMIN32-NEXT: vslidedown.vi v18, v0, 11 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 10 -; ZVFHMIN32-NEXT: vslidedown.vi v12, v0, 9 -; ZVFHMIN32-NEXT: vslidedown.vi v14, v0, 8 -; ZVFHMIN32-NEXT: vmv.x.s a2, v20 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 55(sp) +; ZVFHMIN32-NEXT: sb a0, 247(sp) ; ZVFHMIN32-NEXT: lh a0, 620(sp) ; ZVFHMIN32-NEXT: lh a1, 364(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 15 -; ZVFHMIN32-NEXT: vmv.x.s a3, v20 -; ZVFHMIN32-NEXT: vmv.x.s a4, v22 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 +; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 54(sp) +; ZVFHMIN32-NEXT: lw a1, 120(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: sb a0, 246(sp) ; ZVFHMIN32-NEXT: lh a0, 618(sp) ; ZVFHMIN32-NEXT: lh a1, 362(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 14 -; ZVFHMIN32-NEXT: vmv.x.s a5, v20 -; ZVFHMIN32-NEXT: vmv.x.s a6, v26 +; ZVFHMIN32-NEXT: fmv.h.x fa4, t0 +; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 53(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, s2 +; ZVFHMIN32-NEXT: sb a0, 245(sp) ; ZVFHMIN32-NEXT: lh a0, 616(sp) ; 
ZVFHMIN32-NEXT: lh a1, 360(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 13 -; ZVFHMIN32-NEXT: vmv.x.s a7, v20 -; ZVFHMIN32-NEXT: vmv.x.s t0, v28 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 52(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, t6 +; ZVFHMIN32-NEXT: sb a0, 244(sp) ; ZVFHMIN32-NEXT: lh a0, 614(sp) ; ZVFHMIN32-NEXT: lh a1, 358(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 12 -; ZVFHMIN32-NEXT: vmv.x.s t1, v20 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a6 +; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 51(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 8 +; ZVFHMIN32-NEXT: vmv.x.s a1, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: sb a0, 243(sp) ; ZVFHMIN32-NEXT: lh a0, 612(sp) ; ZVFHMIN32-NEXT: lh a1, 356(sp) -; ZVFHMIN32-NEXT: vmv.x.s t2, v18 -; ZVFHMIN32-NEXT: vslidedown.vi v18, v8, 11 -; ZVFHMIN32-NEXT: vmv.x.s t3, v18 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 50(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN32-NEXT: sb a3, 12(sp) -; ZVFHMIN32-NEXT: sb a2, 13(sp) -; ZVFHMIN32-NEXT: sb a1, 14(sp) -; ZVFHMIN32-NEXT: sb a0, 15(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN32-NEXT: vmv.x.s a1, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 10 -; ZVFHMIN32-NEXT: vmv.x.s a1, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN32-NEXT: vmv.x.s a2, v12 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 9 -; ZVFHMIN32-NEXT: vmv.x.s a2, v10 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN32-NEXT: vmv.x.s a3, v14 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v8, 8 -; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN32-NEXT: sb a3, 8(sp) -; ZVFHMIN32-NEXT: sb a2, 9(sp) -; ZVFHMIN32-NEXT: sb a1, 10(sp) -; ZVFHMIN32-NEXT: sb a0, 11(sp) -; ZVFHMIN32-NEXT: lh a0, 510(sp) -; ZVFHMIN32-NEXT: lh a1, 254(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 127(sp) -; ZVFHMIN32-NEXT: lh a0, 508(sp) -; ZVFHMIN32-NEXT: lh a1, 252(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 126(sp) -; ZVFHMIN32-NEXT: lh a0, 506(sp) -; ZVFHMIN32-NEXT: lh a1, 250(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 125(sp) -; ZVFHMIN32-NEXT: lh a0, 504(sp) -; ZVFHMIN32-NEXT: lh a1, 248(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, 
fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 124(sp) -; ZVFHMIN32-NEXT: lh a0, 502(sp) -; ZVFHMIN32-NEXT: lh a1, 246(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 123(sp) -; ZVFHMIN32-NEXT: lh a0, 500(sp) -; ZVFHMIN32-NEXT: lh a1, 244(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 122(sp) -; ZVFHMIN32-NEXT: lh a0, 498(sp) -; ZVFHMIN32-NEXT: lh a1, 242(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 15 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 14 -; ZVFHMIN32-NEXT: vslidedown.vi v12, v24, 13 -; ZVFHMIN32-NEXT: vslidedown.vi v14, v24, 12 -; ZVFHMIN32-NEXT: vslidedown.vi v18, v24, 11 -; ZVFHMIN32-NEXT: vslidedown.vi v20, v24, 10 -; ZVFHMIN32-NEXT: vslidedown.vi v22, v24, 9 -; ZVFHMIN32-NEXT: vslidedown.vi v24, v24, 8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 121(sp) -; ZVFHMIN32-NEXT: lh a2, 496(sp) -; ZVFHMIN32-NEXT: lh a3, 240(sp) -; ZVFHMIN32-NEXT: vmv.x.s a0, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 15 -; ZVFHMIN32-NEXT: vmv.x.s a1, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN32-NEXT: sb a5, 204(sp) +; ZVFHMIN32-NEXT: sb a4, 205(sp) +; ZVFHMIN32-NEXT: sb a2, 206(sp) +; ZVFHMIN32-NEXT: sb a3, 207(sp) ; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: sb a2, 120(sp) -; ZVFHMIN32-NEXT: lh a4, 494(sp) -; ZVFHMIN32-NEXT: lh a5, 238(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v10 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 14 -; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: sb a4, 119(sp) -; ZVFHMIN32-NEXT: lh a4, 492(sp) -; ZVFHMIN32-NEXT: lh a5, 236(sp) -; ZVFHMIN32-NEXT: vmv.x.s a6, v12 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 13 -; ZVFHMIN32-NEXT: vmv.x.s a7, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: sb a4, 118(sp) -; ZVFHMIN32-NEXT: lh a4, 490(sp) -; ZVFHMIN32-NEXT: lh a5, 234(sp) -; ZVFHMIN32-NEXT: vmv.x.s t0, v14 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 12 -; ZVFHMIN32-NEXT: vmv.x.s t1, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: sb a4, 117(sp) -; ZVFHMIN32-NEXT: lh a4, 488(sp) -; ZVFHMIN32-NEXT: lh a5, 232(sp) -; ZVFHMIN32-NEXT: vmv.x.s t2, v18 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 11 -; ZVFHMIN32-NEXT: vmv.x.s t3, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: sb a4, 116(sp) -; ZVFHMIN32-NEXT: lh a4, 486(sp) -; ZVFHMIN32-NEXT: lh a5, 230(sp) -; ZVFHMIN32-NEXT: vmv.x.s t4, v20 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 10 -; ZVFHMIN32-NEXT: vmv.x.s t5, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: sb a4, 115(sp) -; ZVFHMIN32-NEXT: lh a4, 484(sp) -; ZVFHMIN32-NEXT: lh a5, 228(sp) -; ZVFHMIN32-NEXT: vmv.x.s t6, v22 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 9 -; ZVFHMIN32-NEXT: vmv.x.s s2, v8 -; ZVFHMIN32-NEXT: vmv.x.s s3, v24 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: sb a4, 114(sp) +; ZVFHMIN32-NEXT: sb a2, 200(sp) +; ZVFHMIN32-NEXT: sb a6, 201(sp) +; 
ZVFHMIN32-NEXT: sb a7, 202(sp) +; ZVFHMIN32-NEXT: sb t0, 203(sp) +; ZVFHMIN32-NEXT: li a2, 128 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN32-NEXT: sb a3, 76(sp) -; ZVFHMIN32-NEXT: sb a2, 77(sp) -; ZVFHMIN32-NEXT: sb a1, 78(sp) -; ZVFHMIN32-NEXT: sb a0, 79(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s2 -; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s3 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 8 -; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN32-NEXT: sb a3, 72(sp) -; ZVFHMIN32-NEXT: sb a2, 73(sp) -; ZVFHMIN32-NEXT: sb a1, 74(sp) -; ZVFHMIN32-NEXT: sb a0, 75(sp) -; ZVFHMIN32-NEXT: li a0, 128 -; ZVFHMIN32-NEXT: mv a1, sp -; ZVFHMIN32-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; ZVFHMIN32-NEXT: vle8.v v8, (a1) +; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: sb a0, 242(sp) +; ZVFHMIN32-NEXT: addi a0, sp, 128 +; ZVFHMIN32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; ZVFHMIN32-NEXT: vle8.v v8, (a0) ; ZVFHMIN32-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN32-NEXT: vmsne.vi v0, v8, 0 -; ZVFHMIN32-NEXT: addi sp, s0, -768 -; ZVFHMIN32-NEXT: .cfi_def_cfa sp, 768 -; ZVFHMIN32-NEXT: lw ra, 764(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s0, 760(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s2, 756(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s3, 752(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, s0, -896 +; ZVFHMIN32-NEXT: .cfi_def_cfa sp, 896 +; ZVFHMIN32-NEXT: lw ra, 892(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s0, 888(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s2, 884(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s3, 880(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s4, 876(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s5, 872(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s6, 868(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s7, 864(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s8, 860(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s9, 856(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s10, 852(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s11, 848(sp) # 4-byte Folded Reload ; ZVFHMIN32-NEXT: .cfi_restore ra ; ZVFHMIN32-NEXT: .cfi_restore s0 ; ZVFHMIN32-NEXT: .cfi_restore s2 ; ZVFHMIN32-NEXT: .cfi_restore s3 -; ZVFHMIN32-NEXT: addi sp, sp, 768 +; ZVFHMIN32-NEXT: .cfi_restore s4 +; ZVFHMIN32-NEXT: .cfi_restore s5 +; ZVFHMIN32-NEXT: .cfi_restore s6 +; ZVFHMIN32-NEXT: .cfi_restore s7 +; ZVFHMIN32-NEXT: .cfi_restore s8 +; ZVFHMIN32-NEXT: .cfi_restore s9 +; ZVFHMIN32-NEXT: .cfi_restore s10 +; ZVFHMIN32-NEXT: .cfi_restore s11 +; ZVFHMIN32-NEXT: addi sp, sp, 896 ; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN32-NEXT: ret ; ; ZVFHMIN64-LABEL: fcmp_oeq_vv_v128f16: ; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -768 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 768 -; ZVFHMIN64-NEXT: sd ra, 760(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd 
s0, 752(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s2, 744(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s3, 736(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: addi sp, sp, -896 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 896 +; ZVFHMIN64-NEXT: sd ra, 888(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s0, 880(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s2, 872(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s3, 864(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s4, 856(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s5, 848(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s6, 840(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s7, 832(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s8, 824(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s9, 816(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s10, 808(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s11, 800(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: .cfi_offset ra, -8 ; ZVFHMIN64-NEXT: .cfi_offset s0, -16 ; ZVFHMIN64-NEXT: .cfi_offset s2, -24 ; ZVFHMIN64-NEXT: .cfi_offset s3, -32 -; ZVFHMIN64-NEXT: addi s0, sp, 768 +; ZVFHMIN64-NEXT: .cfi_offset s4, -40 +; ZVFHMIN64-NEXT: .cfi_offset s5, -48 +; ZVFHMIN64-NEXT: .cfi_offset s6, -56 +; ZVFHMIN64-NEXT: .cfi_offset s7, -64 +; ZVFHMIN64-NEXT: .cfi_offset s8, -72 +; ZVFHMIN64-NEXT: .cfi_offset s9, -80 +; ZVFHMIN64-NEXT: .cfi_offset s10, -88 +; ZVFHMIN64-NEXT: .cfi_offset s11, -96 +; ZVFHMIN64-NEXT: addi s0, sp, 896 ; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: li a2, 30 +; ZVFHMIN64-NEXT: mul a1, a1, a2 +; ZVFHMIN64-NEXT: sub sp, sp, a1 ; ZVFHMIN64-NEXT: andi sp, sp, -128 ; ZVFHMIN64-NEXT: addi a1, a0, 128 ; ZVFHMIN64-NEXT: li a2, 64 +; ZVFHMIN64-NEXT: addi a3, sp, 640 +; ZVFHMIN64-NEXT: addi a4, sp, 384 +; ZVFHMIN64-NEXT: addi a5, sp, 512 ; ZVFHMIN64-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; ZVFHMIN64-NEXT: vle16.v v24, (a1) ; ZVFHMIN64-NEXT: vle16.v v0, (a0) -; ZVFHMIN64-NEXT: addi a0, sp, 512 -; ZVFHMIN64-NEXT: vse16.v v8, (a0) ; ZVFHMIN64-NEXT: addi a0, sp, 256 -; ZVFHMIN64-NEXT: vse16.v v0, (a0) -; ZVFHMIN64-NEXT: addi a0, sp, 384 -; ZVFHMIN64-NEXT: vse16.v v16, (a0) -; ZVFHMIN64-NEXT: addi a0, sp, 128 +; ZVFHMIN64-NEXT: vle16.v v24, (a1) +; ZVFHMIN64-NEXT: vse16.v v8, (a3) +; ZVFHMIN64-NEXT: vse16.v v0, (a4) +; ZVFHMIN64-NEXT: vse16.v v16, (a5) ; ZVFHMIN64-NEXT: vse16.v v24, (a0) +; ZVFHMIN64-NEXT: lh a0, 704(sp) +; ZVFHMIN64-NEXT: lh a1, 448(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 160(sp) +; ZVFHMIN64-NEXT: lh a0, 702(sp) +; ZVFHMIN64-NEXT: lh a1, 446(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 159(sp) +; ZVFHMIN64-NEXT: lh a0, 700(sp) +; ZVFHMIN64-NEXT: lh a1, 444(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 158(sp) +; ZVFHMIN64-NEXT: lh a0, 698(sp) +; ZVFHMIN64-NEXT: lh a1, 442(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 157(sp) +; ZVFHMIN64-NEXT: lh a0, 696(sp) +; ZVFHMIN64-NEXT: lh a1, 440(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 156(sp) +; ZVFHMIN64-NEXT: lh a0, 694(sp) +; ZVFHMIN64-NEXT: lh a1, 438(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, 
a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 155(sp) +; ZVFHMIN64-NEXT: lh a0, 692(sp) +; ZVFHMIN64-NEXT: lh a1, 436(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 154(sp) +; ZVFHMIN64-NEXT: lh a0, 690(sp) +; ZVFHMIN64-NEXT: lh a1, 434(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 153(sp) +; ZVFHMIN64-NEXT: lh a0, 688(sp) +; ZVFHMIN64-NEXT: lh a1, 432(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 152(sp) +; ZVFHMIN64-NEXT: lh a0, 686(sp) +; ZVFHMIN64-NEXT: lh a1, 430(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 151(sp) +; ZVFHMIN64-NEXT: lh a0, 684(sp) +; ZVFHMIN64-NEXT: lh a1, 428(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 150(sp) +; ZVFHMIN64-NEXT: lh a0, 682(sp) +; ZVFHMIN64-NEXT: lh a1, 426(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 149(sp) +; ZVFHMIN64-NEXT: lh a0, 680(sp) +; ZVFHMIN64-NEXT: lh a1, 424(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 148(sp) +; ZVFHMIN64-NEXT: lh a0, 678(sp) +; ZVFHMIN64-NEXT: lh a1, 422(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 147(sp) +; ZVFHMIN64-NEXT: lh a0, 676(sp) +; ZVFHMIN64-NEXT: lh a1, 420(sp) +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 146(sp) +; ZVFHMIN64-NEXT: lh a0, 674(sp) +; ZVFHMIN64-NEXT: lh a1, 418(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: vmv.x.s a2, v0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN64-NEXT: sb a0, 145(sp) +; ZVFHMIN64-NEXT: lh a0, 672(sp) +; ZVFHMIN64-NEXT: lh a1, 416(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a2, 128(sp) +; ZVFHMIN64-NEXT: sb a0, 144(sp) ; ZVFHMIN64-NEXT: lh a0, 576(sp) ; ZVFHMIN64-NEXT: lh a1, 320(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 32(sp) +; ZVFHMIN64-NEXT: sb a0, 224(sp) ; ZVFHMIN64-NEXT: lh a0, 574(sp) ; ZVFHMIN64-NEXT: lh a1, 318(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 31(sp) +; ZVFHMIN64-NEXT: sb a0, 223(sp) ; ZVFHMIN64-NEXT: lh a0, 572(sp) ; ZVFHMIN64-NEXT: lh a1, 316(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 30(sp) +; ZVFHMIN64-NEXT: sb a0, 222(sp) ; ZVFHMIN64-NEXT: lh a0, 570(sp) ; ZVFHMIN64-NEXT: lh a1, 314(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 29(sp) +; ZVFHMIN64-NEXT: sb a0, 221(sp) ; 
ZVFHMIN64-NEXT: lh a0, 568(sp) ; ZVFHMIN64-NEXT: lh a1, 312(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 28(sp) +; ZVFHMIN64-NEXT: sb a0, 220(sp) ; ZVFHMIN64-NEXT: lh a0, 566(sp) ; ZVFHMIN64-NEXT: lh a1, 310(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 27(sp) +; ZVFHMIN64-NEXT: sb a0, 219(sp) ; ZVFHMIN64-NEXT: lh a0, 564(sp) ; ZVFHMIN64-NEXT: lh a1, 308(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 26(sp) +; ZVFHMIN64-NEXT: sb a0, 218(sp) ; ZVFHMIN64-NEXT: lh a0, 562(sp) ; ZVFHMIN64-NEXT: lh a1, 306(sp) +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 7 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 29 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 6 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 28 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 5 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 27 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 4 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 26 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 3 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 25 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 24 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 1 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 23 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v26, v8, 15 +; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 14 +; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 13 +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 12 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 1 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v4, v8, 11 +; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 10 +; ZVFHMIN64-NEXT: vslidedown.vi v30, v8, 9 +; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 8 +; ZVFHMIN64-NEXT: vmv.x.s a4, v16 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 25(sp) +; 
ZVFHMIN64-NEXT: sb a0, 217(sp) ; ZVFHMIN64-NEXT: lh a0, 560(sp) ; ZVFHMIN64-NEXT: lh a1, 304(sp) +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 7 +; ZVFHMIN64-NEXT: vslidedown.vi v31, v16, 6 +; ZVFHMIN64-NEXT: vslidedown.vi v5, v16, 5 +; ZVFHMIN64-NEXT: vslidedown.vi v23, v16, 4 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 3 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 21 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 20 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 1 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 22 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v18, v16, 15 +; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 14 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 13 +; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 12 +; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 11 +; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 10 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 18 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 9 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 14 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 24(sp) +; ZVFHMIN64-NEXT: sb a0, 216(sp) ; ZVFHMIN64-NEXT: lh a0, 558(sp) ; ZVFHMIN64-NEXT: lh a1, 302(sp) +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v13, v0, 7 +; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 6 +; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 5 +; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 4 +; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 3 +; ZVFHMIN64-NEXT: vslidedown.vi v21, v0, 2 +; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 1 +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 15 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 2 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 14 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 13 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 6 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 12 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 12 +; 
ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 11 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 10 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 10 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 4 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 9 +; ZVFHMIN64-NEXT: vslidedown.vi v0, v0, 8 +; ZVFHMIN64-NEXT: addi a2, sp, 800 +; ZVFHMIN64-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s t4, v26 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 23(sp) +; ZVFHMIN64-NEXT: sb a0, 215(sp) ; ZVFHMIN64-NEXT: lh a0, 556(sp) ; ZVFHMIN64-NEXT: lh a1, 300(sp) +; ZVFHMIN64-NEXT: vmv.x.s t3, v20 +; ZVFHMIN64-NEXT: vmv.x.s t1, v28 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 22(sp) +; ZVFHMIN64-NEXT: sb a0, 214(sp) ; ZVFHMIN64-NEXT: lh a0, 554(sp) ; ZVFHMIN64-NEXT: lh a1, 298(sp) +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 1 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t2, v0 +; ZVFHMIN64-NEXT: vmv.x.s t0, v4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 21(sp) +; ZVFHMIN64-NEXT: sb a0, 213(sp) ; ZVFHMIN64-NEXT: lh a0, 552(sp) ; ZVFHMIN64-NEXT: lh a1, 296(sp) +; ZVFHMIN64-NEXT: vmv.x.s a7, v2 +; ZVFHMIN64-NEXT: vmv.x.s a6, v30 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 20(sp) +; ZVFHMIN64-NEXT: sb a0, 212(sp) ; ZVFHMIN64-NEXT: lh a0, 550(sp) ; ZVFHMIN64-NEXT: lh a1, 294(sp) +; ZVFHMIN64-NEXT: vmv.x.s a5, v22 +; ZVFHMIN64-NEXT: vmv.x.s a2, v18 +; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 19(sp) -; ZVFHMIN64-NEXT: lh a0, 548(sp) -; ZVFHMIN64-NEXT: lh a1, 292(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 18(sp) -; ZVFHMIN64-NEXT: lh a0, 546(sp) -; ZVFHMIN64-NEXT: lh a1, 290(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 17(sp) -; ZVFHMIN64-NEXT: lh a0, 544(sp) -; ZVFHMIN64-NEXT: lh a1, 288(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: vmv.x.s a1, v0 +; ZVFHMIN64-NEXT: sb a0, 211(sp) +; ZVFHMIN64-NEXT: lh a1, 548(sp) +; ZVFHMIN64-NEXT: lh t5, 292(sp) +; ZVFHMIN64-NEXT: vmv.x.s a0, v14 +; ZVFHMIN64-NEXT: sd a0, 104(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a0, v8 +; ZVFHMIN64-NEXT: sd a0, 120(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: vmv.x.s a1, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; 
ZVFHMIN64-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN64-NEXT: sb a1, 0(sp) -; ZVFHMIN64-NEXT: sb a0, 16(sp) -; ZVFHMIN64-NEXT: lh a0, 448(sp) -; ZVFHMIN64-NEXT: lh a1, 192(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t5 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 210(sp) +; ZVFHMIN64-NEXT: lh a1, 546(sp) +; ZVFHMIN64-NEXT: lh t5, 290(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN64-NEXT: vmv.x.s a4, v24 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 96(sp) -; ZVFHMIN64-NEXT: lh a0, 446(sp) -; ZVFHMIN64-NEXT: lh a1, 190(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa3, t5 +; ZVFHMIN64-NEXT: feq.h a1, fa4, fa3 +; ZVFHMIN64-NEXT: sb a1, 209(sp) +; ZVFHMIN64-NEXT: lh a1, 544(sp) +; ZVFHMIN64-NEXT: lh t5, 288(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t5 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a4, 192(sp) +; ZVFHMIN64-NEXT: sb a1, 208(sp) +; ZVFHMIN64-NEXT: lh t5, 738(sp) +; ZVFHMIN64-NEXT: lh t6, 482(sp) +; ZVFHMIN64-NEXT: vmv.x.s a0, v12 +; ZVFHMIN64-NEXT: sd a0, 88(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s a0, v10 +; ZVFHMIN64-NEXT: sd a0, 112(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 +; ZVFHMIN64-NEXT: sb t5, 177(sp) +; ZVFHMIN64-NEXT: lh t5, 736(sp) +; ZVFHMIN64-NEXT: lh t6, 480(sp) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a1, 29 +; ZVFHMIN64-NEXT: mul a0, a0, a1 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: lh s5, 800(a0) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a1, 28 +; ZVFHMIN64-NEXT: mul a0, a0, a1 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: lh s6, 800(a0) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 +; ZVFHMIN64-NEXT: sb t5, 176(sp) +; ZVFHMIN64-NEXT: lh t5, 734(sp) +; ZVFHMIN64-NEXT: lh t6, 478(sp) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a1, 27 +; ZVFHMIN64-NEXT: mul a0, a0, a1 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: lh s7, 800(a0) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a1, 26 +; ZVFHMIN64-NEXT: mul a0, a0, a1 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: lh s8, 800(a0) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 +; ZVFHMIN64-NEXT: sb t5, 175(sp) +; ZVFHMIN64-NEXT: lh t5, 732(sp) +; ZVFHMIN64-NEXT: lh t6, 476(sp) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a1, 25 +; ZVFHMIN64-NEXT: mul a0, a0, a1 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: lh s4, 800(a0) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a1, 24 +; ZVFHMIN64-NEXT: mul a0, a0, a1 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: lh s3, 800(a0) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 +; ZVFHMIN64-NEXT: sb t5, 174(sp) +; ZVFHMIN64-NEXT: lh t6, 730(sp) +; ZVFHMIN64-NEXT: lh s9, 474(sp) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a1, 23 +; ZVFHMIN64-NEXT: mul a0, a0, a1 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: lh s2, 800(a0) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t5, v3 +; ZVFHMIN64-NEXT: fmv.h.x fa5, t6 +; ZVFHMIN64-NEXT: fmv.h.x fa4, s9 
+; ZVFHMIN64-NEXT: feq.h t6, fa5, fa4 +; ZVFHMIN64-NEXT: sb t6, 173(sp) +; ZVFHMIN64-NEXT: lh s9, 728(sp) +; ZVFHMIN64-NEXT: lh s10, 472(sp) +; ZVFHMIN64-NEXT: vmv.x.s t6, v31 +; ZVFHMIN64-NEXT: vmv.x.s ra, v13 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN64-NEXT: feq.h s9, fa5, fa4 +; ZVFHMIN64-NEXT: sb s9, 172(sp) +; ZVFHMIN64-NEXT: lh s9, 726(sp) +; ZVFHMIN64-NEXT: lh s10, 470(sp) +; ZVFHMIN64-NEXT: vmv.x.s a2, v29 +; ZVFHMIN64-NEXT: vmv.x.s a3, v11 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 +; ZVFHMIN64-NEXT: feq.h s9, fa5, fa4 +; ZVFHMIN64-NEXT: sb s9, 171(sp) +; ZVFHMIN64-NEXT: lh s10, 724(sp) +; ZVFHMIN64-NEXT: lh s11, 468(sp) +; ZVFHMIN64-NEXT: vmv.x.s a4, v7 +; ZVFHMIN64-NEXT: vmv.x.s s9, v9 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s10 +; ZVFHMIN64-NEXT: fmv.h.x fa4, s11 +; ZVFHMIN64-NEXT: feq.h s10, fa5, fa4 +; ZVFHMIN64-NEXT: sb s10, 170(sp) +; ZVFHMIN64-NEXT: lh a0, 722(sp) +; ZVFHMIN64-NEXT: lh a1, 466(sp) +; ZVFHMIN64-NEXT: vmv.x.s s10, v21 +; ZVFHMIN64-NEXT: vmv.x.s s11, v27 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 95(sp) -; ZVFHMIN64-NEXT: lh a0, 444(sp) -; ZVFHMIN64-NEXT: lh a1, 188(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: sb a0, 169(sp) +; ZVFHMIN64-NEXT: lh a0, 720(sp) +; ZVFHMIN64-NEXT: lh a1, 464(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN64-NEXT: fmv.h.x fa4, s6 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: sb a0, 168(sp) +; ZVFHMIN64-NEXT: lh a0, 718(sp) +; ZVFHMIN64-NEXT: lh a1, 462(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa3, s7 +; ZVFHMIN64-NEXT: fmv.h.x fa2, s8 +; ZVFHMIN64-NEXT: fmv.h.x fa1, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa0, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa1, fa0 +; ZVFHMIN64-NEXT: fmv.h.x fa1, ra +; ZVFHMIN64-NEXT: sb a0, 167(sp) +; ZVFHMIN64-NEXT: lh a0, 716(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa0, a2 +; ZVFHMIN64-NEXT: lh a1, 460(sp) +; ZVFHMIN64-NEXT: feq.h s5, fa5, fa1 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: feq.h a0, fa4, fa0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 94(sp) -; ZVFHMIN64-NEXT: lh a0, 442(sp) -; ZVFHMIN64-NEXT: lh a1, 186(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s4 +; ZVFHMIN64-NEXT: sb a1, 166(sp) +; ZVFHMIN64-NEXT: lh a1, 714(sp) +; ZVFHMIN64-NEXT: lh a2, 458(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h a3, fa3, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa4, fa3 +; ZVFHMIN64-NEXT: fmv.h.x fa4, s3 +; ZVFHMIN64-NEXT: sb a1, 165(sp) +; ZVFHMIN64-NEXT: lh a1, 712(sp) +; ZVFHMIN64-NEXT: lh a2, 456(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa3, a4 +; ZVFHMIN64-NEXT: feq.h a4, fa2, fa3 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa3, fa2 +; ZVFHMIN64-NEXT: fmv.h.x fa3, s2 +; ZVFHMIN64-NEXT: sb a1, 164(sp) +; ZVFHMIN64-NEXT: lh a1, 710(sp) +; ZVFHMIN64-NEXT: lh a2, 454(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa2, s9 +; ZVFHMIN64-NEXT: feq.h s2, fa5, fa2 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa2 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s10 +; ZVFHMIN64-NEXT: fmv.h.x fa2, s11 +; ZVFHMIN64-NEXT: sb a1, 163(sp) +; ZVFHMIN64-NEXT: lh a1, 708(sp) +; ZVFHMIN64-NEXT: lh a2, 452(sp) +; ZVFHMIN64-NEXT: feq.h s3, fa4, fa5 +; ZVFHMIN64-NEXT: feq.h s4, fa3, 
fa2 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 162(sp) +; ZVFHMIN64-NEXT: lh a1, 706(sp) +; ZVFHMIN64-NEXT: lh a2, 450(sp) +; ZVFHMIN64-NEXT: sb s4, 129(sp) +; ZVFHMIN64-NEXT: sb s3, 130(sp) +; ZVFHMIN64-NEXT: sb s2, 131(sp) +; ZVFHMIN64-NEXT: sb a4, 132(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a3, 133(sp) +; ZVFHMIN64-NEXT: sb a0, 134(sp) +; ZVFHMIN64-NEXT: sb s5, 135(sp) +; ZVFHMIN64-NEXT: sb a1, 161(sp) +; ZVFHMIN64-NEXT: lh a0, 610(sp) +; ZVFHMIN64-NEXT: lh a1, 354(sp) +; ZVFHMIN64-NEXT: vmv.x.s s6, v5 +; ZVFHMIN64-NEXT: vmv.x.s s5, v23 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 93(sp) -; ZVFHMIN64-NEXT: lh a0, 440(sp) -; ZVFHMIN64-NEXT: lh a1, 184(sp) +; ZVFHMIN64-NEXT: sb a0, 241(sp) +; ZVFHMIN64-NEXT: lh a0, 608(sp) +; ZVFHMIN64-NEXT: lh a1, 352(sp) +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 21 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 20 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 92(sp) -; ZVFHMIN64-NEXT: lh a0, 438(sp) -; ZVFHMIN64-NEXT: lh a1, 182(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: sb a0, 240(sp) +; ZVFHMIN64-NEXT: lh a0, 606(sp) +; ZVFHMIN64-NEXT: lh a1, 350(sp) +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 22 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 +; ZVFHMIN64-NEXT: sb a0, 239(sp) +; ZVFHMIN64-NEXT: lh a0, 604(sp) +; ZVFHMIN64-NEXT: lh a1, 348(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: sb a0, 238(sp) +; ZVFHMIN64-NEXT: lh a0, 602(sp) +; ZVFHMIN64-NEXT: lh a1, 346(sp) +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: sb a0, 237(sp) +; ZVFHMIN64-NEXT: lh a0, 600(sp) +; ZVFHMIN64-NEXT: lh a1, 344(sp) +; ZVFHMIN64-NEXT: vmv.x.s a3, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: sb a0, 236(sp) +; ZVFHMIN64-NEXT: lh a0, 598(sp) +; ZVFHMIN64-NEXT: lh a1, 342(sp) +; ZVFHMIN64-NEXT: vmv.x.s a4, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: sb a0, 235(sp) +; ZVFHMIN64-NEXT: lh a0, 596(sp) +; ZVFHMIN64-NEXT: lh a1, 340(sp) +; ZVFHMIN64-NEXT: vmv.x.s s8, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 +; 
ZVFHMIN64-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: sb a0, 234(sp) +; ZVFHMIN64-NEXT: lh a0, 594(sp) +; ZVFHMIN64-NEXT: lh a1, 338(sp) +; ZVFHMIN64-NEXT: vmv.x.s s9, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: sb a0, 233(sp) +; ZVFHMIN64-NEXT: lh a0, 592(sp) +; ZVFHMIN64-NEXT: vmv.x.s a1, v8 +; ZVFHMIN64-NEXT: lh t5, 336(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 +; ZVFHMIN64-NEXT: vmv.x.s s7, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa2, t5 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a2 +; ZVFHMIN64-NEXT: sb a0, 232(sp) +; ZVFHMIN64-NEXT: lh a0, 590(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa2, a3 +; ZVFHMIN64-NEXT: lh a2, 334(sp) +; ZVFHMIN64-NEXT: feq.h t5, fa5, fa3 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: feq.h t6, fa4, fa2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 91(sp) -; ZVFHMIN64-NEXT: lh a0, 436(sp) -; ZVFHMIN64-NEXT: lh a1, 180(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, s6 +; ZVFHMIN64-NEXT: sb a0, 231(sp) +; ZVFHMIN64-NEXT: lh a0, 588(sp) +; ZVFHMIN64-NEXT: lh a2, 332(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 +; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 90(sp) -; ZVFHMIN64-NEXT: lh a0, 434(sp) -; ZVFHMIN64-NEXT: lh a1, 178(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, s5 +; ZVFHMIN64-NEXT: sb a0, 230(sp) +; ZVFHMIN64-NEXT: lh a0, 586(sp) +; ZVFHMIN64-NEXT: lh a2, 330(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s8 +; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 89(sp) -; ZVFHMIN64-NEXT: lh a0, 432(sp) -; ZVFHMIN64-NEXT: lh a1, 176(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, s4 +; ZVFHMIN64-NEXT: sb a0, 229(sp) +; ZVFHMIN64-NEXT: lh a0, 584(sp) +; ZVFHMIN64-NEXT: lh a2, 328(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s9 +; ZVFHMIN64-NEXT: feq.h s4, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 88(sp) -; ZVFHMIN64-NEXT: lh a0, 430(sp) -; ZVFHMIN64-NEXT: lh a1, 174(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s3 +; ZVFHMIN64-NEXT: sb a0, 228(sp) +; ZVFHMIN64-NEXT: lh a0, 582(sp) +; ZVFHMIN64-NEXT: lh a2, 326(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 87(sp) -; ZVFHMIN64-NEXT: lh a0, 428(sp) -; ZVFHMIN64-NEXT: lh a1, 172(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 86(sp) -; ZVFHMIN64-NEXT: lh a0, 426(sp) -; ZVFHMIN64-NEXT: lh a1, 170(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, s2 +; ZVFHMIN64-NEXT: sb a0, 227(sp) +; ZVFHMIN64-NEXT: lh a0, 580(sp) +; ZVFHMIN64-NEXT: lh a2, 324(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s7 +; ZVFHMIN64-NEXT: feq.h s2, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 85(sp) -; ZVFHMIN64-NEXT: lh a0, 424(sp) -; ZVFHMIN64-NEXT: 
lh a1, 168(sp) +; ZVFHMIN64-NEXT: sb a0, 226(sp) +; ZVFHMIN64-NEXT: lh a0, 578(sp) +; ZVFHMIN64-NEXT: lh a2, 322(sp) +; ZVFHMIN64-NEXT: sb s2, 193(sp) +; ZVFHMIN64-NEXT: sb a1, 194(sp) +; ZVFHMIN64-NEXT: sb s4, 195(sp) +; ZVFHMIN64-NEXT: sb a4, 196(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 84(sp) -; ZVFHMIN64-NEXT: lh a0, 422(sp) -; ZVFHMIN64-NEXT: lh a1, 166(sp) +; ZVFHMIN64-NEXT: sb a3, 197(sp) +; ZVFHMIN64-NEXT: sb t6, 198(sp) +; ZVFHMIN64-NEXT: sb t5, 199(sp) +; ZVFHMIN64-NEXT: sb a0, 225(sp) +; ZVFHMIN64-NEXT: lh a0, 766(sp) +; ZVFHMIN64-NEXT: lh a1, 510(sp) +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 18 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s2, v8 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a3, 14 +; ZVFHMIN64-NEXT: mul a2, a2, a3 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s t6, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 83(sp) -; ZVFHMIN64-NEXT: lh a0, 420(sp) -; ZVFHMIN64-NEXT: lh a1, 164(sp) +; ZVFHMIN64-NEXT: sb a0, 191(sp) +; ZVFHMIN64-NEXT: lh a0, 764(sp) +; ZVFHMIN64-NEXT: lh a1, 508(sp) +; ZVFHMIN64-NEXT: vmv.x.s t5, v6 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 2 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: addi a2, a2, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 82(sp) -; ZVFHMIN64-NEXT: lh a0, 418(sp) -; ZVFHMIN64-NEXT: lh a1, 162(sp) +; ZVFHMIN64-NEXT: sb a0, 190(sp) +; ZVFHMIN64-NEXT: lh a0, 762(sp) +; ZVFHMIN64-NEXT: lh a1, 506(sp) +; ZVFHMIN64-NEXT: csrr a3, vlenb +; ZVFHMIN64-NEXT: slli a3, a3, 3 +; ZVFHMIN64-NEXT: add a3, sp, a3 +; ZVFHMIN64-NEXT: addi a3, a3, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a3, v8 +; ZVFHMIN64-NEXT: csrr a4, vlenb +; ZVFHMIN64-NEXT: li s3, 6 +; ZVFHMIN64-NEXT: mul a4, a4, s3 +; ZVFHMIN64-NEXT: add a4, sp, a4 +; ZVFHMIN64-NEXT: addi a4, a4, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a4, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 81(sp) -; ZVFHMIN64-NEXT: lh a0, 416(sp) -; ZVFHMIN64-NEXT: lh a1, 160(sp) +; ZVFHMIN64-NEXT: sb a0, 189(sp) +; ZVFHMIN64-NEXT: lh a0, 760(sp) +; ZVFHMIN64-NEXT: lh a1, 504(sp) +; ZVFHMIN64-NEXT: csrr s3, vlenb +; ZVFHMIN64-NEXT: li s4, 12 +; ZVFHMIN64-NEXT: mul s3, s3, s4 +; ZVFHMIN64-NEXT: add s3, sp, s3 +; ZVFHMIN64-NEXT: addi s3, s3, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s6, v8 +; ZVFHMIN64-NEXT: csrr s3, vlenb +; ZVFHMIN64-NEXT: li s4, 10 +; ZVFHMIN64-NEXT: mul s3, s3, s4 +; ZVFHMIN64-NEXT: add s3, sp, s3 +; ZVFHMIN64-NEXT: addi s3, s3, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s4, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, 
fa4 -; ZVFHMIN64-NEXT: vmv.x.s a1, v24 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: vmv.x.s a1, v16 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN64-NEXT: sb a1, 64(sp) -; ZVFHMIN64-NEXT: sb a0, 80(sp) -; ZVFHMIN64-NEXT: lh a0, 610(sp) -; ZVFHMIN64-NEXT: lh a1, 354(sp) +; ZVFHMIN64-NEXT: sb a0, 188(sp) +; ZVFHMIN64-NEXT: lh a0, 758(sp) +; ZVFHMIN64-NEXT: lh a1, 502(sp) +; ZVFHMIN64-NEXT: csrr s3, vlenb +; ZVFHMIN64-NEXT: slli s3, s3, 4 +; ZVFHMIN64-NEXT: add s3, sp, s3 +; ZVFHMIN64-NEXT: addi s3, s3, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s5, v8 +; ZVFHMIN64-NEXT: vmv.x.s s3, v16 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 49(sp) -; ZVFHMIN64-NEXT: lh a0, 608(sp) -; ZVFHMIN64-NEXT: lh a1, 352(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 +; ZVFHMIN64-NEXT: sb a0, 187(sp) +; ZVFHMIN64-NEXT: lh a0, 756(sp) +; ZVFHMIN64-NEXT: lh a1, 500(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: feq.h t4, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 48(sp) -; ZVFHMIN64-NEXT: lh a0, 606(sp) -; ZVFHMIN64-NEXT: lh a1, 350(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, t3 +; ZVFHMIN64-NEXT: sb a0, 186(sp) +; ZVFHMIN64-NEXT: lh a0, 754(sp) +; ZVFHMIN64-NEXT: lh a1, 498(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: feq.h t3, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 47(sp) -; ZVFHMIN64-NEXT: lh a1, 604(sp) -; ZVFHMIN64-NEXT: lh a2, 348(sp) -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 7 -; ZVFHMIN64-NEXT: vmv.x.s a0, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 46(sp) -; ZVFHMIN64-NEXT: lh a2, 602(sp) -; ZVFHMIN64-NEXT: lh a3, 346(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 7 -; ZVFHMIN64-NEXT: vmv.x.s a1, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: sb a2, 45(sp) -; ZVFHMIN64-NEXT: lh a3, 600(sp) -; ZVFHMIN64-NEXT: lh a4, 344(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 6 -; ZVFHMIN64-NEXT: vmv.x.s a2, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN64-NEXT: sb a3, 44(sp) -; ZVFHMIN64-NEXT: lh a4, 598(sp) -; ZVFHMIN64-NEXT: lh a5, 342(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 6 -; ZVFHMIN64-NEXT: vmv.x.s a3, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: sb a4, 43(sp) -; ZVFHMIN64-NEXT: lh a5, 596(sp) -; ZVFHMIN64-NEXT: lh a6, 340(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 5 -; ZVFHMIN64-NEXT: vmv.x.s a4, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a6 -; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 -; ZVFHMIN64-NEXT: sb a5, 42(sp) -; ZVFHMIN64-NEXT: lh a6, 594(sp) -; ZVFHMIN64-NEXT: lh a7, 338(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 5 -; ZVFHMIN64-NEXT: vmv.x.s a5, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN64-NEXT: sb a6, 41(sp) -; ZVFHMIN64-NEXT: lh a7, 592(sp) -; ZVFHMIN64-NEXT: lh t0, 336(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 4 -; 
ZVFHMIN64-NEXT: vmv.x.s a6, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t0 -; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN64-NEXT: sb a7, 40(sp) -; ZVFHMIN64-NEXT: lh t0, 590(sp) -; ZVFHMIN64-NEXT: lh t1, 334(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 4 -; ZVFHMIN64-NEXT: vmv.x.s a7, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4 -; ZVFHMIN64-NEXT: sb t0, 39(sp) -; ZVFHMIN64-NEXT: lh t1, 588(sp) -; ZVFHMIN64-NEXT: lh t2, 332(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 3 -; ZVFHMIN64-NEXT: vmv.x.s t0, v10 ; ZVFHMIN64-NEXT: fmv.h.x fa5, t1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN64-NEXT: sb a0, 185(sp) +; ZVFHMIN64-NEXT: lh a0, 752(sp) +; ZVFHMIN64-NEXT: lh a1, 496(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 ; ZVFHMIN64-NEXT: feq.h t1, fa5, fa4 -; ZVFHMIN64-NEXT: sb t1, 38(sp) -; ZVFHMIN64-NEXT: lh t2, 586(sp) -; ZVFHMIN64-NEXT: lh t3, 330(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 3 -; ZVFHMIN64-NEXT: vmv.x.s t1, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN64-NEXT: sb t2, 37(sp) -; ZVFHMIN64-NEXT: lh t2, 584(sp) -; ZVFHMIN64-NEXT: lh t3, 328(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 2 -; ZVFHMIN64-NEXT: vmv.x.s t4, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN64-NEXT: sb t2, 36(sp) -; ZVFHMIN64-NEXT: lh t2, 582(sp) -; ZVFHMIN64-NEXT: lh t3, 326(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 2 -; ZVFHMIN64-NEXT: vmv.x.s t5, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN64-NEXT: sb t2, 35(sp) -; ZVFHMIN64-NEXT: lh t2, 580(sp) -; ZVFHMIN64-NEXT: lh t3, 324(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 1 -; ZVFHMIN64-NEXT: vmv.x.s t6, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN64-NEXT: sb t2, 34(sp) -; ZVFHMIN64-NEXT: lh t2, 578(sp) -; ZVFHMIN64-NEXT: lh t3, 322(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 1 -; ZVFHMIN64-NEXT: vmv.x.s s2, v10 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 184(sp) +; ZVFHMIN64-NEXT: lh a0, 750(sp) +; ZVFHMIN64-NEXT: lh a1, 494(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s6 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN64-NEXT: sb a2, 5(sp) -; ZVFHMIN64-NEXT: sb a1, 6(sp) -; ZVFHMIN64-NEXT: sb a0, 7(sp) -; ZVFHMIN64-NEXT: sb t2, 33(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s2 -; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN64-NEXT: sb a3, 1(sp) -; ZVFHMIN64-NEXT: sb a2, 
2(sp) -; ZVFHMIN64-NEXT: sb a1, 3(sp) -; ZVFHMIN64-NEXT: sb a0, 4(sp) -; ZVFHMIN64-NEXT: lh a0, 482(sp) -; ZVFHMIN64-NEXT: lh a1, 226(sp) +; ZVFHMIN64-NEXT: sb a0, 183(sp) +; ZVFHMIN64-NEXT: lh a0, 748(sp) +; ZVFHMIN64-NEXT: lh a1, 492(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s4 +; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 113(sp) -; ZVFHMIN64-NEXT: lh a0, 480(sp) -; ZVFHMIN64-NEXT: lh a1, 224(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a7 +; ZVFHMIN64-NEXT: sb a0, 182(sp) +; ZVFHMIN64-NEXT: lh a0, 746(sp) +; ZVFHMIN64-NEXT: lh a1, 490(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s5 +; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 112(sp) -; ZVFHMIN64-NEXT: lh a0, 478(sp) -; ZVFHMIN64-NEXT: lh a1, 222(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a6 +; ZVFHMIN64-NEXT: sb a0, 181(sp) +; ZVFHMIN64-NEXT: lh a0, 744(sp) +; ZVFHMIN64-NEXT: lh a1, 488(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, s3 +; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 111(sp) -; ZVFHMIN64-NEXT: lh a1, 476(sp) -; ZVFHMIN64-NEXT: lh a2, 220(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 7 -; ZVFHMIN64-NEXT: vmv.x.s a0, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a1, 110(sp) -; ZVFHMIN64-NEXT: lh a2, 474(sp) -; ZVFHMIN64-NEXT: lh a3, 218(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 7 -; ZVFHMIN64-NEXT: vmv.x.s a1, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: sb a2, 109(sp) -; ZVFHMIN64-NEXT: lh a3, 472(sp) -; ZVFHMIN64-NEXT: lh a4, 216(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 6 -; ZVFHMIN64-NEXT: vmv.x.s a2, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN64-NEXT: sb a3, 108(sp) -; ZVFHMIN64-NEXT: lh a4, 470(sp) -; ZVFHMIN64-NEXT: lh a5, 214(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 6 -; ZVFHMIN64-NEXT: vmv.x.s a3, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: sb a4, 107(sp) -; ZVFHMIN64-NEXT: lh a5, 468(sp) -; ZVFHMIN64-NEXT: lh a6, 212(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 5 -; ZVFHMIN64-NEXT: vmv.x.s a4, v10 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a6 -; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 -; ZVFHMIN64-NEXT: sb a5, 106(sp) -; ZVFHMIN64-NEXT: lh a6, 466(sp) -; ZVFHMIN64-NEXT: lh a7, 210(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 5 -; ZVFHMIN64-NEXT: vmv.x.s a5, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a6 +; ZVFHMIN64-NEXT: addi a1, sp, 800 +; ZVFHMIN64-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a1, v8 +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 15 +; ZVFHMIN64-NEXT: vmv.x.s a5, v8 +; ZVFHMIN64-NEXT: sb a0, 180(sp) +; ZVFHMIN64-NEXT: lh a0, 742(sp) +; ZVFHMIN64-NEXT: lh a7, 486(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN64-NEXT: sb a6, 105(sp) -; ZVFHMIN64-NEXT: lh a7, 464(sp) -; ZVFHMIN64-NEXT: 
lh t0, 208(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 4 -; ZVFHMIN64-NEXT: vmv.x.s a6, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t0 -; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4 -; ZVFHMIN64-NEXT: sb a7, 104(sp) -; ZVFHMIN64-NEXT: lh t0, 462(sp) -; ZVFHMIN64-NEXT: lh t1, 206(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 4 -; ZVFHMIN64-NEXT: vmv.x.s a7, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4 -; ZVFHMIN64-NEXT: sb t0, 103(sp) -; ZVFHMIN64-NEXT: lh t1, 460(sp) -; ZVFHMIN64-NEXT: lh t2, 204(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 3 -; ZVFHMIN64-NEXT: vmv.x.s t0, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN64-NEXT: feq.h t1, fa5, fa4 -; ZVFHMIN64-NEXT: sb t1, 102(sp) -; ZVFHMIN64-NEXT: lh t2, 458(sp) -; ZVFHMIN64-NEXT: lh t3, 202(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 3 -; ZVFHMIN64-NEXT: vmv.x.s t1, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN64-NEXT: sb t2, 101(sp) -; ZVFHMIN64-NEXT: lh t2, 456(sp) -; ZVFHMIN64-NEXT: lh t3, 200(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 2 -; ZVFHMIN64-NEXT: vmv.x.s t4, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN64-NEXT: sb t2, 100(sp) -; ZVFHMIN64-NEXT: lh t2, 454(sp) -; ZVFHMIN64-NEXT: lh t3, 198(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 2 -; ZVFHMIN64-NEXT: vmv.x.s t5, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN64-NEXT: sb t2, 99(sp) -; ZVFHMIN64-NEXT: lh t2, 452(sp) -; ZVFHMIN64-NEXT: lh t3, 196(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 1 -; ZVFHMIN64-NEXT: vmv.x.s t6, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4 -; ZVFHMIN64-NEXT: sb t2, 98(sp) -; ZVFHMIN64-NEXT: lh t2, 450(sp) -; ZVFHMIN64-NEXT: lh t3, 194(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 1 -; ZVFHMIN64-NEXT: vmv.x.s s2, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 179(sp) +; ZVFHMIN64-NEXT: lh a0, 740(sp) +; ZVFHMIN64-NEXT: lh a7, 484(sp) +; ZVFHMIN64-NEXT: sb a2, 140(sp) +; ZVFHMIN64-NEXT: sb t1, 141(sp) +; ZVFHMIN64-NEXT: sb t3, 142(sp) +; ZVFHMIN64-NEXT: sb t4, 143(sp) +; ZVFHMIN64-NEXT: sb a1, 136(sp) +; ZVFHMIN64-NEXT: sb a6, 137(sp) +; ZVFHMIN64-NEXT: sb a4, 138(sp) +; ZVFHMIN64-NEXT: sb a3, 139(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN64-NEXT: sb a2, 69(sp) -; ZVFHMIN64-NEXT: sb a1, 70(sp) -; ZVFHMIN64-NEXT: sb a0, 71(sp) -; ZVFHMIN64-NEXT: sb t2, 97(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a6 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s2 -; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN64-NEXT: sb a3, 
65(sp) -; ZVFHMIN64-NEXT: sb a2, 66(sp) -; ZVFHMIN64-NEXT: sb a1, 67(sp) -; ZVFHMIN64-NEXT: sb a0, 68(sp) +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 178(sp) ; ZVFHMIN64-NEXT: lh a0, 638(sp) ; ZVFHMIN64-NEXT: lh a1, 382(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 14 +; ZVFHMIN64-NEXT: vmv.x.s t3, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 63(sp) +; ZVFHMIN64-NEXT: sb a0, 255(sp) ; ZVFHMIN64-NEXT: lh a0, 636(sp) ; ZVFHMIN64-NEXT: lh a1, 380(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 13 +; ZVFHMIN64-NEXT: vmv.x.s t2, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 62(sp) +; ZVFHMIN64-NEXT: sb a0, 254(sp) ; ZVFHMIN64-NEXT: lh a0, 634(sp) ; ZVFHMIN64-NEXT: lh a1, 378(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 12 +; ZVFHMIN64-NEXT: vmv.x.s t1, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 61(sp) +; ZVFHMIN64-NEXT: sb a0, 253(sp) ; ZVFHMIN64-NEXT: lh a0, 632(sp) ; ZVFHMIN64-NEXT: lh a1, 376(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 11 +; ZVFHMIN64-NEXT: vmv.x.s t0, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 60(sp) +; ZVFHMIN64-NEXT: sb a0, 252(sp) ; ZVFHMIN64-NEXT: lh a0, 630(sp) ; ZVFHMIN64-NEXT: lh a1, 374(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 10 +; ZVFHMIN64-NEXT: vmv.x.s a7, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 59(sp) +; ZVFHMIN64-NEXT: sb a0, 251(sp) ; ZVFHMIN64-NEXT: lh a0, 628(sp) ; ZVFHMIN64-NEXT: lh a1, 372(sp) +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 9 +; ZVFHMIN64-NEXT: vmv.x.s a6, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 58(sp) +; ZVFHMIN64-NEXT: ld a1, 96(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: sb a0, 250(sp) ; ZVFHMIN64-NEXT: lh a0, 626(sp) ; ZVFHMIN64-NEXT: lh a1, 370(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 +; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 57(sp) +; ZVFHMIN64-NEXT: ld a1, 104(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: sb a0, 249(sp) ; ZVFHMIN64-NEXT: lh a0, 624(sp) ; ZVFHMIN64-NEXT: lh a1, 368(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 +; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 56(sp) +; ZVFHMIN64-NEXT: ld a1, 120(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: sb a0, 248(sp) ; ZVFHMIN64-NEXT: lh a0, 622(sp) ; ZVFHMIN64-NEXT: lh a1, 366(sp) -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v20, v0, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v22, v0, 14 -; ZVFHMIN64-NEXT: vslidedown.vi v26, v0, 13 -; ZVFHMIN64-NEXT: vslidedown.vi v28, v0, 12 -; ZVFHMIN64-NEXT: vslidedown.vi v18, v0, 11 -; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 10 -; ZVFHMIN64-NEXT: vslidedown.vi v12, v0, 9 -; ZVFHMIN64-NEXT: vslidedown.vi v14, v0, 8 -; ZVFHMIN64-NEXT: vmv.x.s a2, v20 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 +; ZVFHMIN64-NEXT: feq.h a4, 
fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 55(sp) +; ZVFHMIN64-NEXT: ld a1, 88(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: sb a0, 247(sp) ; ZVFHMIN64-NEXT: lh a0, 620(sp) ; ZVFHMIN64-NEXT: lh a1, 364(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 15 -; ZVFHMIN64-NEXT: vmv.x.s a3, v20 -; ZVFHMIN64-NEXT: vmv.x.s a4, v22 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 +; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 54(sp) +; ZVFHMIN64-NEXT: ld a1, 112(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: sb a0, 246(sp) ; ZVFHMIN64-NEXT: lh a0, 618(sp) ; ZVFHMIN64-NEXT: lh a1, 362(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 14 -; ZVFHMIN64-NEXT: vmv.x.s a5, v20 -; ZVFHMIN64-NEXT: vmv.x.s a6, v26 +; ZVFHMIN64-NEXT: fmv.h.x fa4, t0 +; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 53(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, s2 +; ZVFHMIN64-NEXT: sb a0, 245(sp) ; ZVFHMIN64-NEXT: lh a0, 616(sp) ; ZVFHMIN64-NEXT: lh a1, 360(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 13 -; ZVFHMIN64-NEXT: vmv.x.s a7, v20 -; ZVFHMIN64-NEXT: vmv.x.s t0, v28 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 52(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, t6 +; ZVFHMIN64-NEXT: sb a0, 244(sp) ; ZVFHMIN64-NEXT: lh a0, 614(sp) ; ZVFHMIN64-NEXT: lh a1, 358(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 12 -; ZVFHMIN64-NEXT: vmv.x.s t1, v20 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a6 +; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 51(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 8 +; ZVFHMIN64-NEXT: vmv.x.s a1, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: sb a0, 243(sp) ; ZVFHMIN64-NEXT: lh a0, 612(sp) ; ZVFHMIN64-NEXT: lh a1, 356(sp) -; ZVFHMIN64-NEXT: vmv.x.s t2, v18 -; ZVFHMIN64-NEXT: vslidedown.vi v18, v8, 11 -; ZVFHMIN64-NEXT: vmv.x.s t3, v18 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 50(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN64-NEXT: sb a3, 12(sp) -; ZVFHMIN64-NEXT: sb a2, 13(sp) -; ZVFHMIN64-NEXT: sb a1, 14(sp) -; ZVFHMIN64-NEXT: sb a0, 15(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN64-NEXT: vmv.x.s a1, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 10 -; ZVFHMIN64-NEXT: vmv.x.s a1, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN64-NEXT: vmv.x.s a2, v12 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; 
ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 9 -; ZVFHMIN64-NEXT: vmv.x.s a2, v10 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN64-NEXT: vmv.x.s a3, v14 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a3 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v8, 8 -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN64-NEXT: sb a3, 8(sp) -; ZVFHMIN64-NEXT: sb a2, 9(sp) -; ZVFHMIN64-NEXT: sb a1, 10(sp) -; ZVFHMIN64-NEXT: sb a0, 11(sp) -; ZVFHMIN64-NEXT: lh a0, 510(sp) -; ZVFHMIN64-NEXT: lh a1, 254(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 127(sp) -; ZVFHMIN64-NEXT: lh a0, 508(sp) -; ZVFHMIN64-NEXT: lh a1, 252(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 126(sp) -; ZVFHMIN64-NEXT: lh a0, 506(sp) -; ZVFHMIN64-NEXT: lh a1, 250(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 125(sp) -; ZVFHMIN64-NEXT: lh a0, 504(sp) -; ZVFHMIN64-NEXT: lh a1, 248(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 124(sp) -; ZVFHMIN64-NEXT: lh a0, 502(sp) -; ZVFHMIN64-NEXT: lh a1, 246(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 123(sp) -; ZVFHMIN64-NEXT: lh a0, 500(sp) -; ZVFHMIN64-NEXT: lh a1, 244(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 122(sp) -; ZVFHMIN64-NEXT: lh a0, 498(sp) -; ZVFHMIN64-NEXT: lh a1, 242(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 14 -; ZVFHMIN64-NEXT: vslidedown.vi v12, v24, 13 -; ZVFHMIN64-NEXT: vslidedown.vi v14, v24, 12 -; ZVFHMIN64-NEXT: vslidedown.vi v18, v24, 11 -; ZVFHMIN64-NEXT: vslidedown.vi v20, v24, 10 -; ZVFHMIN64-NEXT: vslidedown.vi v22, v24, 9 -; ZVFHMIN64-NEXT: vslidedown.vi v24, v24, 8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 121(sp) -; ZVFHMIN64-NEXT: lh a2, 496(sp) -; ZVFHMIN64-NEXT: lh a3, 240(sp) -; ZVFHMIN64-NEXT: vmv.x.s a0, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 15 -; ZVFHMIN64-NEXT: vmv.x.s a1, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 +; ZVFHMIN64-NEXT: sb a5, 204(sp) +; ZVFHMIN64-NEXT: sb a4, 205(sp) +; ZVFHMIN64-NEXT: sb a2, 206(sp) +; ZVFHMIN64-NEXT: sb a3, 207(sp) ; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: sb a2, 120(sp) -; ZVFHMIN64-NEXT: lh a4, 494(sp) -; ZVFHMIN64-NEXT: lh a5, 238(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v10 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 14 -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: sb a4, 119(sp) -; ZVFHMIN64-NEXT: lh a4, 492(sp) -; ZVFHMIN64-NEXT: lh a5, 236(sp) -; ZVFHMIN64-NEXT: vmv.x.s a6, v12 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 13 -; ZVFHMIN64-NEXT: vmv.x.s a7, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: sb a4, 118(sp) -; ZVFHMIN64-NEXT: lh a4, 490(sp) -; ZVFHMIN64-NEXT: lh a5, 234(sp) -; ZVFHMIN64-NEXT: vmv.x.s t0, v14 -; 
ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 12 -; ZVFHMIN64-NEXT: vmv.x.s t1, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: sb a4, 117(sp) -; ZVFHMIN64-NEXT: lh a4, 488(sp) -; ZVFHMIN64-NEXT: lh a5, 232(sp) -; ZVFHMIN64-NEXT: vmv.x.s t2, v18 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 11 -; ZVFHMIN64-NEXT: vmv.x.s t3, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: sb a4, 116(sp) -; ZVFHMIN64-NEXT: lh a4, 486(sp) -; ZVFHMIN64-NEXT: lh a5, 230(sp) -; ZVFHMIN64-NEXT: vmv.x.s t4, v20 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 10 -; ZVFHMIN64-NEXT: vmv.x.s t5, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: sb a4, 115(sp) -; ZVFHMIN64-NEXT: lh a4, 484(sp) -; ZVFHMIN64-NEXT: lh a5, 228(sp) -; ZVFHMIN64-NEXT: vmv.x.s t6, v22 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 9 -; ZVFHMIN64-NEXT: vmv.x.s s2, v8 -; ZVFHMIN64-NEXT: vmv.x.s s3, v24 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: sb a4, 114(sp) +; ZVFHMIN64-NEXT: sb a2, 200(sp) +; ZVFHMIN64-NEXT: sb a6, 201(sp) +; ZVFHMIN64-NEXT: sb a7, 202(sp) +; ZVFHMIN64-NEXT: sb t0, 203(sp) +; ZVFHMIN64-NEXT: li a2, 128 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN64-NEXT: sb a3, 76(sp) -; ZVFHMIN64-NEXT: sb a2, 77(sp) -; ZVFHMIN64-NEXT: sb a1, 78(sp) -; ZVFHMIN64-NEXT: sb a0, 79(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t5 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s2 -; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s3 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 8 -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5 -; ZVFHMIN64-NEXT: sb a3, 72(sp) -; ZVFHMIN64-NEXT: sb a2, 73(sp) -; ZVFHMIN64-NEXT: sb a1, 74(sp) -; ZVFHMIN64-NEXT: sb a0, 75(sp) -; ZVFHMIN64-NEXT: li a0, 128 -; ZVFHMIN64-NEXT: mv a1, sp -; ZVFHMIN64-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; ZVFHMIN64-NEXT: vle8.v v8, (a1) +; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: sb a0, 242(sp) +; ZVFHMIN64-NEXT: addi a0, sp, 128 +; ZVFHMIN64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; ZVFHMIN64-NEXT: vle8.v v8, (a0) ; ZVFHMIN64-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN64-NEXT: vmsne.vi v0, v8, 0 -; ZVFHMIN64-NEXT: addi sp, s0, -768 -; ZVFHMIN64-NEXT: .cfi_def_cfa sp, 768 -; ZVFHMIN64-NEXT: ld ra, 760(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s0, 752(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s2, 744(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s3, 736(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, s0, -896 +; ZVFHMIN64-NEXT: .cfi_def_cfa sp, 896 +; ZVFHMIN64-NEXT: ld ra, 888(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s0, 880(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s2, 872(sp) # 8-byte Folded 
Reload +; ZVFHMIN64-NEXT: ld s3, 864(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s4, 856(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s5, 848(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s6, 840(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s7, 832(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s8, 824(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s9, 816(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s10, 808(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s11, 800(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: .cfi_restore ra ; ZVFHMIN64-NEXT: .cfi_restore s0 ; ZVFHMIN64-NEXT: .cfi_restore s2 ; ZVFHMIN64-NEXT: .cfi_restore s3 -; ZVFHMIN64-NEXT: addi sp, sp, 768 +; ZVFHMIN64-NEXT: .cfi_restore s4 +; ZVFHMIN64-NEXT: .cfi_restore s5 +; ZVFHMIN64-NEXT: .cfi_restore s6 +; ZVFHMIN64-NEXT: .cfi_restore s7 +; ZVFHMIN64-NEXT: .cfi_restore s8 +; ZVFHMIN64-NEXT: .cfi_restore s9 +; ZVFHMIN64-NEXT: .cfi_restore s10 +; ZVFHMIN64-NEXT: .cfi_restore s11 +; ZVFHMIN64-NEXT: addi sp, sp, 896 ; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN64-NEXT: ret %v = call <128 x i1> @llvm.vp.fcmp.v128f16(<128 x half> %va, <128 x half> %vb, metadata !"oeq", <128 x i1> %m, i32 %evl) @@ -3435,38 +3947,38 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v16, (a0) ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v6, v0, 2 ; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: bltu a2, a1, .LBB87_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: .LBB87_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v7, v8, v24, v0.t +; CHECK-NEXT: vmfeq.vv v7, v8, v16, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v8, v16, v24, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll index 4afa75e87c8f8..d52c42891fcc3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll @@ -28,8 +28,8 @@ define <8 x i1> @icmp_eq_vx_v8i7(<8 x i7> %va, i7 %b, <8 x i1> %m, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t @@ -45,8 +45,8 @@ define <8 x i1> @icmp_eq_vx_swap_v8i7(<8 x i7> %va, i7 %b, <8 x i1> %m, i32 zero ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; CHECK-NEXT: vmseq.vv v0, v9, v8, v0.t @@ -605,11 +605,11 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: addi a4, a0, 128 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a2) -; CHECK-NEXT: addi a2, a0, 128 -; CHECK-NEXT: vle8.v v8, (a2) ; CHECK-NEXT: addi a2, a3, -128 +; CHECK-NEXT: vle8.v v8, (a4) ; CHECK-NEXT: sltu a4, a3, a2 ; CHECK-NEXT: vle8.v v24, (a0) ; CHECK-NEXT: addi a0, sp, 16 @@ -1253,38 +1253,38 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v24, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v6, v0, 4 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 4 ; CHECK-NEXT: bltu a2, a3, .LBB99_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: .LBB99_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vmseq.vv v7, v8, v24, v0.t +; CHECK-NEXT: vmseq.vv v7, v8, v16, v0.t ; CHECK-NEXT: addi a0, a2, -32 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vmseq.vv v8, v16, v24, v0.t 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll index c9e6a8730eec7..8b18be908089f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll @@ -9,28 +9,28 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) { ; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v0 ; RV32-NEXT: slli a1, a0, 18 -; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: srli a2, a0, 31 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV32-NEXT: vmv.v.x v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: slli a1, a0, 27 -; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: slli a1, a0, 28 +; RV32-NEXT: slli a2, a0, 27 ; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: slli a1, a0, 19 +; RV32-NEXT: slli a1, a0, 26 ; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: slli a2, a0, 26 +; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: slli a1, a0, 28 ; RV32-NEXT: srli a2, a2, 31 -; RV32-NEXT: vmv.v.x v9, a2 -; RV32-NEXT: vslide1down.vx v9, v9, a1 -; RV32-NEXT: slli a1, a0, 24 -; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: vslide1down.vx v9, v9, a1 +; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: slli a2, a0, 19 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: vslide1down.vx v9, v9, a2 +; RV32-NEXT: slli a2, a0, 24 ; RV32-NEXT: slli a0, a0, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: srli a2, a2, 31 ; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: vslide1down.vx v9, v9, a2 ; RV32-NEXT: vmv.v.i v0, 15 ; RV32-NEXT: vslide1down.vx v9, v9, a0 ; RV32-NEXT: vslidedown.vi v8, v9, 4, v0.t @@ -43,28 +43,28 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) { ; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v0 ; RV64-NEXT: slli a1, a0, 50 -; RV64-NEXT: srli a1, a1, 63 ; RV64-NEXT: srli a2, a0, 63 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64-NEXT: vmv.v.x v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a1 -; RV64-NEXT: slli a1, a0, 59 +; RV64-NEXT: slli a2, a0, 59 ; RV64-NEXT: srli a1, a1, 63 ; RV64-NEXT: vslide1down.vx v8, v8, a1 -; RV64-NEXT: slli a1, a0, 60 +; RV64-NEXT: slli a1, a0, 58 ; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: vslide1down.vx v8, v8, a1 -; RV64-NEXT: slli a1, a0, 51 -; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: slli a2, a0, 58 +; RV64-NEXT: vmv.v.x v9, a1 +; RV64-NEXT: slli a1, a0, 60 ; RV64-NEXT: srli a2, a2, 63 -; RV64-NEXT: vmv.v.x v9, a2 -; RV64-NEXT: vslide1down.vx v9, v9, a1 -; RV64-NEXT: slli a1, a0, 56 -; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: vslide1down.vx v9, v9, a1 +; RV64-NEXT: vslide1down.vx v8, v8, a2 +; RV64-NEXT: slli a2, a0, 51 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: vslide1down.vx v9, v9, a2 +; RV64-NEXT: slli a2, a0, 56 ; RV64-NEXT: slli a0, a0, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: srli a2, a2, 63 ; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v8, v8, a1 +; RV64-NEXT: vslide1down.vx v9, v9, a2 ; RV64-NEXT: vmv.v.i v0, 15 ; RV64-NEXT: vslide1down.vx v9, v9, a0 ; RV64-NEXT: vslidedown.vi v8, v9, 4, v0.t @@ -80,10 +80,10 @@ define <4 x i32> @v4i32_v8i32(<8 x i32>) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vmv.v.i v0, 5 ; CHECK-NEXT: vsrl.vi v10, v10, 1 ; CHECK-NEXT: vrsub.vi v11, v10, 3 ; CHECK-NEXT: vrgather.vv v10, v8, v11 -; CHECK-NEXT: vmv.v.i v0, 5 ; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; 
CHECK-NEXT: vslidedown.vi v8, v8, 4 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -97,21 +97,20 @@ define <4 x i32> @v4i32_v8i32(<8 x i32>) { define <4 x i32> @v4i32_v16i32(<16 x i32>) { ; RV32-LABEL: v4i32_v16i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vmv.v.i v12, 1 -; RV32-NEXT: vmv.v.i v13, 6 -; RV32-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV32-NEXT: vslideup.vi v13, v12, 1 ; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma ; RV32-NEXT: vslidedown.vi v16, v8, 8 ; RV32-NEXT: vmv4r.v v20, v8 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vmv.v.i v8, 1 +; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: vmv.v.i v10, 6 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vmv2r.v v22, v14 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV32-NEXT: vmv.v.i v0, 10 +; RV32-NEXT: vsetivli zero, 2, e16, m1, tu, ma +; RV32-NEXT: vslideup.vi v10, v8, 1 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; RV32-NEXT: vnsrl.wx v8, v20, a0 -; RV32-NEXT: vrgatherei16.vv v8, v16, v13, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v10, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: v4i32_v16i32: @@ -121,14 +120,15 @@ define <4 x i32> @v4i32_v16i32(<16 x i32>) { ; RV64-NEXT: vmv4r.v v20, v8 ; RV64-NEXT: li a0, 32 ; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 10 ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64-NEXT: vnsrl.wx v8, v20, a0 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.i v0, 10 ; RV64-NEXT: li a0, 3 ; RV64-NEXT: slli a0, a0, 33 ; RV64-NEXT: addi a0, a0, 1 ; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vmv.v.x v10, a0 ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; RV64-NEXT: vrgatherei16.vv v8, v16, v10, v0.t @@ -151,21 +151,21 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) { ; RV32-NEXT: andi sp, sp, -128 ; RV32-NEXT: li a0, 32 ; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v16, v8, 1 ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV32-NEXT: vse32.v v8, (a1) -; RV32-NEXT: lw a0, 36(sp) -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 1 -; RV32-NEXT: vmv.x.s a1, v10 -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: lw a0, 120(sp) ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32-NEXT: vslidedown.vi v8, v8, 4 -; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: lw a0, 36(sp) +; RV32-NEXT: vmv.x.s a1, v16 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vslide1down.vx v8, v10, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: lw a1, 120(sp) +; RV32-NEXT: vslide1down.vx v9, v9, a0 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslide1down.vx v8, v9, a0 +; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: addi sp, s0, -256 ; RV32-NEXT: .cfi_def_cfa sp, 256 ; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload @@ -189,21 +189,21 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) { ; RV64-NEXT: andi sp, sp, -128 ; RV64-NEXT: li a0, 32 ; RV64-NEXT: mv a1, sp +; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v16, v8, 1 ; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV64-NEXT: vse32.v v8, (a1) -; RV64-NEXT: lw a0, 36(sp) -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-NEXT: vmv.x.s a1, v10 -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: lw a0, 120(sp) ; RV64-NEXT: vsetivli zero, 1, e32, m2, 
ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 4 -; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: lw a0, 36(sp) +; RV64-NEXT: vmv.x.s a1, v16 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vslide1down.vx v8, v10, a1 -; RV64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NEXT: vmv.v.x v9, a1 +; RV64-NEXT: lw a1, 120(sp) +; RV64-NEXT: vslide1down.vx v9, v9, a0 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslide1down.vx v8, v9, a0 +; RV64-NEXT: vslide1down.vx v8, v8, a1 ; RV64-NEXT: addi sp, s0, -256 ; RV64-NEXT: .cfi_def_cfa sp, 256 ; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload @@ -251,22 +251,24 @@ define <16 x i32> @v16i32_v4i32(<4 x i32>) { ; CHECK-LABEL: v16i32_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, 2 -; CHECK-NEXT: addi a1, a0, 265 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v9, 3 -; CHECK-NEXT: vmerge.vim v10, v9, 2, v0 +; CHECK-NEXT: addi a1, a0, 265 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: lui a1, 4 ; CHECK-NEXT: addi a1, a1, 548 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vmerge.vim v9, v9, 2, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: addi a0, a0, -1856 -; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 -; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: vmerge.vim v9, v9, 0, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vsext.vf2 v16, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -285,22 +287,23 @@ define <32 x i32> @v32i32_v4i32(<4 x i32>) { ; CHECK-NEXT: addi a1, a1, 1161 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: lui a1, 270865 +; CHECK-NEXT: addi a1, a1, 548 +; CHECK-NEXT: vmv.s.x v9, a1 +; CHECK-NEXT: lui a1, 100550 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v10, 3 -; CHECK-NEXT: vmerge.vim v10, v10, 2, v0 -; CHECK-NEXT: lui a0, 270865 -; CHECK-NEXT: addi a0, a0, 548 +; CHECK-NEXT: addi a0, a1, 64 +; CHECK-NEXT: vmerge.vim v18, v10, 2, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: lui a0, 100550 -; CHECK-NEXT: addi a0, a0, 64 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 +; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v18, v18, 0, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmerge.vim v16, v18, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vsext.vf2 v24, v10 +; CHECK-NEXT: vsext.vf2 v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vrgatherei16.vv v16, v8, v24 ; CHECK-NEXT: vmv.v.v v8, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll index 925366e8b1d50..38026bb591f79 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll @@ -51,12 +51,10 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 
x i32> %b, <1 x i32> %c, <1 x ; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; VLA-NEXT: vslideup.vi v14, v15, 1 ; VLA-NEXT: vslideup.vi v12, v13, 1 -; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; VLA-NEXT: vslideup.vi v12, v14, 2 -; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; VLA-NEXT: vslideup.vi v10, v11, 1 ; VLA-NEXT: vslideup.vi v8, v9, 1 ; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vslideup.vi v12, v14, 2 ; VLA-NEXT: vslideup.vi v8, v10, 2 ; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; VLA-NEXT: vslideup.vi v8, v12, 4 @@ -65,9 +63,9 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x ; VLS-LABEL: concat_8xv1i32: ; VLS: # %bb.0: ; VLS-NEXT: vmv1r.v v17, v12 +; VLS-NEXT: vmv1r.v v16, v8 ; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; VLS-NEXT: vslideup.vi v14, v15, 1 -; VLS-NEXT: vmv1r.v v16, v8 ; VLS-NEXT: vslideup.vi v17, v13, 1 ; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; VLS-NEXT: vslideup.vi v17, v14, 2 @@ -131,12 +129,10 @@ define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x ; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; VLA-NEXT: vslideup.vi v14, v15, 2 ; VLA-NEXT: vslideup.vi v12, v13, 2 -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vslideup.vi v12, v14, 4 -; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; VLA-NEXT: vslideup.vi v10, v11, 2 ; VLA-NEXT: vslideup.vi v8, v9, 2 ; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v12, v14, 4 ; VLA-NEXT: vslideup.vi v8, v10, 4 ; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; VLA-NEXT: vslideup.vi v8, v12, 8 @@ -187,10 +183,10 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x ; VLA-NEXT: vmv2r.v v20, v14 ; VLA-NEXT: vmv2r.v v16, v12 ; VLA-NEXT: vmv2r.v v12, v10 +; VLA-NEXT: li a0, 32 ; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; VLA-NEXT: vslideup.vi v16, v20, 8 ; VLA-NEXT: vslideup.vi v8, v12, 8 -; VLA-NEXT: li a0, 32 ; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; VLA-NEXT: vslideup.vi v8, v16, 16 ; VLA-NEXT: ret @@ -209,22 +205,20 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x ; VLA: # %bb.0: ; VLA-NEXT: vmv1r.v v18, v15 ; VLA-NEXT: vmv1r.v v20, v14 -; VLA-NEXT: vmv1r.v v22, v13 +; VLA-NEXT: vmv1r.v v14, v13 ; VLA-NEXT: vmv1r.v v16, v12 -; VLA-NEXT: vmv1r.v v14, v11 +; VLA-NEXT: vmv1r.v v22, v11 ; VLA-NEXT: vmv1r.v v12, v10 ; VLA-NEXT: vmv1r.v v10, v9 +; VLA-NEXT: li a0, 32 ; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; VLA-NEXT: vslideup.vi v20, v18, 4 -; VLA-NEXT: vslideup.vi v16, v22, 4 -; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; VLA-NEXT: vslideup.vi v16, v20, 8 -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vslideup.vi v12, v14, 4 +; VLA-NEXT: vslideup.vi v16, v14, 4 +; VLA-NEXT: vslideup.vi v12, v22, 4 ; VLA-NEXT: vslideup.vi v8, v10, 4 ; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLA-NEXT: vslideup.vi v16, v20, 8 ; VLA-NEXT: vslideup.vi v8, v12, 8 -; VLA-NEXT: li a0, 32 ; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; VLA-NEXT: vslideup.vi v8, v16, 16 ; VLA-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll index a8f75f8d1c24d..f04faf5cd2c54 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll @@ -15,15 +15,15 @@ define void @deinterleave3_0_i8(ptr %in, ptr %out) { ; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: li a0, 3 ; 
CHECK-NEXT: vmul.vx v9, v9, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v9 -; CHECK-NEXT: vadd.vi v9, v9, -8 ; CHECK-NEXT: li a0, 56 ; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vadd.vi v10, v9, -8 +; CHECK-NEXT: vrgather.vv v11, v8, v9 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vv v10, v8, v9, v0.t -; CHECK-NEXT: vse8.v v10, (a1) +; CHECK-NEXT: vrgather.vv v11, v8, v10, v0.t +; CHECK-NEXT: vse8.v v11, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -42,13 +42,13 @@ define void @deinterleave3_8_i8(ptr %in, ptr %out) { ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 3 ; CHECK-NEXT: vmadd.vx v10, a0, v9 +; CHECK-NEXT: li a0, 24 ; CHECK-NEXT: vrgather.vv v9, v8, v10 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: li a0, 24 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma @@ -67,13 +67,13 @@ define void @deinterleave4_0_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vmv.v.i v0, 12 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v9, v8, 4 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vwaddu.vv v10, v8, v9 -; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vwmaccu.vx v10, a0, v9 -; CHECK-NEXT: vmv.v.i v0, 12 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: vsll.vi v9, v9, 2 @@ -100,11 +100,11 @@ define void @deinterleave4_8_i8(ptr %in, ptr %out) { ; CHECK-NEXT: vmv.v.i v9, -9 ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 5 +; CHECK-NEXT: vmv.v.i v0, 12 ; CHECK-NEXT: vmacc.vx v9, a0, v10 ; CHECK-NEXT: vsll.vi v10, v10, 2 ; CHECK-NEXT: vadd.vi v10, v10, 1 ; CHECK-NEXT: vrgather.vv v11, v8, v10 -; CHECK-NEXT: vmv.v.i v0, 12 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -126,15 +126,15 @@ define void @deinterleave5_0_i8(ptr %in, ptr %out) { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: li a0, 5 -; CHECK-NEXT: vmul.vx v9, v9, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v9 -; CHECK-NEXT: vadd.vi v9, v9, -8 ; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vmul.vx v9, v9, a0 +; CHECK-NEXT: vadd.vi v10, v9, -8 +; CHECK-NEXT: vrgather.vv v11, v8, v9 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vv v10, v8, v9, v0.t -; CHECK-NEXT: vse8.v v10, (a1) +; CHECK-NEXT: vrgather.vv v11, v8, v10, v0.t +; CHECK-NEXT: vse8.v v11, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -153,8 +153,8 @@ define void @deinterleave5_8_i8(ptr %in, ptr %out) { ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 5 ; CHECK-NEXT: vmadd.vx v10, a0, v9 -; CHECK-NEXT: vrgather.vv v9, v8, v10 ; CHECK-NEXT: vmv.v.i v0, 4 +; CHECK-NEXT: vrgather.vv v9, v8, v10 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -176,9 +176,9 @@ define void @deinterleave6_0_i8(ptr %in, ptr %out) { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: li a0, 6 +; 
CHECK-NEXT: vmv.v.i v0, 4 ; CHECK-NEXT: vmul.vx v9, v9, a0 ; CHECK-NEXT: vrgather.vv v10, v8, v9 -; CHECK-NEXT: vmv.v.i v0, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -202,8 +202,8 @@ define void @deinterleave6_8_i8(ptr %in, ptr %out) { ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 6 ; CHECK-NEXT: vmadd.vx v10, a0, v9 -; CHECK-NEXT: vrgather.vv v9, v8, v10 ; CHECK-NEXT: vmv.v.i v0, 4 +; CHECK-NEXT: vrgather.vv v9, v8, v10 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -225,9 +225,9 @@ define void @deinterleave7_0_i8(ptr %in, ptr %out) { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: li a0, 7 +; CHECK-NEXT: vmv.v.i v0, 4 ; CHECK-NEXT: vmul.vx v9, v9, a0 ; CHECK-NEXT: vrgather.vv v10, v8, v9 -; CHECK-NEXT: vmv.v.i v0, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -250,8 +250,8 @@ define void @deinterleave7_8_i8(ptr %in, ptr %out) { ; CHECK-NEXT: vmv.v.i v9, -6 ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 6 -; CHECK-NEXT: vmadd.vx v10, a0, v9 ; CHECK-NEXT: vmv.v.i v0, 6 +; CHECK-NEXT: vmadd.vx v10, a0, v9 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v9, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll index f5c45ba9ea581..407535831aeda 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -93,18 +93,19 @@ define <32 x i1> @reverse_v32i1(<32 x i1> %a) { ; NO-ZVBB-LABEL: reverse_v32i1: ; NO-ZVBB: # %bb.0: ; NO-ZVBB-NEXT: li a0, 32 -; NO-ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; NO-ZVBB-NEXT: vmv.v.i v8, 0 -; NO-ZVBB-NEXT: vmerge.vim v8, v8, 1, v0 ; NO-ZVBB-NEXT: csrr a1, vlenb +; NO-ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; NO-ZVBB-NEXT: vid.v v8 +; NO-ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; NO-ZVBB-NEXT: vmv.v.i v10, 0 ; NO-ZVBB-NEXT: addi a2, a1, -1 +; NO-ZVBB-NEXT: slli a1, a1, 1 +; NO-ZVBB-NEXT: vmerge.vim v10, v10, 1, v0 ; NO-ZVBB-NEXT: vsetvli a3, zero, e16, m2, ta, ma -; NO-ZVBB-NEXT: vid.v v10 -; NO-ZVBB-NEXT: vrsub.vx v10, v10, a2 +; NO-ZVBB-NEXT: vrsub.vx v8, v8, a2 ; NO-ZVBB-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; NO-ZVBB-NEXT: vrgatherei16.vv v13, v8, v10 -; NO-ZVBB-NEXT: vrgatherei16.vv v12, v9, v10 -; NO-ZVBB-NEXT: slli a1, a1, 1 +; NO-ZVBB-NEXT: vrgatherei16.vv v13, v10, v8 +; NO-ZVBB-NEXT: vrgatherei16.vv v12, v11, v8 ; NO-ZVBB-NEXT: addi a1, a1, -32 ; NO-ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; NO-ZVBB-NEXT: vslidedown.vx v8, v12, a1 @@ -124,23 +125,24 @@ define <64 x i1> @reverse_v64i1(<64 x i1> %a) { ; NO-ZVBB-LABEL: reverse_v64i1: ; NO-ZVBB: # %bb.0: ; NO-ZVBB-NEXT: li a0, 64 +; NO-ZVBB-NEXT: csrr a1, vlenb +; NO-ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; NO-ZVBB-NEXT: vid.v v12 ; NO-ZVBB-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; NO-ZVBB-NEXT: vmv.v.i v8, 0 -; NO-ZVBB-NEXT: vmerge.vim v12, v8, 1, v0 -; NO-ZVBB-NEXT: csrr a1, vlenb ; NO-ZVBB-NEXT: addi a2, a1, -1 +; NO-ZVBB-NEXT: slli a1, a1, 2 +; NO-ZVBB-NEXT: vmerge.vim v8, v8, 1, v0 ; NO-ZVBB-NEXT: vsetvli a3, zero, e16, m2, ta, ma -; NO-ZVBB-NEXT: vid.v v8 -; NO-ZVBB-NEXT: vrsub.vx v16, v8, a2 +; NO-ZVBB-NEXT: vrsub.vx v12, 
v12, a2 ; NO-ZVBB-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; NO-ZVBB-NEXT: vrgatherei16.vv v11, v12, v16 -; NO-ZVBB-NEXT: vrgatherei16.vv v10, v13, v16 -; NO-ZVBB-NEXT: vrgatherei16.vv v9, v14, v16 -; NO-ZVBB-NEXT: vrgatherei16.vv v8, v15, v16 -; NO-ZVBB-NEXT: slli a1, a1, 2 +; NO-ZVBB-NEXT: vrgatherei16.vv v19, v8, v12 +; NO-ZVBB-NEXT: vrgatherei16.vv v18, v9, v12 +; NO-ZVBB-NEXT: vrgatherei16.vv v17, v10, v12 +; NO-ZVBB-NEXT: vrgatherei16.vv v16, v11, v12 ; NO-ZVBB-NEXT: addi a1, a1, -64 ; NO-ZVBB-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; NO-ZVBB-NEXT: vslidedown.vx v8, v8, a1 +; NO-ZVBB-NEXT: vslidedown.vx v8, v16, a1 ; NO-ZVBB-NEXT: vmsne.vi v0, v8, 0 ; NO-ZVBB-NEXT: ret ; @@ -157,13 +159,15 @@ define <128 x i1> @reverse_v128i1(<128 x i1> %a) { ; CHECK-LABEL: reverse_v128i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 128 -; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 ; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma -; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: vrsub.vx v24, v8, a2 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v15, v16, v24 @@ -174,7 +178,6 @@ define <128 x i1> @reverse_v128i1(<128 x i1> %a) { ; CHECK-NEXT: vrgatherei16.vv v10, v21, v24 ; CHECK-NEXT: vrgatherei16.vv v9, v22, v24 ; CHECK-NEXT: vrgatherei16.vv v8, v23, v24 -; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: addi a1, a1, -128 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a1 @@ -253,15 +256,15 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) { ; CHECK-LABEL: reverse_v32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: addi a1, a0, -1 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: addi a1, a0, -1 +; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: vrsub.vx v10, v10, a1 +; CHECK-NEXT: addi a0, a0, -32 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v13, v8, v10 ; CHECK-NEXT: vrgatherei16.vv v12, v9, v10 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -32 ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v12, a0 @@ -274,20 +277,20 @@ define <64 x i8> @reverse_v64i8(<64 x i8> %a) { ; CHECK-LABEL: reverse_v64i8: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: addi a1, a0, -1 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v16, v12, a1 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v15, v8, v16 -; CHECK-NEXT: vrgatherei16.vv v14, v9, v16 -; CHECK-NEXT: vrgatherei16.vv v13, v10, v16 -; CHECK-NEXT: vrgatherei16.vv v12, v11, v16 +; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: vrsub.vx v12, v12, a1 ; CHECK-NEXT: addi a0, a0, -64 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v19, v8, v12 +; CHECK-NEXT: vrgatherei16.vv v18, v9, v12 +; CHECK-NEXT: vrgatherei16.vv v17, v10, v12 +; CHECK-NEXT: vrgatherei16.vv v16, v11, v12 ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v12, a0 
+; CHECK-NEXT: vslidedown.vx v8, v16, a0 ; CHECK-NEXT: ret %res = shufflevector <64 x i8> %a, <64 x i8> poison, <64 x i32> ret <64 x i8> %res @@ -349,10 +352,10 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) { ; CHECK-LABEL: reverse_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v10, v10, a1 ; CHECK-NEXT: vrgather.vv v13, v8, v10 ; CHECK-NEXT: vrgather.vv v12, v9, v10 @@ -368,20 +371,20 @@ define <32 x i16> @reverse_v32i16(<32 x i16> %a) { ; CHECK-LABEL: reverse_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a1, a0, 1 -; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v16, v12, a1 -; CHECK-NEXT: vrgather.vv v15, v8, v16 -; CHECK-NEXT: vrgather.vv v14, v9, v16 -; CHECK-NEXT: vrgather.vv v13, v10, v16 -; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: vrsub.vx v12, v12, a1 +; CHECK-NEXT: vrgather.vv v19, v8, v12 +; CHECK-NEXT: vrgather.vv v18, v9, v12 +; CHECK-NEXT: vrgather.vv v17, v10, v12 +; CHECK-NEXT: vrgather.vv v16, v11, v12 ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v12, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 ; CHECK-NEXT: ret %res = shufflevector <32 x i16> %a, <32 x i16> poison, <32 x i32> ret <32 x i16> %res @@ -430,14 +433,14 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) { ; CHECK-LABEL: reverse_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v10, v10, a1 ; CHECK-NEXT: vrgather.vv v13, v8, v10 ; CHECK-NEXT: vrgather.vv v12, v9, v10 -; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -8 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v12, a0 @@ -450,10 +453,10 @@ define <16 x i32> @reverse_v16i32(<16 x i32> %a) { ; CHECK-LABEL: reverse_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: vrsub.vx v16, v12, a1 ; CHECK-NEXT: vrgather.vv v15, v8, v16 ; CHECK-NEXT: vrgather.vv v14, v9, v16 @@ -491,14 +494,14 @@ define <4 x i64> @reverse_v4i64(<4 x i64> %a) { ; CHECK-LABEL: reverse_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v10, v10, a1 ; CHECK-NEXT: vrgather.vv v13, v8, v10 ; CHECK-NEXT: vrgather.vv v12, v9, v10 -; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -4 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v12, a0 @@ -511,19 +514,19 @@ define <8 x i64> @reverse_v8i64(<8 x i64> 
%a) { ; CHECK-LABEL: reverse_v8i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v16, v12, a1 -; CHECK-NEXT: vrgather.vv v15, v8, v16 -; CHECK-NEXT: vrgather.vv v14, v9, v16 -; CHECK-NEXT: vrgather.vv v13, v10, v16 -; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: srli a1, a0, 3 ; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vrsub.vx v12, v12, a1 +; CHECK-NEXT: vrgather.vv v19, v8, v12 +; CHECK-NEXT: vrgather.vv v18, v9, v12 +; CHECK-NEXT: vrgather.vv v17, v10, v12 +; CHECK-NEXT: vrgather.vv v16, v11, v12 ; CHECK-NEXT: addi a0, a0, -8 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v12, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 ; CHECK-NEXT: ret %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> ret <8 x i64> %res @@ -586,10 +589,10 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) { ; CHECK-LABEL: reverse_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v10, v10, a1 ; CHECK-NEXT: vrgather.vv v13, v8, v10 ; CHECK-NEXT: vrgather.vv v12, v9, v10 @@ -605,20 +608,20 @@ define <32 x half> @reverse_v32f16(<32 x half> %a) { ; CHECK-LABEL: reverse_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a1, a0, 1 -; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v16, v12, a1 -; CHECK-NEXT: vrgather.vv v15, v8, v16 -; CHECK-NEXT: vrgather.vv v14, v9, v16 -; CHECK-NEXT: vrgather.vv v13, v10, v16 -; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: vrsub.vx v12, v12, a1 +; CHECK-NEXT: vrgather.vv v19, v8, v12 +; CHECK-NEXT: vrgather.vv v18, v9, v12 +; CHECK-NEXT: vrgather.vv v17, v10, v12 +; CHECK-NEXT: vrgather.vv v16, v11, v12 ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v12, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 ; CHECK-NEXT: ret %res = shufflevector <32 x half> %a, <32 x half> poison, <32 x i32> ret <32 x half> %res @@ -667,14 +670,14 @@ define <8 x float> @reverse_v8f32(<8 x float> %a) { ; CHECK-LABEL: reverse_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v10, v10, a1 ; CHECK-NEXT: vrgather.vv v13, v8, v10 ; CHECK-NEXT: vrgather.vv v12, v9, v10 -; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -8 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v12, a0 @@ -687,10 +690,10 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) { ; CHECK-LABEL: reverse_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: addi a1, a1, -1 -; 
CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: vrsub.vx v16, v12, a1 ; CHECK-NEXT: vrgather.vv v15, v8, v16 ; CHECK-NEXT: vrgather.vv v14, v9, v16 @@ -728,14 +731,14 @@ define <4 x double> @reverse_v4f64(<4 x double> %a) { ; CHECK-LABEL: reverse_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v10, v10, a1 ; CHECK-NEXT: vrgather.vv v13, v8, v10 ; CHECK-NEXT: vrgather.vv v12, v9, v10 -; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -4 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v12, a0 @@ -748,19 +751,19 @@ define <8 x double> @reverse_v8f64(<8 x double> %a) { ; CHECK-LABEL: reverse_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v16, v12, a1 -; CHECK-NEXT: vrgather.vv v15, v8, v16 -; CHECK-NEXT: vrgather.vv v14, v9, v16 -; CHECK-NEXT: vrgather.vv v13, v10, v16 -; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: srli a1, a0, 3 ; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vrsub.vx v12, v12, a1 +; CHECK-NEXT: vrgather.vv v19, v8, v12 +; CHECK-NEXT: vrgather.vv v18, v9, v12 +; CHECK-NEXT: vrgather.vv v17, v10, v12 +; CHECK-NEXT: vrgather.vv v16, v11, v12 ; CHECK-NEXT: addi a0, a0, -8 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v12, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 ; CHECK-NEXT: ret %res = shufflevector <8 x double> %a, <8 x double> poison, <8 x i32> ret <8 x double> %res @@ -946,9 +949,9 @@ define <16 x i8> @reverse_v16i8_2(<8 x i8> %a, <8 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vrsub.vi v12, v11, 15 ; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu @@ -965,25 +968,24 @@ define <32 x i8> @reverse_v32i8_2(<16 x i8> %a, <16 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v9 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: addi a1, a0, -1 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: vrsub.vx v12, v12, a1 +; CHECK-NEXT: lui a1, 16 +; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v9, v8, v12 -; CHECK-NEXT: vrgatherei16.vv v8, v11, v12 +; CHECK-NEXT: vrgatherei16.vv v15, v8, v12 +; CHECK-NEXT: vrgatherei16.vv v14, v9, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: addi a0, a0, -32 -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vi v12, v12, 15 -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; 
CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; CHECK-NEXT: vrsub.vi v12, v8, 15 +; CHECK-NEXT: vslidedown.vx v8, v14, a0 ; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t ; CHECK-NEXT: ret %res = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> @@ -1035,21 +1037,21 @@ define <16 x i16> @reverse_v16i16_2(<8 x i16> %a, <8 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v9 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: vrsub.vx v9, v9, a1 ; CHECK-NEXT: vrgather.vv v13, v8, v9 -; CHECK-NEXT: vrgather.vv v12, v8, v9 -; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vrgather.vv v12, v11, v9 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a1, 255 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vrsub.vi v14, v8, 7 +; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: vslidedown.vx v8, v12, a0 -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vi v12, v12, 7 -; CHECK-NEXT: li a0, 255 -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t +; CHECK-NEXT: vrgather.vv v8, v10, v14, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> ret <16 x i16> %res @@ -1058,30 +1060,30 @@ define <16 x i16> @reverse_v16i16_2(<8 x i16> %a, <8 x i16> %b) { define <32 x i16> @reverse_v32i16_2(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: reverse_v32i16_2: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: lui a1, 16 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v12, v12, a1 -; CHECK-NEXT: vrgather.vv v19, v8, v12 -; CHECK-NEXT: vrgather.vv v18, v9, v12 -; CHECK-NEXT: vrgather.vv v16, v8, v12 -; CHECK-NEXT: vmv2r.v v12, v10 -; CHECK-NEXT: vmv.v.v v17, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vrsub.vx v10, v10, a1 +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: vrgather.vv v19, v8, v10 +; CHECK-NEXT: vrgather.vv v18, v9, v10 +; CHECK-NEXT: vrgather.vv v16, v11, v10 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: addi a0, a0, -32 -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vrsub.vi v20, v8, 15 +; CHECK-NEXT: vmv1r.v v17, v16 ; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vi v16, v16, 15 -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu -; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t +; CHECK-NEXT: vrgather.vv v8, v12, v20, v0.t ; CHECK-NEXT: ret %res = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> ret <32 x i16> %res @@ -1116,23 +1118,22 @@ define <8 x i32> @reverse_v8i32_2(<4 x i32> %a, <4 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v9 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; 
CHECK-NEXT: vid.v v9 ; CHECK-NEXT: vrsub.vx v9, v9, a1 ; CHECK-NEXT: vrgather.vv v13, v8, v9 -; CHECK-NEXT: vrgather.vv v12, v8, v9 +; CHECK-NEXT: vrgather.vv v12, v11, v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vrsub.vi v14, v8, 3 ; CHECK-NEXT: addi a0, a0, -8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrsub.vi v12, v11, 3 -; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v8, v10, v12, v0.t +; CHECK-NEXT: vslidedown.vx v8, v12, a0 +; CHECK-NEXT: vrgatherei16.vv v8, v10, v14, v0.t ; CHECK-NEXT: ret %res = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> ret <8 x i32> %res @@ -1141,26 +1142,25 @@ define <8 x i32> @reverse_v8i32_2(<4 x i32> %a, <4 x i32> %b) { define <16 x i32> @reverse_v16i32_2(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: reverse_v16i32_2: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v12, v12, a1 -; CHECK-NEXT: vrgather.vv v19, v8, v12 -; CHECK-NEXT: vrgather.vv v18, v9, v12 -; CHECK-NEXT: vrgather.vv v16, v8, v12 -; CHECK-NEXT: vmv2r.v v12, v10 -; CHECK-NEXT: vmv.v.v v17, v16 -; CHECK-NEXT: addi a0, a0, -16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vrsub.vx v14, v10, a1 +; CHECK-NEXT: vrgather.vv v11, v8, v14 +; CHECK-NEXT: vrgather.vv v10, v9, v14 +; CHECK-NEXT: vrgather.vv v8, v9, v14 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vid.v v14 +; CHECK-NEXT: li a1, 255 +; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: vrsub.vi v16, v14, 7 -; CHECK-NEXT: li a0, 255 -; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vmv1r.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> @@ -1170,33 +1170,32 @@ define <16 x i32> @reverse_v16i32_2(<8 x i32> %a, <8 x i32> %b) { define <32 x i32> @reverse_v32i32_2(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: reverse_v32i32_2: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv4r.v v16, v12 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v17, v16, a1 -; CHECK-NEXT: vrgather.vv v23, v8, v17 -; CHECK-NEXT: vrgather.vv v22, v9, v17 -; CHECK-NEXT: vrgather.vv v21, v10, v17 -; CHECK-NEXT: vrgather.vv v20, v11, v17 -; CHECK-NEXT: vrgather.vv v16, v8, v17 -; CHECK-NEXT: vmv.v.v v17, v16 -; CHECK-NEXT: vmv4r.v v24, v12 -; CHECK-NEXT: vmv2r.v v18, v16 +; CHECK-NEXT: vrsub.vx v20, v12, a1 +; CHECK-NEXT: vrgather.vv v15, v8, v20 +; CHECK-NEXT: vrgather.vv v14, v9, v20 +; CHECK-NEXT: vrgather.vv v13, v10, v20 +; CHECK-NEXT: vrgather.vv v12, v11, v20 +; CHECK-NEXT: lui a1, 16 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: slli 
a0, a0, 1 +; CHECK-NEXT: vrgather.vv v8, v9, v20 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vid.v v20 ; CHECK-NEXT: addi a0, a0, -32 -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vi v16, v16, 15 -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vrsub.vi v24, v20, 15 +; CHECK-NEXT: vmv2r.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vrgatherei16.vv v8, v16, v24, v0.t ; CHECK-NEXT: ret %res = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> ret <32 x i32> %res @@ -1220,26 +1219,27 @@ define <4 x i64> @reverse_v4i64_2(<2 x i64> %a, < 2 x i64> %b) { define <8 x i64> @reverse_v8i64_2(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: reverse_v8i64_2: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 3 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v12, v12, a1 -; CHECK-NEXT: vrgather.vv v19, v8, v12 -; CHECK-NEXT: vrgather.vv v18, v9, v12 -; CHECK-NEXT: vrgather.vv v16, v8, v12 -; CHECK-NEXT: vmv2r.v v12, v10 -; CHECK-NEXT: vmv.v.v v17, v16 +; CHECK-NEXT: vrsub.vx v14, v10, a1 +; CHECK-NEXT: vrgather.vv v11, v8, v14 +; CHECK-NEXT: vrgather.vv v10, v9, v14 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vid.v v15 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vrgather.vv v8, v9, v14 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vrsub.vi v16, v15, 3 ; CHECK-NEXT: addi a0, a0, -8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vid.v v14 -; CHECK-NEXT: vrsub.vi v16, v14, 3 -; CHECK-NEXT: vmv.v.i v0, 15 +; CHECK-NEXT: vmv1r.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %a, <4 x i64> %b, <8 x i32> @@ -1291,21 +1291,21 @@ define <16 x half> @reverse_v16f16_2(<8 x half> %a, <8 x half> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v9 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: vrsub.vx v9, v9, a1 ; CHECK-NEXT: vrgather.vv v13, v8, v9 -; CHECK-NEXT: vrgather.vv v12, v8, v9 -; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vrgather.vv v12, v11, v9 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: li a1, 255 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vrsub.vi v14, v8, 7 +; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: vslidedown.vx v8, v12, a0 -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vi v12, v12, 7 -; CHECK-NEXT: li a0, 255 -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t +; CHECK-NEXT: vrgather.vv v8, v10, v14, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x half> %a, <8 x half> %b, 
<16 x i32> ret <16 x half> %res @@ -1315,20 +1315,20 @@ define <32 x half> @reverse_v32f16_2(<16 x half> %a) { ; CHECK-LABEL: reverse_v32f16_2: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vx v10, v10, a1 -; CHECK-NEXT: vrgather.vv v15, v8, v10 -; CHECK-NEXT: vrgather.vv v14, v9, v10 -; CHECK-NEXT: vrgather.vv v12, v8, v10 -; CHECK-NEXT: vmv.v.v v13, v12 +; CHECK-NEXT: vrsub.vx v12, v10, a1 +; CHECK-NEXT: vrgather.vv v11, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v12 +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: vrgather.vv v8, v9, v12 ; CHECK-NEXT: addi a0, a0, -32 -; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v12, a0 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: ret %res = shufflevector <16 x half> %a, <16 x half> poison, <32 x i32> ret <32 x half> %res @@ -1363,23 +1363,22 @@ define <8 x float> @reverse_v8f32_2(<4 x float> %a, <4 x float> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v9 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: vrsub.vx v9, v9, a1 ; CHECK-NEXT: vrgather.vv v13, v8, v9 -; CHECK-NEXT: vrgather.vv v12, v8, v9 +; CHECK-NEXT: vrgather.vv v12, v11, v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vrsub.vi v14, v8, 3 ; CHECK-NEXT: addi a0, a0, -8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrsub.vi v12, v11, 3 -; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v8, v10, v12, v0.t +; CHECK-NEXT: vslidedown.vx v8, v12, a0 +; CHECK-NEXT: vrgatherei16.vv v8, v10, v14, v0.t ; CHECK-NEXT: ret %res = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> ret <8 x float> %res @@ -1388,26 +1387,25 @@ define <8 x float> @reverse_v8f32_2(<4 x float> %a, <4 x float> %b) { define <16 x float> @reverse_v16f32_2(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: reverse_v16f32_2: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v12, v12, a1 -; CHECK-NEXT: vrgather.vv v19, v8, v12 -; CHECK-NEXT: vrgather.vv v18, v9, v12 -; CHECK-NEXT: vrgather.vv v16, v8, v12 -; CHECK-NEXT: vmv2r.v v12, v10 -; CHECK-NEXT: vmv.v.v v17, v16 -; CHECK-NEXT: addi a0, a0, -16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vrsub.vx v14, v10, a1 +; CHECK-NEXT: vrgather.vv v11, v8, v14 +; CHECK-NEXT: vrgather.vv v10, v9, v14 +; CHECK-NEXT: vrgather.vv v8, v9, v14 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vid.v v14 +; CHECK-NEXT: li a1, 255 +; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: 
vrsub.vi v16, v14, 7 -; CHECK-NEXT: li a0, 255 -; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vmv1r.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> @@ -1432,26 +1430,27 @@ define <4 x double> @reverse_v4f64_2(<2 x double> %a, < 2 x double> %b) { define <8 x double> @reverse_v8f64_2(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: reverse_v8f64_2: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 3 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v12, v12, a1 -; CHECK-NEXT: vrgather.vv v19, v8, v12 -; CHECK-NEXT: vrgather.vv v18, v9, v12 -; CHECK-NEXT: vrgather.vv v16, v8, v12 -; CHECK-NEXT: vmv2r.v v12, v10 -; CHECK-NEXT: vmv.v.v v17, v16 +; CHECK-NEXT: vrsub.vx v14, v10, a1 +; CHECK-NEXT: vrgather.vv v11, v8, v14 +; CHECK-NEXT: vrgather.vv v10, v9, v14 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vid.v v15 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vrgather.vv v8, v9, v14 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vrsub.vi v16, v15, 3 ; CHECK-NEXT: addi a0, a0, -8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vid.v v14 -; CHECK-NEXT: vrsub.vi v16, v14, 3 -; CHECK-NEXT: vmv.v.i v0, 15 +; CHECK-NEXT: vmv1r.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret %res = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> @@ -1464,18 +1463,19 @@ define <256 x i1> @reverse_v256i1(<256 x i1> %a) vscale_range(16, 1024) { ; CHECK-LABEL: reverse_v256i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 256 -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 ; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vx v10, v10, a2 +; CHECK-NEXT: vrsub.vx v8, v8, a2 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v13, v8, v10 -; CHECK-NEXT: vrgatherei16.vv v12, v9, v10 -; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vrgatherei16.vv v13, v10, v8 +; CHECK-NEXT: vrgatherei16.vv v12, v11, v8 ; CHECK-NEXT: addi a1, a1, -256 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v12, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll index 3cfcb4398a1f0..02355d331e13f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -482,15 +482,15 @@ define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) { ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.i 
v9, 0 ; RV32-NEXT: li a0, 48 -; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: li a1, 63 +; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vx v9, v10, a1 -; RV32-NEXT: vsrl.vv v9, v8, v9 -; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vsll.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vand.vx v9, v9, a1 +; RV32-NEXT: vsrl.vv v10, v8, v10 +; RV32-NEXT: vsll.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i16_as_i64_16: @@ -528,15 +528,15 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: li a1, 63 +; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vx v9, v10, a1 -; RV32-NEXT: vsrl.vv v9, v8, v9 -; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vsll.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vand.vx v9, v9, a1 +; RV32-NEXT: vsrl.vv v10, v8, v10 +; RV32-NEXT: vsll.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i16_as_i64_32: @@ -574,15 +574,15 @@ define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 16 -; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: li a1, 63 +; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vx v9, v10, a1 -; RV32-NEXT: vsrl.vv v9, v8, v9 -; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vsll.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vand.vx v9, v9, a1 +; RV32-NEXT: vsrl.vv v10, v8, v10 +; RV32-NEXT: vsll.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i16_as_i64_48: @@ -620,15 +620,15 @@ define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) { ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vwsubu.vx v12, v10, a0 ; RV32-NEXT: li a1, 63 +; RV32-NEXT: vwsubu.vx v12, v10, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vx v10, v12, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmv.v.x v10, a0 ; RV32-NEXT: vand.vx v12, v12, a1 -; RV32-NEXT: vsll.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: vsrl.vv v12, v8, v12 +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i32_as_i64: @@ -691,15 +691,15 @@ define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 48 -; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: li a1, 63 +; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vx v9, v10, a1 -; RV32-NEXT: vsrl.vv v9, v8, v9 -; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vsll.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vand.vx v9, v9, a1 +; RV32-NEXT: vsrl.vv v10, v8, v10 +; RV32-NEXT: vsll.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: 
shuffle_v8f16_as_i64_16: @@ -737,15 +737,15 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: li a1, 63 +; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vx v9, v10, a1 -; RV32-NEXT: vsrl.vv v9, v8, v9 -; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vsll.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vand.vx v9, v9, a1 +; RV32-NEXT: vsrl.vv v10, v8, v10 +; RV32-NEXT: vsll.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f16_as_i64_32: @@ -783,15 +783,15 @@ define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 16 -; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: li a1, 63 +; RV32-NEXT: vwsubu.vx v10, v9, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vx v9, v10, a1 -; RV32-NEXT: vsrl.vv v9, v8, v9 -; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vsll.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vand.vx v9, v9, a1 +; RV32-NEXT: vsrl.vv v10, v8, v10 +; RV32-NEXT: vsll.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f16_as_i64_48: @@ -829,15 +829,15 @@ define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vwsubu.vx v12, v10, a0 ; RV32-NEXT: li a1, 63 +; RV32-NEXT: vwsubu.vx v12, v10, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vx v10, v12, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmv.v.x v10, a0 ; RV32-NEXT: vand.vx v12, v12, a1 -; RV32-NEXT: vsll.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: vsrl.vv v12, v8, v12 +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f32_as_i64: @@ -876,15 +876,15 @@ define <8 x float> @shuffle_v8f32_as_i64_exact(<8 x float> %v) vscale_range(2,2) ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vwsubu.vx v12, v10, a0 ; RV32-NEXT: li a1, 63 +; RV32-NEXT: vwsubu.vx v12, v10, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vx v10, v12, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmv.v.x v10, a0 ; RV32-NEXT: vand.vx v12, v12, a1 -; RV32-NEXT: vsll.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: vsrl.vv v12, v8, v12 +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f32_as_i64_exact: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll index 15c2c2298c0dd..8f6240e112cdd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll @@ -386,22 +386,23 @@ define void @vnsrl_0_i8_undef3(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) +; 
CHECK-NEXT: li a0, -32 +; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: lui a0, 24640 -; CHECK-NEXT: addi a0, a0, 6 -; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vrgather.vv v10, v8, v9 +; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, ma ; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: addi a0, a0, 6 ; CHECK-NEXT: vadd.vv v9, v9, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vadd.vi v9, v9, -8 -; CHECK-NEXT: li a0, -32 -; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vrgather.vv v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, mu -; CHECK-NEXT: vrgather.vv v10, v8, v9, v0.t -; CHECK-NEXT: vse8.v v10, (a1) +; CHECK-NEXT: vrgather.vv v11, v8, v9, v0.t +; CHECK-NEXT: vse8.v v11, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -419,18 +420,18 @@ define void @vnsrl_0_i8_undef_negative(ptr %in, ptr %out) { ; CHECK-NEXT: lui a0, %hi(.LCPI17_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI17_0) ; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vrgather.vv v10, v8, v9 ; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vadd.vi v9, v9, -8 +; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: li a0, 48 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vadd.vi v9, v9, -8 +; CHECK-NEXT: vrgather.vv v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, mu -; CHECK-NEXT: vrgather.vv v10, v8, v9, v0.t -; CHECK-NEXT: vse8.v v10, (a1) +; CHECK-NEXT: vrgather.vv v11, v8, v9, v0.t +; CHECK-NEXT: vse8.v v11, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll index 5232d0d69fad0..74f2cec04f0de 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll @@ -88,17 +88,17 @@ define void @store_v6i1(ptr %p, <6 x i1> %v) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vfirst.m a1, v0 -; CHECK-NEXT: seqz a1, a1 ; CHECK-NEXT: vmv.x.s a2, v0 +; CHECK-NEXT: seqz a1, a1 ; CHECK-NEXT: andi a3, a2, 2 +; CHECK-NEXT: andi a4, a2, 4 ; CHECK-NEXT: or a1, a1, a3 -; CHECK-NEXT: andi a3, a2, 4 -; CHECK-NEXT: andi a4, a2, 8 -; CHECK-NEXT: or a3, a3, a4 -; CHECK-NEXT: or a1, a1, a3 -; CHECK-NEXT: andi a3, a2, 16 +; CHECK-NEXT: andi a3, a2, 8 +; CHECK-NEXT: or a3, a4, a3 +; CHECK-NEXT: andi a4, a2, 16 ; CHECK-NEXT: andi a2, a2, -32 -; CHECK-NEXT: or a2, a3, a2 +; CHECK-NEXT: or a1, a1, a3 +; CHECK-NEXT: or a2, a4, a2 ; CHECK-NEXT: or a1, a1, a2 ; CHECK-NEXT: andi a1, a1, 63 ; CHECK-NEXT: sb a1, 0(a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll index ed72883e9d052..ddde1e94abbde 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll @@ -74,9 +74,9 @@ define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) { ; CHECK-NO-MISALIGN: # %bb.0: ; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NO-MISALIGN-NEXT: vle8.v v8, (a0) +; CHECK-NO-MISALIGN-NEXT: 
addi a2, a0, 16 +; CHECK-NO-MISALIGN-NEXT: vle8.v v10, (a2) ; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 8 -; CHECK-NO-MISALIGN-NEXT: addi a3, a0, 16 -; CHECK-NO-MISALIGN-NEXT: vle8.v v10, (a3) ; CHECK-NO-MISALIGN-NEXT: addi a0, a0, 24 ; CHECK-NO-MISALIGN-NEXT: vle8.v v9, (a0) ; CHECK-NO-MISALIGN-NEXT: vle8.v v11, (a2) @@ -185,9 +185,9 @@ define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi a2, a0, 6 +; CHECK-NEXT: vle16.v v10, (a2) ; CHECK-NEXT: addi a2, a0, 2 -; CHECK-NEXT: addi a3, a0, 6 -; CHECK-NEXT: vle16.v v10, (a3) ; CHECK-NEXT: addi a0, a0, 8 ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v11, (a2) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll index 28202dc07f956..d506842b5eff6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll @@ -54,11 +54,11 @@ define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture reado ; CHECK-LABEL: gather_masked: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi a2, a0, 1024 -; CHECK-NEXT: lui a3, 983765 -; CHECK-NEXT: addi a3, a3, 873 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v0, a3 +; CHECK-NEXT: lui a4, 983765 ; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: addi a4, a4, 873 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v0, a4 ; CHECK-NEXT: li a4, 5 ; CHECK-NEXT: .LBB1_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -303,9 +303,9 @@ define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vlse8.v v9, (a0), a3 +; CHECK-NEXT: addi a1, a1, 32 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsse8.v v8, (a0), a3 -; CHECK-NEXT: addi a1, a1, 32 ; CHECK-NEXT: addi a0, a0, 160 ; CHECK-NEXT: bne a1, a2, .LBB6_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -348,9 +348,9 @@ define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture read ; CHECK-NEXT: vle8.v v9, (a1) ; CHECK-NEXT: vmv1r.v v10, v8 ; CHECK-NEXT: vlse8.v v10, (a0), a4, v0.t +; CHECK-NEXT: addi a1, a1, 32 ; CHECK-NEXT: vadd.vv v9, v10, v9 ; CHECK-NEXT: vsse8.v v9, (a0), a4, v0.t -; CHECK-NEXT: addi a1, a1, 32 ; CHECK-NEXT: addi a0, a0, 160 ; CHECK-NEXT: bne a1, a2, .LBB7_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -384,14 +384,14 @@ for.cond.cleanup: ; preds = %vector.body define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { ; CHECK-LABEL: gather_pow2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: li a4, 32 ; CHECK-NEXT: .LBB8_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v8, (a1), a3 +; CHECK-NEXT: vlse32.v v8, (a1), a2 ; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma @@ -400,7 +400,7 @@ define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: addi a1, a1, 128 -; CHECK-NEXT: bne a0, a2, .LBB8_1 +; 
CHECK-NEXT: bne a0, a3, .LBB8_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -432,21 +432,21 @@ for.cond.cleanup: ; preds = %vector.body define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { ; CHECK-LABEL: scatter_pow2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a1, a2 -; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: add a3, a1, a3 ; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: .LBB9_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma ; CHECK-NEXT: vlse32.v v9, (a0), a4 +; CHECK-NEXT: addi a1, a1, 32 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsse32.v v8, (a0), a4 -; CHECK-NEXT: addi a1, a1, 32 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: bne a1, a2, .LBB9_1 +; CHECK-NEXT: bne a1, a3, .LBB9_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -491,14 +491,14 @@ define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture reado ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: addi a4, a0, 32 ; CHECK-NEXT: addi a5, a1, -128 -; CHECK-NEXT: vlse32.v v8, (a5), a3 -; CHECK-NEXT: vlse32.v v9, (a1), a3 -; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vlse32.v v8, (a1), a3 +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vlse32.v v10, (a5), a3 ; CHECK-NEXT: vle32.v v11, (a4) -; CHECK-NEXT: vadd.vv v8, v10, v8 -; CHECK-NEXT: vadd.vv v9, v11, v9 -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: vse32.v v9, (a4) +; CHECK-NEXT: vadd.vv v9, v9, v10 +; CHECK-NEXT: vadd.vv v8, v11, v8 +; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: vse32.v v8, (a4) ; CHECK-NEXT: addi a0, a0, 64 ; CHECK-NEXT: addi a1, a1, 256 ; CHECK-NEXT: bne a0, a2, .LBB10_1 @@ -551,9 +551,9 @@ define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture reado ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vlse32.v v8, (a1), a3 ; CHECK-NEXT: vlse32.v v9, (a0), a4 +; CHECK-NEXT: addi a5, a1, 16 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: vsse32.v v8, (a0), a4 -; CHECK-NEXT: addi a5, a1, 16 ; CHECK-NEXT: vlse32.v v8, (a5), a3 ; CHECK-NEXT: addi a5, a0, 4 ; CHECK-NEXT: vlse32.v v9, (a5), a4 @@ -569,10 +569,10 @@ define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture reado ; CHECK-NEXT: vlse32.v v8, (a5), a3 ; CHECK-NEXT: addi a5, a0, 12 ; CHECK-NEXT: vlse32.v v9, (a5), a4 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsse32.v v8, (a5), a4 ; CHECK-NEXT: addi a2, a2, -8 ; CHECK-NEXT: addi a1, a1, 512 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsse32.v v8, (a5), a4 ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: bnez a2, .LBB11_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -638,13 +638,13 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur ; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; V-NEXT: .LBB12_1: # %bb2 ; V-NEXT: # =>This Inner Loop Header: Depth=1 -; V-NEXT: addi a4, a1, 80 ; V-NEXT: vlse64.v v8, (a1), a3 +; V-NEXT: addi a4, a1, 80 ; V-NEXT: vlse64.v v9, (a4), a3 ; V-NEXT: addi a4, a0, 16 ; V-NEXT: vse64.v v8, (a0) -; V-NEXT: vse64.v v9, (a4) ; V-NEXT: addi a0, a0, 32 +; V-NEXT: vse64.v v9, (a4) ; V-NEXT: addi a1, a1, 160 ; V-NEXT: bne a0, a2, .LBB12_1 ; V-NEXT: # %bb.2: # %bb18 @@ -653,15 +653,16 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur ; 
ZVE32F-LABEL: gather_of_pointers: ; ZVE32F: # %bb.0: # %bb ; ZVE32F-NEXT: li a2, 0 -; ZVE32F-NEXT: lui a3, 2 -; ZVE32F-NEXT: add a3, a0, a3 -; ZVE32F-NEXT: li a4, 1 +; ZVE32F-NEXT: lui a4, 2 +; ZVE32F-NEXT: li a3, 1 +; ZVE32F-NEXT: add a4, a0, a4 ; ZVE32F-NEXT: li a5, 40 ; ZVE32F-NEXT: .LBB12_1: # %bb2 ; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 -; ZVE32F-NEXT: mul a6, a4, a5 -; ZVE32F-NEXT: add a6, a1, a6 +; ZVE32F-NEXT: mul a6, a3, a5 ; ZVE32F-NEXT: mul a7, a2, a5 +; ZVE32F-NEXT: addi a2, a2, 4 +; ZVE32F-NEXT: add a6, a1, a6 ; ZVE32F-NEXT: add a7, a1, a7 ; ZVE32F-NEXT: ld t0, 0(a7) ; ZVE32F-NEXT: ld t1, 0(a6) @@ -671,10 +672,9 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur ; ZVE32F-NEXT: sd t1, 8(a0) ; ZVE32F-NEXT: sd a7, 16(a0) ; ZVE32F-NEXT: sd a6, 24(a0) -; ZVE32F-NEXT: addi a2, a2, 4 ; ZVE32F-NEXT: addi a0, a0, 32 -; ZVE32F-NEXT: addi a4, a4, 4 -; ZVE32F-NEXT: bne a0, a3, .LBB12_1 +; ZVE32F-NEXT: addi a3, a3, 4 +; ZVE32F-NEXT: bne a0, a4, .LBB12_1 ; ZVE32F-NEXT: # %bb.2: # %bb18 ; ZVE32F-NEXT: ret bb: @@ -719,9 +719,9 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu ; V-NEXT: vle64.v v8, (a1) ; V-NEXT: vle64.v v9, (a4) ; V-NEXT: addi a4, a0, 80 +; V-NEXT: addi a1, a1, 32 ; V-NEXT: vsse64.v v8, (a0), a3 ; V-NEXT: vsse64.v v9, (a4), a3 -; V-NEXT: addi a1, a1, 32 ; V-NEXT: addi a0, a0, 160 ; V-NEXT: bne a1, a2, .LBB13_1 ; V-NEXT: # %bb.2: # %bb18 @@ -730,9 +730,9 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu ; ZVE32F-LABEL: scatter_of_pointers: ; ZVE32F: # %bb.0: # %bb ; ZVE32F-NEXT: li a2, 0 -; ZVE32F-NEXT: lui a3, 2 -; ZVE32F-NEXT: add a3, a1, a3 -; ZVE32F-NEXT: li a4, 1 +; ZVE32F-NEXT: lui a4, 2 +; ZVE32F-NEXT: li a3, 1 +; ZVE32F-NEXT: add a4, a1, a4 ; ZVE32F-NEXT: li a5, 40 ; ZVE32F-NEXT: .LBB13_1: # %bb2 ; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 @@ -740,18 +740,18 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu ; ZVE32F-NEXT: ld a7, 8(a1) ; ZVE32F-NEXT: ld t0, 16(a1) ; ZVE32F-NEXT: ld t1, 24(a1) -; ZVE32F-NEXT: mul t2, a4, a5 -; ZVE32F-NEXT: add t2, a0, t2 +; ZVE32F-NEXT: mul t2, a3, a5 ; ZVE32F-NEXT: mul t3, a2, a5 +; ZVE32F-NEXT: addi a2, a2, 4 +; ZVE32F-NEXT: addi a1, a1, 32 +; ZVE32F-NEXT: add t2, a0, t2 ; ZVE32F-NEXT: add t3, a0, t3 ; ZVE32F-NEXT: sd a6, 0(t3) ; ZVE32F-NEXT: sd a7, 0(t2) ; ZVE32F-NEXT: sd t0, 80(t3) ; ZVE32F-NEXT: sd t1, 80(t2) -; ZVE32F-NEXT: addi a2, a2, 4 -; ZVE32F-NEXT: addi a1, a1, 32 -; ZVE32F-NEXT: addi a4, a4, 4 -; ZVE32F-NEXT: bne a1, a3, .LBB13_1 +; ZVE32F-NEXT: addi a3, a3, 4 +; ZVE32F-NEXT: bne a1, a4, .LBB13_1 ; ZVE32F-NEXT: # %bb.2: # %bb18 ; ZVE32F-NEXT: ret bb: @@ -794,36 +794,36 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt ; CHECK-NEXT: mv a4, a2 ; CHECK-NEXT: bltu a5, a6, .LBB14_5 ; CHECK-NEXT: # %bb.2: # %bb9 -; CHECK-NEXT: slli a5, a5, 32 -; CHECK-NEXT: srli a5, a5, 32 -; CHECK-NEXT: addi a5, a5, 1 -; CHECK-NEXT: andi a6, a5, -32 -; CHECK-NEXT: add a4, a6, a2 +; CHECK-NEXT: slli a4, a5, 32 ; CHECK-NEXT: slli t0, a2, 2 -; CHECK-NEXT: add a7, a0, a2 -; CHECK-NEXT: add a2, a1, a2 -; CHECK-NEXT: add a2, a2, t0 -; CHECK-NEXT: add t0, a4, a0 +; CHECK-NEXT: add a5, a0, a2 +; CHECK-NEXT: add a6, a1, a2 ; CHECK-NEXT: li t2, 32 +; CHECK-NEXT: srli a4, a4, 32 +; CHECK-NEXT: add t0, a6, t0 +; CHECK-NEXT: addi a6, a4, 1 +; CHECK-NEXT: andi a7, a6, -32 +; CHECK-NEXT: add a4, a7, a2 +; CHECK-NEXT: add a2, a4, a0 ; CHECK-NEXT: li t1, 5 ; CHECK-NEXT: vsetvli 
zero, t2, e8, m1, ta, ma ; CHECK-NEXT: .LBB14_3: # %bb15 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vlse8.v v8, (a2), t1 -; CHECK-NEXT: vle8.v v9, (a7) +; CHECK-NEXT: vlse8.v v8, (t0), t1 +; CHECK-NEXT: vle8.v v9, (a5) ; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vse8.v v8, (a7) -; CHECK-NEXT: addi a7, a7, 32 -; CHECK-NEXT: addi a2, a2, 160 -; CHECK-NEXT: bne a7, t0, .LBB14_3 +; CHECK-NEXT: vse8.v v8, (a5) +; CHECK-NEXT: addi a5, a5, 32 +; CHECK-NEXT: addi t0, t0, 160 +; CHECK-NEXT: bne a5, a2, .LBB14_3 ; CHECK-NEXT: # %bb.4: # %bb30 -; CHECK-NEXT: beq a5, a6, .LBB14_7 +; CHECK-NEXT: beq a6, a7, .LBB14_7 ; CHECK-NEXT: .LBB14_5: # %bb32 ; CHECK-NEXT: add a2, a0, a4 ; CHECK-NEXT: slli a5, a4, 2 ; CHECK-NEXT: add a1, a1, a4 -; CHECK-NEXT: add a1, a1, a5 ; CHECK-NEXT: subw a3, a3, a4 +; CHECK-NEXT: add a1, a1, a5 ; CHECK-NEXT: slli a3, a3, 32 ; CHECK-NEXT: srli a3, a3, 32 ; CHECK-NEXT: add a0, a4, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll index 47efa058df641..1c2c90478a1f7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll @@ -542,20 +542,20 @@ declare <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0.i32(ptr, i32, < define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x i1> %m, i32 zeroext %evl) nounwind { ; CHECK-LABEL: strided_vpload_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: mv a3, a2 ; CHECK-NEXT: bltu a2, a4, .LBB45_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: .LBB45_2: ; CHECK-NEXT: mul a4, a3, a1 -; CHECK-NEXT: add a4, a0, a4 ; CHECK-NEXT: addi a5, a2, -16 -; CHECK-NEXT: sltu a2, a2, a5 -; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v9, 2 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: sltu a2, a2, a5 +; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a5 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma @@ -578,8 +578,8 @@ define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext % ; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: .LBB46_2: ; CHECK-NEXT: mul a4, a3, a1 -; CHECK-NEXT: add a4, a0, a4 ; CHECK-NEXT: addi a5, a2, -16 +; CHECK-NEXT: add a4, a0, a4 ; CHECK-NEXT: sltu a2, a2, a5 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a5 @@ -598,8 +598,8 @@ declare <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr, i32, define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 zeroext %evl) { ; CHECK-RV32-LABEL: strided_load_v33f64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a5, 32 ; CHECK-RV32-NEXT: vmv1r.v v8, v0 +; CHECK-RV32-NEXT: li a5, 32 ; CHECK-RV32-NEXT: mv a3, a4 ; CHECK-RV32-NEXT: bltu a4, a5, .LBB47_2 ; CHECK-RV32-NEXT: # %bb.1: @@ -637,19 +637,19 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV32-NEXT: vmv1r.v v0, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-RV32-NEXT: vlse64.v v8, (a1), a2, v0.t +; CHECK-RV32-NEXT: addi a1, a0, 128 +; CHECK-RV32-NEXT: addi a2, a0, 256 ; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-RV32-NEXT: vse64.v v8, (a0) -; CHECK-RV32-NEXT: addi a1, a0, 128 ; CHECK-RV32-NEXT: vse64.v v24, (a1) -; CHECK-RV32-NEXT: addi a0, a0, 256 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; 
CHECK-RV32-NEXT: vse64.v v16, (a0) +; CHECK-RV32-NEXT: vse64.v v16, (a2) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: strided_load_v33f64: ; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: li a5, 32 ; CHECK-RV64-NEXT: vmv1r.v v8, v0 +; CHECK-RV64-NEXT: li a5, 32 ; CHECK-RV64-NEXT: mv a4, a3 ; CHECK-RV64-NEXT: bltu a3, a5, .LBB47_2 ; CHECK-RV64-NEXT: # %bb.1: @@ -687,13 +687,13 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV64-NEXT: vmv1r.v v0, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-RV64-NEXT: vlse64.v v8, (a1), a2, v0.t +; CHECK-RV64-NEXT: addi a1, a0, 128 +; CHECK-RV64-NEXT: addi a2, a0, 256 ; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-RV64-NEXT: vse64.v v8, (a0) -; CHECK-RV64-NEXT: addi a1, a0, 128 ; CHECK-RV64-NEXT: vse64.v v24, (a1) -; CHECK-RV64-NEXT: addi a0, a0, 256 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV64-NEXT: vse64.v v16, (a0) +; CHECK-RV64-NEXT: vse64.v v16, (a2) ; CHECK-RV64-NEXT: ret %v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 %evl) ret <33 x double> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll index ad55f276a74c8..12893ec55cda7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll @@ -227,35 +227,36 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 6 +; CHECK-NEXT: li a3, 72 +; CHECK-NEXT: mul a2, a2, a3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 72 * vlenb ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 5 +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a2, a2, a3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a3, 40 -; CHECK-NEXT: mul a2, a2, a3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v25, v0, 8 +; CHECK-NEXT: vslidedown.vi v6, v0, 8 ; CHECK-NEXT: addi a2, a1, 512 -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v27, v25, 4 ; CHECK-NEXT: addi a3, a1, 640 +; CHECK-NEXT: addi a4, a7, -64 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v27, v6, 4 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a3) +; CHECK-NEXT: sltu a3, a7, a4 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v27, 2 -; CHECK-NEXT: addi a3, a7, -64 -; CHECK-NEXT: sltu a4, a7, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a4, a4, a3 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a4, a3, a4 ; CHECK-NEXT: addi a3, a4, -32 ; CHECK-NEXT: sltu a5, a4, a3 ; CHECK-NEXT: addi a5, a5, -1 @@ -267,8 +268,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: vsetvli zero, a5, e32, 
m4, ta, ma ; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: li a6, 24 -; CHECK-NEXT: mul a5, a5, a6 +; CHECK-NEXT: slli a5, a5, 4 ; CHECK-NEXT: add a5, sp, a5 ; CHECK-NEXT: addi a5, a5, 16 ; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill @@ -282,23 +282,22 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: .LBB16_2: +; CHECK-NEXT: vmv1r.v v0, v27 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a5) ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: li a6, 48 +; CHECK-NEXT: li a6, 56 ; CHECK-NEXT: mul a5, a5, a6 ; CHECK-NEXT: add a5, sp, a5 ; CHECK-NEXT: addi a5, a5, 16 ; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v28, v26, 2 +; CHECK-NEXT: vslidedown.vi v27, v26, 2 ; CHECK-NEXT: li a5, 64 -; CHECK-NEXT: vmv1r.v v0, v27 ; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t ; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: li a6, 56 -; CHECK-NEXT: mul a3, a3, a6 +; CHECK-NEXT: slli a3, a3, 6 ; CHECK-NEXT: add a3, sp, a3 ; CHECK-NEXT: addi a3, a3, 16 ; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill @@ -307,12 +306,14 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a6, 64 ; CHECK-NEXT: .LBB16_4: +; CHECK-NEXT: vmv1r.v v0, v27 ; CHECK-NEXT: addi a5, a1, 384 ; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: csrr t0, vlenb -; CHECK-NEXT: slli t0, t0, 3 +; CHECK-NEXT: li t1, 48 +; CHECK-NEXT: mul t0, t0, t1 ; CHECK-NEXT: add t0, sp, t0 ; CHECK-NEXT: addi t0, t0, 16 ; CHECK-NEXT: vs8r.v v8, (t0) # Unknown-size Folded Spill @@ -324,9 +325,8 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: sltu t1, a6, t0 ; CHECK-NEXT: addi t1, t1, -1 ; CHECK-NEXT: and t0, t1, t0 -; CHECK-NEXT: vmv1r.v v0, v28 ; CHECK-NEXT: csrr t1, vlenb -; CHECK-NEXT: li t2, 48 +; CHECK-NEXT: li t2, 56 ; CHECK-NEXT: mul t1, t1, t2 ; CHECK-NEXT: add t1, sp, t1 ; CHECK-NEXT: addi t1, t1, 16 @@ -334,7 +334,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: vsetvli zero, t0, e32, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t ; CHECK-NEXT: csrr t0, vlenb -; CHECK-NEXT: slli t0, t0, 4 +; CHECK-NEXT: slli t0, t0, 3 ; CHECK-NEXT: add t0, sp, t0 ; CHECK-NEXT: addi t0, t0, 16 ; CHECK-NEXT: vs8r.v v8, (t0) # Unknown-size Folded Spill @@ -342,123 +342,145 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: # %bb.5: ; CHECK-NEXT: li a6, 16 ; CHECK-NEXT: .LBB16_6: +; CHECK-NEXT: vmv1r.v v0, v26 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a5) ; CHECK-NEXT: addi a5, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a1, 256 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v27, v25, 2 -; CHECK-NEXT: vmv1r.v v0, v26 +; CHECK-NEXT: vslidedown.vi v26, v6, 2 ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a5, a5, 3 +; CHECK-NEXT: li t0, 48 +; CHECK-NEXT: mul a5, a5, t0 ; CHECK-NEXT: add a5, sp, a5 ; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a6, e32, m4, 
ta, ma -; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t +; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: li a6, 48 +; CHECK-NEXT: li a6, 56 ; CHECK-NEXT: mul a5, a5, a6 ; CHECK-NEXT: add a5, sp, a5 ; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill ; CHECK-NEXT: mv a5, a4 ; CHECK-NEXT: bltu a4, a3, .LBB16_8 ; CHECK-NEXT: # %bb.7: ; CHECK-NEXT: li a5, 32 ; CHECK-NEXT: .LBB16_8: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, a5, -16 ; CHECK-NEXT: sltu a5, a5, a1 ; CHECK-NEXT: addi a5, a5, -1 ; CHECK-NEXT: and a1, a5, a1 -; CHECK-NEXT: vmv1r.v v0, v27 +; CHECK-NEXT: vmv1r.v v0, v26 ; CHECK-NEXT: addi a5, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t +; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a5, 40 +; CHECK-NEXT: mul a1, a1, a5 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a4, a2, .LBB16_10 ; CHECK-NEXT: # %bb.9: ; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: .LBB16_10: +; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v6, v7, 2 -; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vslidedown.vi v25, v7, 2 +; CHECK-NEXT: vsetvli zero, a4, e32, m4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v16, 0, v0.t ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: li a4, 48 +; CHECK-NEXT: mul a1, a1, a4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a4, e32, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t -; CHECK-NEXT: vmv.v.v v0, v8 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: mv a1, a7 ; CHECK-NEXT: bltu a7, a3, .LBB16_12 ; CHECK-NEXT: # %bb.11: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: .LBB16_12: +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: li a5, 24 -; CHECK-NEXT: mul a4, a4, a5 +; CHECK-NEXT: slli a4, a4, 4 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v24, v8 +; CHECK-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; CHECK-NEXT: vmv4r.v v24, v16 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: li a5, 56 -; CHECK-NEXT: mul a4, a4, a5 +; CHECK-NEXT: slli a4, a4, 3 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vslideup.vi v8, v24, 16 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: li a5, 56 +; CHECK-NEXT: li a5, 40 ; CHECK-NEXT: mul a4, a4, a5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 4 -; CHECK-NEXT: add a4, sp, a4 -; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v24, v8 +; 
CHECK-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: li a5, 48 +; CHECK-NEXT: li a5, 40 ; CHECK-NEXT: mul a4, a4, a5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v24, 16 +; CHECK-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: li a5, 48 -; CHECK-NEXT: mul a4, a4, a5 +; CHECK-NEXT: slli a4, a4, 6 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; CHECK-NEXT: vmv4r.v v8, v0 -; CHECK-NEXT: vslideup.vi v8, v16, 16 +; CHECK-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vslideup.vi v16, v24, 16 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: li a5, 24 -; CHECK-NEXT: mul a4, a4, a5 +; CHECK-NEXT: slli a4, a4, 6 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; CHECK-NEXT: addi a4, a1, -16 +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: li a6, 56 +; CHECK-NEXT: mul a5, a5, a6 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload +; CHECK-NEXT: vslideup.vi v16, v8, 16 +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: li a6, 56 +; CHECK-NEXT: mul a5, a5, a6 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: li a6, 48 +; CHECK-NEXT: mul a5, a5, a6 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: li a6, 40 +; CHECK-NEXT: mul a5, a5, a6 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload +; CHECK-NEXT: vslideup.vi v8, v16, 16 +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: li a6, 48 +; CHECK-NEXT: mul a5, a5, a6 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill ; CHECK-NEXT: sltu a1, a1, a4 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a1, a1, a4 -; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 5 +; CHECK-NEXT: li a5, 24 +; CHECK-NEXT: mul a4, a4, a5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload @@ -470,19 +492,18 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: .LBB16_14: ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 40 -; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a7, e32, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v16, v24, 0, v0.t +; CHECK-NEXT: vnsrl.wi v24, v16, 0, v0.t ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vslideup.vi v16, v8, 16 -; CHECK-NEXT: vse32.v v16, (a0) +; CHECK-NEXT: vslideup.vi v24, v8, 16 +; CHECK-NEXT: vse32.v v24, (a0) ; CHECK-NEXT: addi a1, a0, 256 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: li a3, 48 ; CHECK-NEXT: mul a2, a2, a3 ; CHECK-NEXT: add 
a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 @@ -490,7 +511,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: vse32.v v8, (a1) ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a3, 48 +; CHECK-NEXT: li a3, 56 ; CHECK-NEXT: mul a2, a2, a3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 @@ -498,14 +519,14 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: vse32.v v8, (a1) ; CHECK-NEXT: addi a0, a0, 384 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 56 -; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: slli a1, a1, 6 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 6 +; CHECK-NEXT: li a1, 72 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll index 293b75dc207c8..db03dc3d5ab1e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll @@ -313,10 +313,12 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m) ; RV32-SLOW-NEXT: .LBB6_8: # %cond.store5 ; RV32-SLOW-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV32-SLOW-NEXT: vslidedown.vi v8, v8, 3 +; RV32-SLOW-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV32-SLOW-NEXT: vslidedown.vi v9, v9, 3 +; RV32-SLOW-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV32-SLOW-NEXT: vmv.x.s a0, v8 ; RV32-SLOW-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32-SLOW-NEXT: vslidedown.vi v8, v9, 3 -; RV32-SLOW-NEXT: vmv.x.s a1, v8 +; RV32-SLOW-NEXT: vmv.x.s a1, v9 ; RV32-SLOW-NEXT: srli a2, a0, 8 ; RV32-SLOW-NEXT: sb a0, 0(a1) ; RV32-SLOW-NEXT: sb a2, 1(a1) @@ -376,10 +378,12 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m) ; RV64-SLOW-NEXT: .LBB6_8: # %cond.store5 ; RV64-SLOW-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64-SLOW-NEXT: vslidedown.vi v8, v8, 3 +; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-SLOW-NEXT: vslidedown.vi v10, v10, 3 +; RV64-SLOW-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64-SLOW-NEXT: vmv.x.s a0, v8 ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-SLOW-NEXT: vslidedown.vi v8, v10, 3 -; RV64-SLOW-NEXT: vmv.x.s a1, v8 +; RV64-SLOW-NEXT: vmv.x.s a1, v10 ; RV64-SLOW-NEXT: srli a2, a0, 8 ; RV64-SLOW-NEXT: sb a0, 0(a1) ; RV64-SLOW-NEXT: sb a2, 1(a1) @@ -426,9 +430,9 @@ define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x ptr> %ptrs, <2 x i1> %m) ; RV32-SLOW-NEXT: .LBB7_4: # %cond.store1 ; RV32-SLOW-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV32-SLOW-NEXT: vslidedown.vi v8, v8, 1 +; RV32-SLOW-NEXT: vslidedown.vi v9, v9, 1 ; RV32-SLOW-NEXT: vmv.x.s a0, v8 -; RV32-SLOW-NEXT: vslidedown.vi v8, v9, 1 -; RV32-SLOW-NEXT: vmv.x.s a1, v8 +; RV32-SLOW-NEXT: vmv.x.s a1, v9 ; RV32-SLOW-NEXT: srli a2, a0, 16 ; RV32-SLOW-NEXT: sh a0, 0(a1) ; RV32-SLOW-NEXT: sh a2, 2(a1) @@ -458,10 +462,12 @@ define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x ptr> %ptrs, <2 x i1> %m) ; RV64-SLOW-NEXT: .LBB7_4: # %cond.store1 ; RV64-SLOW-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV64-SLOW-NEXT: vslidedown.vi v8, v8, 1 +; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-SLOW-NEXT: vslidedown.vi v9, v9, 1 +; RV64-SLOW-NEXT: 
vsetvli zero, zero, e32, mf2, ta, ma ; RV64-SLOW-NEXT: vmv.x.s a0, v8 ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV64-SLOW-NEXT: vslidedown.vi v8, v9, 1 -; RV64-SLOW-NEXT: vmv.x.s a1, v8 +; RV64-SLOW-NEXT: vmv.x.s a1, v9 ; RV64-SLOW-NEXT: srli a2, a0, 16 ; RV64-SLOW-NEXT: sh a0, 0(a1) ; RV64-SLOW-NEXT: sh a2, 2(a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll index 4477ce73a9ffc..fa82065f3b413 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll @@ -301,38 +301,38 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v16, (a0) ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 ; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: bltu a2, a1, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfsgnj.vv v8, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v8, v16, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfsgnj.vv v16, v16, v24, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll index 4e9862b05f408..dfd509062ccf7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll @@ -57,8 +57,8 @@ define <1 x i1> @fcmp_ogt_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -73,8 +73,8 @@ define <1 x i1> @fcmp_ogt_fv_v1f16(<1 x 
half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -102,8 +102,8 @@ define <1 x i1> @fcmp_oge_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -118,8 +118,8 @@ define <1 x i1> @fcmp_oge_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -147,8 +147,8 @@ define <1 x i1> @fcmp_olt_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -163,8 +163,8 @@ define <1 x i1> @fcmp_olt_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -192,8 +192,8 @@ define <1 x i1> @fcmp_ole_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -208,8 +208,8 @@ define <1 x i1> @fcmp_ole_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -240,8 +240,8 @@ define <1 x i1> @fcmp_one_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -259,8 +259,8 @@ define <1 x i1> @fcmp_one_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t 
@@ -336,8 +336,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -355,8 +355,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -388,8 +388,8 @@ define <1 x i1> @fcmp_ugt_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -405,8 +405,8 @@ define <1 x i1> @fcmp_ugt_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -436,8 +436,8 @@ define <1 x i1> @fcmp_uge_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -453,8 +453,8 @@ define <1 x i1> @fcmp_uge_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -484,8 +484,8 @@ define <1 x i1> @fcmp_ult_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -501,8 +501,8 @@ define <1 x i1> @fcmp_ult_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -532,8 +532,8 @@ define <1 x i1> @fcmp_ule_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; 
CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -549,8 +549,8 @@ define <1 x i1> @fcmp_ule_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -690,8 +690,8 @@ define <2 x i1> @fcmp_ogt_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -706,8 +706,8 @@ define <2 x i1> @fcmp_ogt_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -735,8 +735,8 @@ define <2 x i1> @fcmp_oge_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -751,8 +751,8 @@ define <2 x i1> @fcmp_oge_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -780,8 +780,8 @@ define <2 x i1> @fcmp_olt_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -796,8 +796,8 @@ define <2 x i1> @fcmp_olt_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -825,8 +825,8 @@ define <2 x i1> @fcmp_ole_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -841,8 +841,8 @@ define <2 x i1> @fcmp_ole_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: 
vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -873,8 +873,8 @@ define <2 x i1> @fcmp_one_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -892,8 +892,8 @@ define <2 x i1> @fcmp_one_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -969,8 +969,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -988,8 +988,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -1021,8 +1021,8 @@ define <2 x i1> @fcmp_ugt_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1038,8 +1038,8 @@ define <2 x i1> @fcmp_ugt_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1069,8 +1069,8 @@ define <2 x i1> @fcmp_uge_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1086,8 +1086,8 @@ define <2 x i1> @fcmp_uge_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1117,8 +1117,8 @@ define <2 x i1> @fcmp_ult_vf_v2f16(<2 x half> %va, half 
%b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1134,8 +1134,8 @@ define <2 x i1> @fcmp_ult_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1165,8 +1165,8 @@ define <2 x i1> @fcmp_ule_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1182,8 +1182,8 @@ define <2 x i1> @fcmp_ule_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1323,8 +1323,8 @@ define <4 x i1> @fcmp_ogt_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1339,8 +1339,8 @@ define <4 x i1> @fcmp_ogt_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1368,8 +1368,8 @@ define <4 x i1> @fcmp_oge_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1384,8 +1384,8 @@ define <4 x i1> @fcmp_oge_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1413,8 +1413,8 @@ define <4 x i1> @fcmp_olt_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, 
v0.t ; CHECK-NEXT: ret @@ -1429,8 +1429,8 @@ define <4 x i1> @fcmp_olt_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1458,8 +1458,8 @@ define <4 x i1> @fcmp_ole_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1474,8 +1474,8 @@ define <4 x i1> @fcmp_ole_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1506,8 +1506,8 @@ define <4 x i1> @fcmp_one_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -1525,8 +1525,8 @@ define <4 x i1> @fcmp_one_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -1602,8 +1602,8 @@ define <4 x i1> @fcmp_ueq_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -1621,8 +1621,8 @@ define <4 x i1> @fcmp_ueq_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -1654,8 +1654,8 @@ define <4 x i1> @fcmp_ugt_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1671,8 +1671,8 @@ define <4 x i1> @fcmp_ugt_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, 
v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1702,8 +1702,8 @@ define <4 x i1> @fcmp_uge_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1719,8 +1719,8 @@ define <4 x i1> @fcmp_uge_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1750,8 +1750,8 @@ define <4 x i1> @fcmp_ult_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1767,8 +1767,8 @@ define <4 x i1> @fcmp_ult_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1798,8 +1798,8 @@ define <4 x i1> @fcmp_ule_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1815,8 +1815,8 @@ define <4 x i1> @fcmp_ule_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1956,8 +1956,8 @@ define <8 x i1> @fcmp_ogt_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1972,8 +1972,8 @@ define <8 x i1> @fcmp_ogt_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -2001,8 +2001,8 @@ define <8 x i1> @fcmp_oge_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: 
vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -2017,8 +2017,8 @@ define <8 x i1> @fcmp_oge_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -2046,8 +2046,8 @@ define <8 x i1> @fcmp_olt_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -2062,8 +2062,8 @@ define <8 x i1> @fcmp_olt_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -2091,8 +2091,8 @@ define <8 x i1> @fcmp_ole_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -2107,8 +2107,8 @@ define <8 x i1> @fcmp_ole_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -2139,8 +2139,8 @@ define <8 x i1> @fcmp_one_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -2158,8 +2158,8 @@ define <8 x i1> @fcmp_one_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -2235,8 +2235,8 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -2254,8 +2254,8 @@ define <8 x i1> 
@fcmp_ueq_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -2287,8 +2287,8 @@ define <8 x i1> @fcmp_ugt_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -2304,8 +2304,8 @@ define <8 x i1> @fcmp_ugt_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -2335,8 +2335,8 @@ define <8 x i1> @fcmp_uge_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -2352,8 +2352,8 @@ define <8 x i1> @fcmp_uge_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -2383,8 +2383,8 @@ define <8 x i1> @fcmp_ult_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -2400,8 +2400,8 @@ define <8 x i1> @fcmp_ult_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -2431,8 +2431,8 @@ define <8 x i1> @fcmp_ule_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -2448,8 +2448,8 @@ define <8 x i1> @fcmp_ule_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, 
fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -2797,9 +2797,9 @@ define <16 x i1> @fcmp_one_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -2817,9 +2817,9 @@ define <16 x i1> @fcmp_one_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t @@ -2896,9 +2896,9 @@ define <16 x i1> @fcmp_ueq_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -2916,9 +2916,9 @@ define <16 x i1> @fcmp_ueq_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t @@ -3981,8 +3981,8 @@ define <1 x i1> @fcmp_ogt_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -3997,8 +3997,8 @@ define <1 x i1> @fcmp_ogt_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4026,8 +4026,8 @@ define <1 x i1> @fcmp_oge_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4042,8 +4042,8 @@ define <1 x i1> @fcmp_oge_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; 
CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4071,8 +4071,8 @@ define <1 x i1> @fcmp_olt_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4087,8 +4087,8 @@ define <1 x i1> @fcmp_olt_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4116,8 +4116,8 @@ define <1 x i1> @fcmp_ole_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4132,8 +4132,8 @@ define <1 x i1> @fcmp_ole_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4164,8 +4164,8 @@ define <1 x i1> @fcmp_one_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -4183,8 +4183,8 @@ define <1 x i1> @fcmp_one_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -4260,8 +4260,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -4279,8 +4279,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -4312,8 +4312,8 @@ 
define <1 x i1> @fcmp_ugt_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4329,8 +4329,8 @@ define <1 x i1> @fcmp_ugt_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4360,8 +4360,8 @@ define <1 x i1> @fcmp_uge_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4377,8 +4377,8 @@ define <1 x i1> @fcmp_uge_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4408,8 +4408,8 @@ define <1 x i1> @fcmp_ult_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4425,8 +4425,8 @@ define <1 x i1> @fcmp_ult_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4456,8 +4456,8 @@ define <1 x i1> @fcmp_ule_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4473,8 +4473,8 @@ define <1 x i1> @fcmp_ule_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4614,8 +4614,8 @@ define <2 x i1> @fcmp_ogt_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; 
CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4630,8 +4630,8 @@ define <2 x i1> @fcmp_ogt_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4659,8 +4659,8 @@ define <2 x i1> @fcmp_oge_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4675,8 +4675,8 @@ define <2 x i1> @fcmp_oge_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4704,8 +4704,8 @@ define <2 x i1> @fcmp_olt_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4720,8 +4720,8 @@ define <2 x i1> @fcmp_olt_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4749,8 +4749,8 @@ define <2 x i1> @fcmp_ole_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4765,8 +4765,8 @@ define <2 x i1> @fcmp_ole_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4797,8 +4797,8 @@ define <2 x i1> @fcmp_one_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -4816,8 +4816,8 @@ define <2 x i1> @fcmp_one_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; 
CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -4893,8 +4893,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -4912,8 +4912,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -4945,8 +4945,8 @@ define <2 x i1> @fcmp_ugt_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4962,8 +4962,8 @@ define <2 x i1> @fcmp_ugt_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4993,8 +4993,8 @@ define <2 x i1> @fcmp_uge_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5010,8 +5010,8 @@ define <2 x i1> @fcmp_uge_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5041,8 +5041,8 @@ define <2 x i1> @fcmp_ult_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5058,8 +5058,8 @@ define <2 x i1> @fcmp_ult_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5089,8 +5089,8 @@ define <2 x i1> 
@fcmp_ule_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5106,8 +5106,8 @@ define <2 x i1> @fcmp_ule_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5247,8 +5247,8 @@ define <4 x i1> @fcmp_ogt_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -5263,8 +5263,8 @@ define <4 x i1> @fcmp_ogt_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -5292,8 +5292,8 @@ define <4 x i1> @fcmp_oge_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -5308,8 +5308,8 @@ define <4 x i1> @fcmp_oge_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -5337,8 +5337,8 @@ define <4 x i1> @fcmp_olt_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -5353,8 +5353,8 @@ define <4 x i1> @fcmp_olt_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -5382,8 +5382,8 @@ define <4 x i1> @fcmp_ole_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, 
v8, fa0, v0.t ; CHECK-NEXT: ret @@ -5398,8 +5398,8 @@ define <4 x i1> @fcmp_ole_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -5430,8 +5430,8 @@ define <4 x i1> @fcmp_one_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -5449,8 +5449,8 @@ define <4 x i1> @fcmp_one_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -5526,8 +5526,8 @@ define <4 x i1> @fcmp_ueq_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -5545,8 +5545,8 @@ define <4 x i1> @fcmp_ueq_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -5578,8 +5578,8 @@ define <4 x i1> @fcmp_ugt_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5595,8 +5595,8 @@ define <4 x i1> @fcmp_ugt_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5626,8 +5626,8 @@ define <4 x i1> @fcmp_uge_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5643,8 +5643,8 @@ define <4 x i1> @fcmp_uge_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; 
CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5674,8 +5674,8 @@ define <4 x i1> @fcmp_ult_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5691,8 +5691,8 @@ define <4 x i1> @fcmp_ult_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5722,8 +5722,8 @@ define <4 x i1> @fcmp_ule_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5739,8 +5739,8 @@ define <4 x i1> @fcmp_ule_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -6088,9 +6088,9 @@ define <8 x i1> @fcmp_one_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -6108,9 +6108,9 @@ define <8 x i1> @fcmp_one_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t @@ -6187,9 +6187,9 @@ define <8 x i1> @fcmp_ueq_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -6207,9 +6207,9 @@ define <8 x i1> @fcmp_ueq_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, 
e32, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t @@ -7230,8 +7230,8 @@ define <1 x i1> @fcmp_ogt_vf_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7246,8 +7246,8 @@ define <1 x i1> @fcmp_ogt_fv_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7275,8 +7275,8 @@ define <1 x i1> @fcmp_oge_vf_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7291,8 +7291,8 @@ define <1 x i1> @fcmp_oge_fv_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7320,8 +7320,8 @@ define <1 x i1> @fcmp_olt_vf_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7336,8 +7336,8 @@ define <1 x i1> @fcmp_olt_fv_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7365,8 +7365,8 @@ define <1 x i1> @fcmp_ole_vf_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7381,8 +7381,8 @@ define <1 x i1> @fcmp_ole_fv_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, 
fa0, v0.t ; CHECK-NEXT: ret @@ -7413,8 +7413,8 @@ define <1 x i1> @fcmp_one_vf_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -7432,8 +7432,8 @@ define <1 x i1> @fcmp_one_fv_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -7509,8 +7509,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -7528,8 +7528,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -7561,8 +7561,8 @@ define <1 x i1> @fcmp_ugt_vf_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7578,8 +7578,8 @@ define <1 x i1> @fcmp_ugt_fv_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7609,8 +7609,8 @@ define <1 x i1> @fcmp_uge_vf_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7626,8 +7626,8 @@ define <1 x i1> @fcmp_uge_fv_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7657,8 +7657,8 @@ define <1 x i1> @fcmp_ult_vf_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, 
v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7674,8 +7674,8 @@ define <1 x i1> @fcmp_ult_fv_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7705,8 +7705,8 @@ define <1 x i1> @fcmp_ule_vf_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7722,8 +7722,8 @@ define <1 x i1> @fcmp_ule_fv_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7863,8 +7863,8 @@ define <2 x i1> @fcmp_ogt_vf_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7879,8 +7879,8 @@ define <2 x i1> @fcmp_ogt_fv_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7908,8 +7908,8 @@ define <2 x i1> @fcmp_oge_vf_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7924,8 +7924,8 @@ define <2 x i1> @fcmp_oge_fv_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7953,8 +7953,8 @@ define <2 x i1> @fcmp_olt_vf_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7969,8 +7969,8 @@ define <2 x i1> @fcmp_olt_fv_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; 
CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7998,8 +7998,8 @@ define <2 x i1> @fcmp_ole_vf_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -8014,8 +8014,8 @@ define <2 x i1> @fcmp_ole_fv_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -8046,8 +8046,8 @@ define <2 x i1> @fcmp_one_vf_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -8065,8 +8065,8 @@ define <2 x i1> @fcmp_one_fv_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -8142,8 +8142,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -8161,8 +8161,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -8194,8 +8194,8 @@ define <2 x i1> @fcmp_ugt_vf_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -8211,8 +8211,8 @@ define <2 x i1> @fcmp_ugt_fv_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ 
-8242,8 +8242,8 @@ define <2 x i1> @fcmp_uge_vf_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -8259,8 +8259,8 @@ define <2 x i1> @fcmp_uge_fv_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -8290,8 +8290,8 @@ define <2 x i1> @fcmp_ult_vf_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -8307,8 +8307,8 @@ define <2 x i1> @fcmp_ult_fv_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -8338,8 +8338,8 @@ define <2 x i1> @fcmp_ule_vf_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -8355,8 +8355,8 @@ define <2 x i1> @fcmp_ule_fv_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -8704,9 +8704,9 @@ define <4 x i1> @fcmp_one_vf_v4f64(<4 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -8724,9 +8724,9 @@ define <4 x i1> @fcmp_one_fv_v4f64(<4 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t @@ -8803,9 +8803,9 @@ define 
<4 x i1> @fcmp_ueq_vf_v4f64(<4 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -8823,9 +8823,9 @@ define <4 x i1> @fcmp_ueq_fv_v4f64(<4 x double> %va, double %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll index 97641ff6d92d7..472f2073667db 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll @@ -509,8 +509,8 @@ define <1 x i1> @fcmps_uno_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -1041,8 +1041,8 @@ define <2 x i1> @fcmps_uno_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -1573,8 +1573,8 @@ define <4 x i1> @fcmps_uno_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -2105,8 +2105,8 @@ define <8 x i1> @fcmps_uno_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -2637,10 +2637,10 @@ define <16 x i1> @fcmps_uno_vf_v16f16(<16 x half> %va, half %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfle.vf v12, v10, fa0 -; CHECK-NEXT: vmfle.vv v10, v8, v8 -; CHECK-NEXT: vmnot.m v8, v10 -; CHECK-NEXT: vmorn.mm v0, v8, v12 +; CHECK-NEXT: vmfle.vv v12, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %b, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 
x i32> zeroinitializer @@ -3210,10 +3210,10 @@ define <32 x i1> @fcmps_uno_vf_v32f16(<32 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfle.vf v16, v12, fa0 -; CHECK-NEXT: vmfle.vv v12, v8, v8 -; CHECK-NEXT: vmnot.m v8, v12 -; CHECK-NEXT: vmorn.mm v0, v8, v16 +; CHECK-NEXT: vmfle.vv v16, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmnot.m v9, v16 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -3743,8 +3743,8 @@ define <1 x i1> @fcmps_uno_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -4275,8 +4275,8 @@ define <2 x i1> @fcmps_uno_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -4807,8 +4807,8 @@ define <4 x i1> @fcmps_uno_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -5339,10 +5339,10 @@ define <8 x i1> @fcmps_uno_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfle.vf v12, v10, fa0 -; CHECK-NEXT: vmfle.vv v10, v8, v8 -; CHECK-NEXT: vmnot.m v8, v10 -; CHECK-NEXT: vmorn.mm v0, v8, v12 +; CHECK-NEXT: vmfle.vv v12, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %b, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -5871,10 +5871,10 @@ define <16 x i1> @fcmps_uno_vf_v16f32(<16 x float> %va, float %b) nounwind stric ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfle.vf v16, v12, fa0 -; CHECK-NEXT: vmfle.vv v12, v8, v8 -; CHECK-NEXT: vmnot.m v8, v12 -; CHECK-NEXT: vmorn.mm v0, v8, v16 +; CHECK-NEXT: vmfle.vv v16, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmnot.m v9, v16 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %b, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer @@ -6403,8 +6403,8 @@ define <1 x i1> @fcmps_uno_vf_v1f64(<1 x double> %va, double %b) nounwind strict ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -6935,8 +6935,8 @@ define 
<2 x i1> @fcmps_uno_vf_v2f64(<2 x double> %va, double %b) nounwind strict ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -7467,10 +7467,10 @@ define <4 x i1> @fcmps_uno_vf_v4f64(<4 x double> %va, double %b) nounwind strict ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfle.vf v12, v10, fa0 -; CHECK-NEXT: vmfle.vv v10, v8, v8 -; CHECK-NEXT: vmnot.m v8, v10 -; CHECK-NEXT: vmorn.mm v0, v8, v12 +; CHECK-NEXT: vmfle.vv v12, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %b, i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -7999,10 +7999,10 @@ define <8 x i1> @fcmps_uno_vf_v8f64(<8 x double> %va, double %b) nounwind strict ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfle.vf v16, v12, fa0 -; CHECK-NEXT: vmfle.vv v12, v8, v8 -; CHECK-NEXT: vmnot.m v8, v12 -; CHECK-NEXT: vmorn.mm v0, v8, v16 +; CHECK-NEXT: vmfle.vv v16, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmnot.m v9, v16 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %b, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll index f03f1ec639eb6..c61f9cd9b5bd7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll @@ -66,12 +66,12 @@ define <2 x half> @vfma_vf_v2f16(<2 x half> %va, half %b, <2 x half> %vc, <2 x i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -92,12 +92,12 @@ define <2 x half> @vfma_vf_v2f16_unmasked(<2 x half> %va, half %b, <2 x half> %v ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -165,12 +165,12 @@ define <4 x half> @vfma_vf_v4f16(<4 x half> %va, half %b, <4 x half> %vc, <4 x i ; ZVFHMIN: # %bb.0: ; 
ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -191,12 +191,12 @@ define <4 x half> @vfma_vf_v4f16_unmasked(<4 x half> %va, half %b, <4 x half> %v ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -264,14 +264,14 @@ define <8 x half> @vfma_vf_v8f16(<8 x half> %va, half %b, <8 x half> %vc, <8 x i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v14, v12, v10, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -290,14 +290,14 @@ define <8 x half> @vfma_vf_v8f16_unmasked(<8 x half> %va, half %b, <8 x half> %v ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12 +; ZVFHMIN-NEXT: vfmadd.vv v14, v12, v10 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -363,14 +363,14 @@ define <16 x half> @vfma_vf_v16f16(<16 x half> %va, half %b, <16 x half> %vc, <1 ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v20, v16, v12, v0.t ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -389,14 +389,14 @@ define <16 x half> @vfma_vf_v16f16_unmasked(<16 x half> %va, half %b, <16 x half ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16 +; ZVFHMIN-NEXT: vfmadd.vv v20, v16, v12 ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -822,31 +822,31 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: addi a1, a2, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: li a3, 24 ; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a2, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v24, (a2) +; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a2) ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a4 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma @@ -855,12 +855,18 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: # %bb.1: ; 
CHECK-NEXT: li a0, 16 ; CHECK-NEXT: .LBB50_2: -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v24, v8, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, a4, -16 ; CHECK-NEXT: sltu a1, a4, a0 ; CHECK-NEXT: addi a1, a1, -1 @@ -871,20 +877,21 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vmv.v.v v16, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 5 @@ -907,24 +914,24 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; CHECK-NEXT: addi a1, a2, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a2) +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a2, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a2) +; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v24, (a2) ; CHECK-NEXT: vle64.v v0, (a0) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a4 @@ -936,9 +943,9 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: 
vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v8, v24 +; CHECK-NEXT: vfmadd.vv v0, v8, v16 ; CHECK-NEXT: addi a0, a4, -16 ; CHECK-NEXT: sltu a1, a4, a0 ; CHECK-NEXT: addi a1, a1, -1 @@ -947,9 +954,9 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v24, v16, v8 ; CHECK-NEXT: vmv8r.v v8, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll index 3be992b7e0c3b..cad7adbc19f3c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll @@ -393,38 +393,38 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v16, (a0) ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 ; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: bltu a2, a1, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmax.vv v8, v8, v24, v0.t +; CHECK-NEXT: vfmax.vv v8, v8, v16, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll index 49a774c29e432..d8ee7a7044b49 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll @@ -393,38 +393,38 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v16, (a0) ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 ; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: bltu a2, a1, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmin.vv v8, v8, v24, v0.t +; CHECK-NEXT: vfmin.vv v8, v8, v16, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll index e2189b55c8bf2..a5d9b3439e29b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll @@ -610,31 +610,31 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: addi a1, a2, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: li a3, 24 ; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a2, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v24, (a2) +; 
CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a2) ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a4 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma @@ -643,12 +643,18 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: .LBB50_2: -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v24, v8, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, a4, -16 ; CHECK-NEXT: sltu a1, a4, a0 ; CHECK-NEXT: addi a1, a1, -1 @@ -659,20 +665,21 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vmv.v.v v16, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 5 @@ -695,24 +702,24 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; CHECK-NEXT: addi a1, a2, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v 
v24, (a2) +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a2, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a2) +; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v24, (a2) ; CHECK-NEXT: vle64.v v0, (a0) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a4 @@ -724,9 +731,9 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v8, v24 +; CHECK-NEXT: vfmadd.vv v0, v8, v16 ; CHECK-NEXT: addi a0, a4, -16 ; CHECK-NEXT: sltu a1, a4, a0 ; CHECK-NEXT: addi a1, a1, -1 @@ -735,9 +742,9 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v24, v16, v8 ; CHECK-NEXT: vmv8r.v v8, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll index 1144f776e7fbf..aba9056c78cda 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll @@ -118,11 +118,11 @@ define void @vfwmacc_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; FOLDING-NEXT: vfwmul.vv v12, v8, v9 +; FOLDING-NEXT: vfwsub.vv v13, v9, v10 ; FOLDING-NEXT: vfwmacc.vv v11, v8, v10 -; FOLDING-NEXT: vfwsub.vv v8, v9, v10 ; FOLDING-NEXT: vse64.v v12, (a0) ; FOLDING-NEXT: vse64.v v11, (a1) -; FOLDING-NEXT: vse64.v v8, (a2) +; FOLDING-NEXT: vse64.v v13, (a2) ; FOLDING-NEXT: ret %c = fpext <2 x float> %a to <2 x double> %d = fpext <2 x float> %b to <2 x double> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll index 88ae643ca742e..24e75cde2ce91 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll @@ -296,15 +296,15 @@ define <32 x i8> @vpgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> % ; RV64-NEXT: vsetvli zero, a2, e8, m1, ta, ma ; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t ; RV64-NEXT: addi a2, a1, -16 -; RV64-NEXT: sltu a1, a1, a2 -; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 16 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 +; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; 
RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma ; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: li a0, 32 @@ -2052,11 +2052,11 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v24, (zero), v8, v0.t ; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a0, a0, a1 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v8, v8, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma @@ -2103,11 +2103,11 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs, ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2119,10 +2119,10 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs, ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; RV64-NEXT: vslidedown.vi v10, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v16, v10 -; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsext.vf8 v24, v8 ; RV64-NEXT: li a3, 16 +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB95_2 @@ -2161,11 +2161,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2178,10 +2178,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV64-NEXT: vsext.vf8 v24, v8 ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 16 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vsext.vf8 v16, v8 ; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB96_2 @@ -2221,11 +2221,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei16.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli 
zero, 16, e16, m4, ta, ma ; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2247,11 +2247,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV64-NEXT: vluxei16.v v8, (a0), v16, v0.t ; RV64-NEXT: addi a2, a1, -16 +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a1, a1, a2 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma ; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2279,11 +2279,11 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2295,10 +2295,10 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf4 v16, v12 -; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsext.vf4 v24, v8 ; RV64-NEXT: li a3, 16 +; RV64-NEXT: vsext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB98_2 @@ -2337,11 +2337,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16> ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2354,10 +2354,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16> ; RV64-NEXT: vsext.vf4 v24, v8 ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 16 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vsext.vf4 v16, v8 ; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB99_2 @@ -2397,11 +2397,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16> ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli 
zero, a1, e64, m8, ta, ma @@ -2423,11 +2423,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16> ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV64-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV64-NEXT: addi a2, a1, -16 +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a1, a1, a2 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2454,11 +2454,11 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2470,10 +2470,10 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v16, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf2 v24, v16 -; RV64-NEXT: vsll.vi v16, v24, 3 ; RV64-NEXT: vsext.vf2 v24, v8 ; RV64-NEXT: li a3, 16 +; RV64-NEXT: vsext.vf2 v8, v16 +; RV64-NEXT: vsll.vi v16, v8, 3 ; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB101_2 @@ -2511,11 +2511,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32> ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2528,10 +2528,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32> ; RV64-NEXT: vsext.vf2 v24, v8 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 16 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vsext.vf2 v16, v8 ; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB102_2 @@ -2570,11 +2570,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32> ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2587,10 +2587,10 @@ define <32 x double> 
@vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32> ; RV64-NEXT: vzext.vf2 v24, v8 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 16 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vzext.vf2 v16, v8 ; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB103_2 @@ -2622,17 +2622,17 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x ; RV32-NEXT: vnsrl.wi v24, v16, 0 ; RV32-NEXT: vnsrl.wi v16, v8, 0 ; RV32-NEXT: li a2, 32 +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vslideup.vi v16, v24, 16 ; RV32-NEXT: vsll.vi v24, v16, 3 +; RV32-NEXT: sltu a2, a1, a3 +; RV32-NEXT: addi a2, a2, -1 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v8, v24, 16 -; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a3, a1, a2 -; RV32-NEXT: addi a3, a3, -1 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t ; RV32-NEXT: li a2, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll index 351d7d4cd9b09..71f497e4c7be4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll @@ -374,11 +374,11 @@ define <32 x double> @vpload_v32f64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0), v0.t ; CHECK-NEXT: addi a2, a1, -16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-NEXT: sltu a1, a1, a2 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a1, a1, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a0), v0.t @@ -394,19 +394,19 @@ declare <33 x double> @llvm.vp.load.v33f64.p0(ptr, <33 x i1>, i32) define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpload_v33f64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a4, 32 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: li a4, 32 ; CHECK-NEXT: mv a3, a2 ; CHECK-NEXT: bltu a2, a4, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: addi a4, a3, -16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v8, 2 ; CHECK-NEXT: sltu a3, a3, a4 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a3, a3, a4 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v8, 2 ; CHECK-NEXT: addi a4, a1, 128 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a4), v0.t @@ -431,13 +431,13 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a1), v0.t +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: addi a2, a0, 256 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vse64.v v16, (a1) -; CHECK-NEXT: addi a0, a0, 256 ; CHECK-NEXT: vsetivli zero, 
1, e64, m1, ta, ma -; CHECK-NEXT: vse64.v v24, (a0) +; CHECK-NEXT: vse64.v v24, (a2) ; CHECK-NEXT: ret %load = call <33 x double> @llvm.vp.load.v33f64.p0(ptr %ptr, <33 x i1> %m, i32 %evl) ret <33 x double> %load diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll index bc42b42208bc3..a11c2b6bca12e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll @@ -1176,46 +1176,37 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a1, .LBB79_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: .LBB79_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma -; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 -; CHECK-NEXT: vmv8r.v v8, v24 +; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0 +; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll index 9e37780f56e1c..d691dcd5c54b6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll @@ -1756,11 +1756,11 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t ; RV32-NEXT: addi a0, a1, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a1, a1, a0 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: vsetivli 
zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v8, v24, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma @@ -1775,11 +1775,11 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v24, (a1) -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: vle64.v v24, (a0) ; RV64-NEXT: li a1, 16 ; RV64-NEXT: mv a0, a2 @@ -1798,7 +1798,7 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v16, (zero), v8, v0.t +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add sp, sp, a0 @@ -1826,11 +1826,11 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: addi a1, a2, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a2, a2, a1 ; RV32-NEXT: addi a2, a2, -1 ; RV32-NEXT: and a1, a2, a1 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v8, v24, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -1842,44 +1842,51 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32 ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: slli a4, a3, 3 +; RV64-NEXT: add a3, a4, a3 ; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vle32.v v24, (a1) -; RV64-NEXT: vmv1r.v v7, v0 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v16, v24, 16 -; RV64-NEXT: vmv4r.v v0, v24 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf2 v0, v24 ; RV64-NEXT: vsext.vf2 v24, v16 ; RV64-NEXT: vsll.vi v16, v24, 3 -; RV64-NEXT: vsext.vf2 v24, v0 -; RV64-NEXT: li a3, 16 -; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsll.vi v24, v0, 3 ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: bltu a2, a3, .LBB84_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB84_2: -; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: addi a3, 
sp, 16 +; RV64-NEXT: vl1r.v v0, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t ; RV64-NEXT: addi a1, a2, -16 ; RV64-NEXT: sltu a2, a2, a1 ; RV64-NEXT: addi a2, a2, -1 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v0, v7, 2 +; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: and a1, a2, a1 -; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: slli a1, a0, 3 +; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 @@ -1906,11 +1913,11 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: addi a1, a2, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a2, a2, a1 ; RV32-NEXT: addi a2, a2, -1 ; RV32-NEXT: and a1, a2, a1 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v8, v24, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -1922,37 +1929,37 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a4, a3, 3 -; RV64-NEXT: add a3, a4, a3 +; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vle32.v v24, (a1) -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf2 v0, v24 +; RV64-NEXT: vsext.vf2 v16, v24 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v24, 16 +; RV64-NEXT: vslidedown.vi v8, v24, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf2 v16, v24 -; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: li a3, 16 -; RV64-NEXT: vsll.vi v24, v0, 3 +; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: bltu a2, a3, .LBB85_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB85_2: ; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vl1r.v v0, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: 
vsoxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: vsoxei64.v v16, (a0), v24, v0.t ; RV64-NEXT: addi a1, a2, -16 ; RV64-NEXT: sltu a2, a2, a1 ; RV64-NEXT: addi a2, a2, -1 @@ -1960,14 +1967,14 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: and a1, a2, a1 ; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a1, a0, 3 -; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 @@ -1995,11 +2002,11 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: addi a1, a2, -16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: sltu a2, a2, a1 ; RV32-NEXT: addi a2, a2, -1 ; RV32-NEXT: and a1, a2, a1 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v8, v24, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma @@ -2011,37 +2018,37 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a4, a3, 3 -; RV64-NEXT: add a3, a4, a3 +; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vle32.v v24, (a1) -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vzext.vf2 v0, v24 +; RV64-NEXT: vzext.vf2 v16, v24 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v24, 16 +; RV64-NEXT: vslidedown.vi v8, v24, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vzext.vf2 v16, v24 -; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: li a3, 16 -; RV64-NEXT: vsll.vi v24, v0, 3 +; RV64-NEXT: vzext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: bltu a2, a3, .LBB86_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB86_2: ; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vl1r.v v0, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, 
a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: vsoxei64.v v16, (a0), v24, v0.t ; RV64-NEXT: addi a1, a2, -16 ; RV64-NEXT: sltu a2, a2, a1 ; RV64-NEXT: addi a2, a2, -1 @@ -2049,14 +2056,14 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: and a1, a2, a1 ; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a1, a0, 3 -; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll index d34292abdce0d..8eaa5efe163cd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll @@ -292,11 +292,11 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v8, (a0), v0.t ; CHECK-NEXT: addi a2, a1, -16 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-NEXT: sltu a1, a1, a2 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a1, a1, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v16, (a0), v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll index 418b159c8fb98..5975b0d0761eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll @@ -11,8 +11,8 @@ define <1 x i8> @vrol_vv_v1i8(<1 x i8> %a, <1 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -33,8 +33,8 @@ define <1 x i8> @vrol_vx_v1i8(<1 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -58,8 +58,8 @@ define <2 x i8> @vrol_vv_v2i8(<2 x i8> %a, <2 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -80,8 +80,8 @@ define <2 x i8> @vrol_vx_v2i8(<2 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; 
CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -105,8 +105,8 @@ define <4 x i8> @vrol_vv_v4i8(<4 x i8> %a, <4 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -127,8 +127,8 @@ define <4 x i8> @vrol_vx_v4i8(<4 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -152,8 +152,8 @@ define <8 x i8> @vrol_vv_v8i8(<8 x i8> %a, <8 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -174,8 +174,8 @@ define <8 x i8> @vrol_vx_v8i8(<8 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -199,8 +199,8 @@ define <16 x i8> @vrol_vv_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -221,8 +221,8 @@ define <16 x i8> @vrol_vx_v16i8(<16 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -247,8 +247,8 @@ define <32 x i8> @vrol_vv_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vand.vi v12, v10, 7 -; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -271,8 +271,8 @@ define <32 x i8> @vrol_vx_v32i8(<32 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vand.vi v12, v10, 7 -; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -298,8 +298,8 @@ define <64 x i8> @vrol_vv_v64i8(<64 x i8> %a, <64 x i8> %b) { ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; CHECK-NEXT: vand.vi v16, v12, 7 -; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ 
-322,8 +322,8 @@ define <64 x i8> @vrol_vx_v64i8(<64 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: vand.vi v16, v12, 7 -; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -348,8 +348,8 @@ define <1 x i16> @vrol_vv_v1i16(<1 x i16> %a, <1 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -370,8 +370,8 @@ define <1 x i16> @vrol_vx_v1i16(<1 x i16> %a, i16 %b) { ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -395,8 +395,8 @@ define <2 x i16> @vrol_vv_v2i16(<2 x i16> %a, <2 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -417,8 +417,8 @@ define <2 x i16> @vrol_vx_v2i16(<2 x i16> %a, i16 %b) { ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -442,8 +442,8 @@ define <4 x i16> @vrol_vv_v4i16(<4 x i16> %a, <4 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -464,8 +464,8 @@ define <4 x i16> @vrol_vx_v4i16(<4 x i16> %a, i16 %b) { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -489,8 +489,8 @@ define <8 x i16> @vrol_vv_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -511,8 +511,8 @@ define <8 x i16> @vrol_vx_v8i16(<8 x i16> %a, i16 %b) { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -536,8 +536,8 @@ define <16 x i16> 
@vrol_vv_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vand.vi v12, v10, 15 -; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -558,8 +558,8 @@ define <16 x i16> @vrol_vx_v16i16(<16 x i16> %a, i16 %b) { ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vand.vi v12, v10, 15 -; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -584,8 +584,8 @@ define <32 x i16> @vrol_vv_v32i16(<32 x i16> %a, <32 x i16> %b) { ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vand.vi v16, v12, 15 -; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -608,8 +608,8 @@ define <32 x i16> @vrol_vx_v32i16(<32 x i16> %a, i16 %b) { ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: vand.vi v16, v12, 15 -; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -634,12 +634,12 @@ define <1 x i32> @vrol_vv_v1i32(<1 x i32> %a, <1 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsll.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vrsub.vi v10, v9, 0 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsll.vv v9, v8, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vv_v1i32: @@ -658,8 +658,8 @@ define <1 x i32> @vrol_vx_v1i32(<1 x i32> %a, i32 %b) { ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -683,12 +683,12 @@ define <2 x i32> @vrol_vv_v2i32(<2 x i32> %a, <2 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsll.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vrsub.vi v10, v9, 0 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsll.vv v9, v8, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vv_v2i32: @@ -707,8 +707,8 @@ define <2 x i32> @vrol_vx_v2i32(<2 x i32> %a, i32 %b) { ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -732,12 +732,12 @@ define <4 x i32> 
@vrol_vv_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsll.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vrsub.vi v10, v9, 0 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsll.vv v9, v8, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vv_v4i32: @@ -756,8 +756,8 @@ define <4 x i32> @vrol_vx_v4i32(<4 x i32> %a, i32 %b) { ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -781,12 +781,12 @@ define <8 x i32> @vrol_vv_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vand.vx v12, v10, a0 -; CHECK-NEXT: vsll.vv v12, v8, v12 -; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vrsub.vi v12, v10, 0 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsll.vv v10, v8, v10 +; CHECK-NEXT: vsrl.vv v8, v8, v12 +; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vv_v8i32: @@ -805,8 +805,8 @@ define <8 x i32> @vrol_vx_v8i32(<8 x i32> %a, i32 %b) { ; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vand.vx v12, v10, a0 -; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vsrl.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -830,12 +830,12 @@ define <16 x i32> @vrol_vv_v16i32(<16 x i32> %a, <16 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vand.vx v16, v12, a0 -; CHECK-NEXT: vsll.vv v16, v8, v16 -; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vrsub.vi v16, v12, 0 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v12 -; CHECK-NEXT: vor.vv v8, v16, v8 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vsll.vv v12, v8, v12 +; CHECK-NEXT: vsrl.vv v8, v8, v16 +; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vv_v16i32: @@ -854,8 +854,8 @@ define <16 x i32> @vrol_vx_v16i32(<16 x i32> %a, i32 %b) { ; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vand.vx v16, v12, a0 -; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vsrl.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -879,12 +879,12 @@ define <1 x i64> @vrol_vv_v1i64(<1 x i64> %a, <1 x i64> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsll.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vrsub.vi v10, v9, 0 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsll.vv v9, v8, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vv_v1i64: @@ -903,8 +903,8 
@@ define <1 x i64> @vrol_vx_v1i64(<1 x i64> %a, i64 %b) { ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -928,12 +928,12 @@ define <2 x i64> @vrol_vv_v2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsll.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vrsub.vi v10, v9, 0 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsll.vv v9, v8, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vv_v2i64: @@ -955,9 +955,9 @@ define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) { ; RV32-NEXT: vwsub.vx v11, v10, a0 ; RV32-NEXT: li a0, 63 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vand.vx v9, v9, a0 ; RV32-NEXT: vand.vx v10, v11, a0 ; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vand.vx v9, v9, a0 ; RV32-NEXT: vsll.vv v8, v8, v9 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: ret @@ -968,8 +968,8 @@ define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) { ; RV64-NEXT: vmv.v.x v9, a0 ; RV64-NEXT: li a0, 63 ; RV64-NEXT: vand.vx v10, v9, a0 -; RV64-NEXT: vsll.vv v10, v8, v10 ; RV64-NEXT: vrsub.vi v9, v9, 0 +; RV64-NEXT: vsll.vv v10, v8, v10 ; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vsrl.vv v8, v8, v9 ; RV64-NEXT: vor.vv v8, v10, v8 @@ -993,12 +993,12 @@ define <4 x i64> @vrol_vv_v4i64(<4 x i64> %a, <4 x i64> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vand.vx v12, v10, a0 -; CHECK-NEXT: vsll.vv v12, v8, v12 -; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vrsub.vi v12, v10, 0 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsll.vv v10, v8, v10 +; CHECK-NEXT: vsrl.vv v8, v8, v12 +; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vv_v4i64: @@ -1020,9 +1020,9 @@ define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) { ; RV32-NEXT: vwsub.vx v14, v12, a0 ; RV32-NEXT: li a0, 63 ; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vand.vx v10, v10, a0 ; RV32-NEXT: vand.vx v12, v14, a0 ; RV32-NEXT: vsrl.vv v12, v8, v12 -; RV32-NEXT: vand.vx v10, v10, a0 ; RV32-NEXT: vsll.vv v8, v8, v10 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: ret @@ -1033,8 +1033,8 @@ define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) { ; RV64-NEXT: vmv.v.x v10, a0 ; RV64-NEXT: li a0, 63 ; RV64-NEXT: vand.vx v12, v10, a0 -; RV64-NEXT: vsll.vv v12, v8, v12 ; RV64-NEXT: vrsub.vi v10, v10, 0 +; RV64-NEXT: vsll.vv v12, v8, v12 ; RV64-NEXT: vand.vx v10, v10, a0 ; RV64-NEXT: vsrl.vv v8, v8, v10 ; RV64-NEXT: vor.vv v8, v12, v8 @@ -1058,12 +1058,12 @@ define <8 x i64> @vrol_vv_v8i64(<8 x i64> %a, <8 x i64> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vand.vx v16, v12, a0 -; CHECK-NEXT: vsll.vv v16, v8, v16 -; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vrsub.vi v16, v12, 0 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsrl.vv v8, v8, v12 -; CHECK-NEXT: vor.vv v8, v16, v8 +; CHECK-NEXT: vand.vx 
v16, v16, a0 +; CHECK-NEXT: vsll.vv v12, v8, v12 +; CHECK-NEXT: vsrl.vv v8, v8, v16 +; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vrol_vv_v8i64: @@ -1085,9 +1085,9 @@ define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) { ; RV32-NEXT: vwsub.vx v20, v16, a0 ; RV32-NEXT: li a0, 63 ; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vand.vx v12, v12, a0 ; RV32-NEXT: vand.vx v16, v20, a0 ; RV32-NEXT: vsrl.vv v16, v8, v16 -; RV32-NEXT: vand.vx v12, v12, a0 ; RV32-NEXT: vsll.vv v8, v8, v12 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: ret @@ -1098,8 +1098,8 @@ define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) { ; RV64-NEXT: vmv.v.x v12, a0 ; RV64-NEXT: li a0, 63 ; RV64-NEXT: vand.vx v16, v12, a0 -; RV64-NEXT: vsll.vv v16, v8, v16 ; RV64-NEXT: vrsub.vi v12, v12, 0 +; RV64-NEXT: vsll.vv v16, v8, v16 ; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vsrl.vv v8, v8, v12 ; RV64-NEXT: vor.vv v8, v16, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll index e4ddfeb4c4195..68a9e217ccd1c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll @@ -12,8 +12,8 @@ define <1 x i8> @vror_vv_v1i8(<1 x i8> %a, <1 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -34,8 +34,8 @@ define <1 x i8> @vror_vx_v1i8(<1 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -96,8 +96,8 @@ define <2 x i8> @vror_vv_v2i8(<2 x i8> %a, <2 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -118,8 +118,8 @@ define <2 x i8> @vror_vx_v2i8(<2 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -180,8 +180,8 @@ define <4 x i8> @vror_vv_v4i8(<4 x i8> %a, <4 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -202,8 +202,8 @@ define <4 x i8> @vror_vx_v4i8(<4 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -264,8 +264,8 @@ define <8 x i8> 
@vror_vv_v8i8(<8 x i8> %a, <8 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -286,8 +286,8 @@ define <8 x i8> @vror_vx_v8i8(<8 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -348,8 +348,8 @@ define <16 x i8> @vror_vv_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -370,8 +370,8 @@ define <16 x i8> @vror_vx_v16i8(<16 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -433,8 +433,8 @@ define <32 x i8> @vror_vv_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vand.vi v12, v10, 7 -; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 7 ; CHECK-NEXT: vsll.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -457,8 +457,8 @@ define <32 x i8> @vror_vx_v32i8(<32 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vand.vi v12, v10, 7 -; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 7 ; CHECK-NEXT: vsll.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -525,8 +525,8 @@ define <64 x i8> @vror_vv_v64i8(<64 x i8> %a, <64 x i8> %b) { ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; CHECK-NEXT: vand.vi v16, v12, 7 -; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 7 ; CHECK-NEXT: vsll.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -549,8 +549,8 @@ define <64 x i8> @vror_vx_v64i8(<64 x i8> %a, i8 %b) { ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: vand.vi v16, v12, 7 -; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 7 ; CHECK-NEXT: vsll.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -616,8 +616,8 @@ define <1 x i16> @vror_vv_v1i16(<1 x i16> %a, <1 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -638,8 +638,8 @@ define <1 x i16> @vror_vx_v1i16(<1 x i16> %a, i16 %b) { ; 
CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -700,8 +700,8 @@ define <2 x i16> @vror_vv_v2i16(<2 x i16> %a, <2 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -722,8 +722,8 @@ define <2 x i16> @vror_vx_v2i16(<2 x i16> %a, i16 %b) { ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -784,8 +784,8 @@ define <4 x i16> @vror_vv_v4i16(<4 x i16> %a, <4 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -806,8 +806,8 @@ define <4 x i16> @vror_vx_v4i16(<4 x i16> %a, i16 %b) { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -868,8 +868,8 @@ define <8 x i16> @vror_vv_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -890,8 +890,8 @@ define <8 x i16> @vror_vx_v8i16(<8 x i16> %a, i16 %b) { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -952,8 +952,8 @@ define <16 x i16> @vror_vv_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vand.vi v12, v10, 15 -; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 15 ; CHECK-NEXT: vsll.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -974,8 +974,8 @@ define <16 x i16> @vror_vx_v16i16(<16 x i16> %a, i16 %b) { ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vand.vi v12, v10, 15 -; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 15 ; CHECK-NEXT: vsll.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -1037,8 +1037,8 @@ define <32 x i16> @vror_vv_v32i16(<32 x i16> %a, <32 x i16> %b) { ; CHECK-NEXT: li 
a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vand.vi v16, v12, 15 -; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 15 ; CHECK-NEXT: vsll.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -1061,8 +1061,8 @@ define <32 x i16> @vror_vx_v32i16(<32 x i16> %a, i16 %b) { ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: vand.vi v16, v12, 15 -; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 15 ; CHECK-NEXT: vsll.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -1128,12 +1128,12 @@ define <1 x i32> @vror_vv_v1i32(<1 x i32> %a, <1 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsrl.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vrsub.vi v10, v9, 0 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsll.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsrl.vv v9, v8, v9 +; CHECK-NEXT: vsll.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vv_v1i32: @@ -1152,8 +1152,8 @@ define <1 x i32> @vror_vx_v1i32(<1 x i32> %a, i32 %b) { ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -1214,12 +1214,12 @@ define <2 x i32> @vror_vv_v2i32(<2 x i32> %a, <2 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsrl.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vrsub.vi v10, v9, 0 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsll.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsrl.vv v9, v8, v9 +; CHECK-NEXT: vsll.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vv_v2i32: @@ -1238,8 +1238,8 @@ define <2 x i32> @vror_vx_v2i32(<2 x i32> %a, i32 %b) { ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -1300,12 +1300,12 @@ define <4 x i32> @vror_vv_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsrl.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vrsub.vi v10, v9, 0 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsll.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsrl.vv v9, v8, v9 +; CHECK-NEXT: vsll.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vv_v4i32: @@ -1324,8 +1324,8 @@ define <4 x i32> @vror_vx_v4i32(<4 x i32> %a, i32 %b) { ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; 
CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -1386,12 +1386,12 @@ define <8 x i32> @vror_vv_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vand.vx v12, v10, a0 -; CHECK-NEXT: vsrl.vv v12, v8, v12 -; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vrsub.vi v12, v10, 0 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsll.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 +; CHECK-NEXT: vsll.vv v8, v8, v12 +; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vv_v8i32: @@ -1410,8 +1410,8 @@ define <8 x i32> @vror_vx_v8i32(<8 x i32> %a, i32 %b) { ; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vand.vx v12, v10, a0 -; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: vsll.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -1472,12 +1472,12 @@ define <16 x i32> @vror_vv_v16i32(<16 x i32> %a, <16 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vand.vx v16, v12, a0 -; CHECK-NEXT: vsrl.vv v16, v8, v16 -; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vrsub.vi v16, v12, 0 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsll.vv v8, v8, v12 -; CHECK-NEXT: vor.vv v8, v16, v8 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vsrl.vv v12, v8, v12 +; CHECK-NEXT: vsll.vv v8, v8, v16 +; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vv_v16i32: @@ -1496,8 +1496,8 @@ define <16 x i32> @vror_vx_v16i32(<16 x i32> %a, i32 %b) { ; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: li a0, 31 ; CHECK-NEXT: vand.vx v16, v12, a0 -; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: vsll.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -1558,12 +1558,12 @@ define <1 x i64> @vror_vv_v1i64(<1 x i64> %a, <1 x i64> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsrl.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vrsub.vi v10, v9, 0 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsll.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsrl.vv v9, v8, v9 +; CHECK-NEXT: vsll.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vv_v1i64: @@ -1582,8 +1582,8 @@ define <1 x i64> @vror_vx_v1i64(<1 x i64> %a, i64 %b) { ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -1605,12 +1605,12 @@ define <1 x i64> @vror_vi_v1i64(<1 x i64> %a) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vmv.v.i v9, 1 -; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 -; CHECK-RV32-NEXT: vsll.vv v9, v8, v9 +; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 ; CHECK-RV32-NEXT: vmv.s.x v10, a0 +; CHECK-RV32-NEXT: vand.vx v9, v9, a0 ; CHECK-RV32-NEXT: vand.vi v10, v10, 
1 +; CHECK-RV32-NEXT: vsll.vv v9, v8, v9 ; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 ; CHECK-RV32-NEXT: vor.vv v8, v8, v9 ; CHECK-RV32-NEXT: ret @@ -1638,12 +1638,12 @@ define <1 x i64> @vror_vi_rotl_v1i64(<1 x i64> %a) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vmv.v.i v9, 1 -; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 -; CHECK-RV32-NEXT: vsrl.vv v9, v8, v9 +; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 ; CHECK-RV32-NEXT: vmv.s.x v10, a0 +; CHECK-RV32-NEXT: vand.vx v9, v9, a0 ; CHECK-RV32-NEXT: vand.vi v10, v10, 1 +; CHECK-RV32-NEXT: vsrl.vv v9, v8, v9 ; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 ; CHECK-RV32-NEXT: vor.vv v8, v8, v9 ; CHECK-RV32-NEXT: ret @@ -1674,12 +1674,12 @@ define <2 x i64> @vror_vv_v2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vand.vx v10, v9, a0 -; CHECK-NEXT: vsrl.vv v10, v8, v10 -; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vrsub.vi v10, v9, 0 ; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsll.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsrl.vv v9, v8, v9 +; CHECK-NEXT: vsll.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vv_v2i64: @@ -1701,9 +1701,9 @@ define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) { ; CHECK-RV32-NEXT: vwsub.vx v11, v10, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vand.vx v9, v9, a0 ; CHECK-RV32-NEXT: vand.vx v10, v11, a0 ; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 ; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 ; CHECK-RV32-NEXT: vor.vv v8, v8, v10 ; CHECK-RV32-NEXT: ret @@ -1714,8 +1714,8 @@ define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) { ; CHECK-RV64-NEXT: vmv.v.x v9, a0 ; CHECK-RV64-NEXT: li a0, 63 ; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsrl.vv v10, v8, v10 ; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 +; CHECK-RV64-NEXT: vsrl.vv v10, v8, v10 ; CHECK-RV64-NEXT: vand.vx v9, v9, a0 ; CHECK-RV64-NEXT: vsll.vv v8, v8, v9 ; CHECK-RV64-NEXT: vor.vv v8, v10, v8 @@ -1741,12 +1741,12 @@ define <2 x i64> @vror_vi_v2i64(<2 x i64> %a) { ; CHECK-RV32-NEXT: vwsubu.vx v10, v9, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-RV32-NEXT: vand.vx v9, v10, a0 -; CHECK-RV32-NEXT: vsll.vv v9, v8, v9 -; CHECK-RV32-NEXT: vmv.v.x v10, a0 -; CHECK-RV32-NEXT: vand.vi v10, v10, 1 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v8, v9 +; CHECK-RV32-NEXT: vmv.v.x v9, a0 +; CHECK-RV32-NEXT: vand.vx v10, v10, a0 +; CHECK-RV32-NEXT: vand.vi v9, v9, 1 +; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 +; CHECK-RV32-NEXT: vor.vv v8, v8, v10 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_v2i64: @@ -1776,12 +1776,12 @@ define <2 x i64> @vror_vi_rotl_v2i64(<2 x i64> %a) { ; CHECK-RV32-NEXT: vwsubu.vx v10, v9, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-RV32-NEXT: vand.vx v9, v10, a0 -; CHECK-RV32-NEXT: vsrl.vv v9, v8, v9 -; CHECK-RV32-NEXT: vmv.v.x v10, a0 -; CHECK-RV32-NEXT: vand.vi v10, v10, 1 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v8, v9 +; CHECK-RV32-NEXT: vmv.v.x v9, a0 +; CHECK-RV32-NEXT: vand.vx v10, v10, a0 +; CHECK-RV32-NEXT: vand.vi v9, v9, 1 +; CHECK-RV32-NEXT: vsrl.vv v10, v8, 
v10 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v9 +; CHECK-RV32-NEXT: vor.vv v8, v8, v10 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_rotl_v2i64: @@ -1810,12 +1810,12 @@ define <4 x i64> @vror_vv_v4i64(<4 x i64> %a, <4 x i64> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vand.vx v12, v10, a0 -; CHECK-NEXT: vsrl.vv v12, v8, v12 -; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vrsub.vi v12, v10, 0 ; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsll.vv v8, v8, v10 -; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 +; CHECK-NEXT: vsll.vv v8, v8, v12 +; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vv_v4i64: @@ -1837,9 +1837,9 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) { ; CHECK-RV32-NEXT: vwsub.vx v14, v12, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-RV32-NEXT: vand.vx v10, v10, a0 ; CHECK-RV32-NEXT: vand.vx v12, v14, a0 ; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 ; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 ; CHECK-RV32-NEXT: vor.vv v8, v8, v12 ; CHECK-RV32-NEXT: ret @@ -1850,8 +1850,8 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) { ; CHECK-RV64-NEXT: vmv.v.x v10, a0 ; CHECK-RV64-NEXT: li a0, 63 ; CHECK-RV64-NEXT: vand.vx v12, v10, a0 -; CHECK-RV64-NEXT: vsrl.vv v12, v8, v12 ; CHECK-RV64-NEXT: vrsub.vi v10, v10, 0 +; CHECK-RV64-NEXT: vsrl.vv v12, v8, v12 ; CHECK-RV64-NEXT: vand.vx v10, v10, a0 ; CHECK-RV64-NEXT: vsll.vv v8, v8, v10 ; CHECK-RV64-NEXT: vor.vv v8, v12, v8 @@ -1877,12 +1877,12 @@ define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) { ; CHECK-RV32-NEXT: vwsubu.vx v12, v10, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-RV32-NEXT: vand.vx v10, v12, a0 -; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vmv.v.x v12, a0 -; CHECK-RV32-NEXT: vand.vi v12, v12, 1 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v8, v10 +; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vand.vx v12, v12, a0 +; CHECK-RV32-NEXT: vand.vi v10, v10, 1 +; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v8, v12 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_v4i64: @@ -1912,12 +1912,12 @@ define <4 x i64> @vror_vi_rotl_v4i64(<4 x i64> %a) { ; CHECK-RV32-NEXT: vwsubu.vx v12, v10, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-RV32-NEXT: vand.vx v10, v12, a0 -; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV32-NEXT: vmv.v.x v12, a0 -; CHECK-RV32-NEXT: vand.vi v12, v12, 1 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v8, v10 +; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vand.vx v12, v12, a0 +; CHECK-RV32-NEXT: vand.vi v10, v10, 1 +; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v8, v12 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_rotl_v4i64: @@ -1946,12 +1946,12 @@ define <8 x i64> @vror_vv_v8i64(<8 x i64> %a, <8 x i64> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vand.vx v16, v12, a0 -; CHECK-NEXT: vsrl.vv v16, v8, v16 -; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vrsub.vi v16, v12, 0 ; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsll.vv v8, v8, v12 -; CHECK-NEXT: vor.vv v8, v16, v8 +; 
CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vsrl.vv v12, v8, v12 +; CHECK-NEXT: vsll.vv v8, v8, v16 +; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: ret ; ; CHECK-ZVKB-LABEL: vror_vv_v8i64: @@ -1973,9 +1973,9 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) { ; CHECK-RV32-NEXT: vwsub.vx v20, v16, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-RV32-NEXT: vand.vx v12, v12, a0 ; CHECK-RV32-NEXT: vand.vx v16, v20, a0 ; CHECK-RV32-NEXT: vsll.vv v16, v8, v16 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 ; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 ; CHECK-RV32-NEXT: vor.vv v8, v8, v16 ; CHECK-RV32-NEXT: ret @@ -1986,8 +1986,8 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) { ; CHECK-RV64-NEXT: vmv.v.x v12, a0 ; CHECK-RV64-NEXT: li a0, 63 ; CHECK-RV64-NEXT: vand.vx v16, v12, a0 -; CHECK-RV64-NEXT: vsrl.vv v16, v8, v16 ; CHECK-RV64-NEXT: vrsub.vi v12, v12, 0 +; CHECK-RV64-NEXT: vsrl.vv v16, v8, v16 ; CHECK-RV64-NEXT: vand.vx v12, v12, a0 ; CHECK-RV64-NEXT: vsll.vv v8, v8, v12 ; CHECK-RV64-NEXT: vor.vv v8, v16, v8 @@ -2013,12 +2013,12 @@ define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) { ; CHECK-RV32-NEXT: vwsubu.vx v16, v12, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-RV32-NEXT: vand.vx v12, v16, a0 -; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 -; CHECK-RV32-NEXT: vmv.v.x v16, a0 -; CHECK-RV32-NEXT: vand.vi v16, v16, 1 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v16 -; CHECK-RV32-NEXT: vor.vv v8, v8, v12 +; CHECK-RV32-NEXT: vmv.v.x v12, a0 +; CHECK-RV32-NEXT: vand.vx v16, v16, a0 +; CHECK-RV32-NEXT: vand.vi v12, v12, 1 +; CHECK-RV32-NEXT: vsll.vv v16, v8, v16 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 +; CHECK-RV32-NEXT: vor.vv v8, v8, v16 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_v8i64: @@ -2048,12 +2048,12 @@ define <8 x i64> @vror_vi_rotl_v8i64(<8 x i64> %a) { ; CHECK-RV32-NEXT: vwsubu.vx v16, v12, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-RV32-NEXT: vand.vx v12, v16, a0 -; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 -; CHECK-RV32-NEXT: vmv.v.x v16, a0 -; CHECK-RV32-NEXT: vand.vi v16, v16, 1 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v16 -; CHECK-RV32-NEXT: vor.vv v8, v8, v12 +; CHECK-RV32-NEXT: vmv.v.x v12, a0 +; CHECK-RV32-NEXT: vand.vx v16, v16, a0 +; CHECK-RV32-NEXT: vand.vi v12, v12, 1 +; CHECK-RV32-NEXT: vsrl.vv v16, v8, v16 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 +; CHECK-RV32-NEXT: vor.vv v8, v8, v16 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_rotl_v8i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll index 46e794b8a787b..888fc79f0122d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll @@ -11,13 +11,13 @@ define <8 x i7> @vsadd_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 ; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsra.vi v9, v9, 1 ; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: li a0, 63 -; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: vmin.vx v8, v8, a1, v0.t ; CHECK-NEXT: li a0, 192 ; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll index 18251eeef0f23..1d8af4c46cc07 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll @@ -158,48 +158,38 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v9, v8 -; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vmv1r.v v6, v8 +; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: addi a4, a1, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v24, (a0) -; CHECK-NEXT: addi a0, a1, 128 -; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: addi a0, a3, -128 +; CHECK-NEXT: vle8.v v8, (a4) ; CHECK-NEXT: sltu a4, a3, a0 -; CHECK-NEXT: vle8.v v0, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: addi a4, a4, -1 ; CHECK-NEXT: and a0, a4, a0 -; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v24, v16, v24, v0 +; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 ; CHECK-NEXT: bltu a3, a2, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -219,43 +209,43 @@ define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c ; CHECK-NEXT: mul a2, a2, a3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v24, (a0) +; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, a1, 128 ; CHECK-NEXT: 
vle8.v v24, (a0) +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle8.v v24, (a1) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 @@ -424,11 +414,11 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: vle64.v v24, (a0) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a2 @@ -447,7 +437,7 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -465,36 +455,48 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c) ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 
+; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -611,12 +613,12 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> % ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle32.v v16, (a1) ; CHECK-NEXT: vle32.v v24, (a0) ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a3, .LBB35_2 @@ -634,7 +636,7 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> % ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll index a3bba2dd8265c..557882ee31d4c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll @@ -9,21 +9,21 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: slli a1, a2, 30 -; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: andi a4, a2, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a4 
-; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: slli a1, a2, 29 +; RV32-NEXT: slli a4, a2, 29 ; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a1 ; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: srli a4, a4, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a4 +; RV32-NEXT: slli a4, a2, 27 +; RV32-NEXT: srli a2, a2, 5 ; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: srli a4, a4, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: slli a1, a2, 27 -; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: srli a2, a2, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a4 ; RV32-NEXT: vslide1down.vx v10, v10, a2 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 @@ -39,21 +39,21 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a1) ; RV64-NEXT: slli a1, a2, 62 -; RV64-NEXT: srli a1, a1, 63 ; RV64-NEXT: andi a4, a2, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a4 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: slli a1, a2, 61 +; RV64-NEXT: slli a4, a2, 61 ; RV64-NEXT: srli a1, a1, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a1 ; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: srli a4, a4, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a4 +; RV64-NEXT: slli a4, a2, 59 +; RV64-NEXT: srli a2, a2, 5 ; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: srli a4, a4, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: slli a1, a2, 59 -; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: srli a2, a2, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a4 ; RV64-NEXT: vslide1down.vx v10, v10, a2 ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 @@ -77,21 +77,21 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: slli a1, a2, 30 -; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: andi a4, a2, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a4 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: slli a1, a2, 29 +; RV32-NEXT: slli a4, a2, 29 ; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a1 ; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: srli a4, a4, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a4 +; RV32-NEXT: slli a4, a2, 27 +; RV32-NEXT: srli a2, a2, 5 ; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: srli a4, a4, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: slli a1, a2, 27 -; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: srli a2, a2, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a4 ; RV32-NEXT: vslide1down.vx v10, v10, a2 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 @@ -107,21 +107,21 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a1) ; RV64-NEXT: slli a1, a2, 62 -; RV64-NEXT: srli a1, a1, 63 ; RV64-NEXT: andi a4, a2, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a4 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: slli a1, a2, 61 +; RV64-NEXT: slli a4, a2, 61 ; RV64-NEXT: srli a1, a1, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a1 ; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: srli a4, a4, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a4 +; RV64-NEXT: slli a4, a2, 59 +; RV64-NEXT: srli a2, a2, 5 ; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: srli a4, a4, 63 ; RV64-NEXT: vslide1down.vx v10, 
v10, a1 -; RV64-NEXT: slli a1, a2, 59 -; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: srli a2, a2, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a4 ; RV64-NEXT: vslide1down.vx v10, v10, a2 ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 @@ -146,21 +146,21 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: slli a0, a1, 30 -; RV32-NEXT: srli a0, a0, 31 ; RV32-NEXT: andi a3, a1, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a3 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: slli a0, a1, 29 +; RV32-NEXT: slli a3, a1, 29 ; RV32-NEXT: srli a0, a0, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: srli a3, a3, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a3 +; RV32-NEXT: slli a3, a1, 27 +; RV32-NEXT: srli a1, a1, 5 ; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: srli a3, a3, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: slli a0, a1, 27 -; RV32-NEXT: srli a0, a0, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a3 ; RV32-NEXT: vslide1down.vx v10, v10, a1 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 @@ -176,21 +176,21 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: slli a0, a1, 62 -; RV64-NEXT: srli a0, a0, 63 ; RV64-NEXT: andi a3, a1, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a3 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: slli a0, a1, 61 +; RV64-NEXT: slli a3, a1, 61 ; RV64-NEXT: srli a0, a0, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: srli a3, a3, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a3 +; RV64-NEXT: slli a3, a1, 59 +; RV64-NEXT: srli a1, a1, 5 ; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: srli a3, a3, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: slli a0, a1, 59 -; RV64-NEXT: srli a0, a0, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a3 ; RV64-NEXT: vslide1down.vx v10, v10, a1 ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 @@ -214,21 +214,21 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: slli a1, a2, 30 -; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: andi a4, a2, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a4 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: slli a1, a2, 29 +; RV32-NEXT: slli a4, a2, 29 ; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a1 ; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: srli a4, a4, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a4 +; RV32-NEXT: slli a4, a2, 27 +; RV32-NEXT: srli a2, a2, 5 ; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: srli a4, a4, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: slli a1, a2, 27 -; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: srli a2, a2, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a4 ; RV32-NEXT: vslide1down.vx v10, v10, a2 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 @@ -244,21 +244,21 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; 
RV64-NEXT: vle32.v v8, (a1) ; RV64-NEXT: slli a1, a2, 62 -; RV64-NEXT: srli a1, a1, 63 ; RV64-NEXT: andi a4, a2, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a4 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: slli a1, a2, 61 +; RV64-NEXT: slli a4, a2, 61 ; RV64-NEXT: srli a1, a1, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a1 ; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: srli a4, a4, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a4 +; RV64-NEXT: slli a4, a2, 59 +; RV64-NEXT: srli a2, a2, 5 ; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: srli a4, a4, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: slli a1, a2, 59 -; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: srli a2, a2, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a4 ; RV64-NEXT: vslide1down.vx v10, v10, a2 ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 @@ -282,21 +282,21 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: slli a0, a1, 30 -; RV32-NEXT: srli a0, a0, 31 ; RV32-NEXT: andi a3, a1, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a3 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: slli a0, a1, 29 +; RV32-NEXT: slli a3, a1, 29 ; RV32-NEXT: srli a0, a0, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: srli a3, a3, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a3 +; RV32-NEXT: slli a3, a1, 27 +; RV32-NEXT: srli a1, a1, 5 ; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: srli a3, a3, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: slli a0, a1, 27 -; RV32-NEXT: srli a0, a0, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a3 ; RV32-NEXT: vslide1down.vx v10, v10, a1 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 @@ -312,21 +312,21 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: slli a0, a1, 62 -; RV64-NEXT: srli a0, a0, 63 ; RV64-NEXT: andi a3, a1, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a3 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: slli a0, a1, 61 +; RV64-NEXT: slli a3, a1, 61 ; RV64-NEXT: srli a0, a0, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: srli a3, a3, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a3 +; RV64-NEXT: slli a3, a1, 59 +; RV64-NEXT: srli a1, a1, 5 ; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: srli a3, a3, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: slli a0, a1, 59 -; RV64-NEXT: srli a0, a0, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a3 ; RV64-NEXT: vslide1down.vx v10, v10, a1 ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 @@ -351,21 +351,21 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: slli a0, a1, 30 -; RV32-NEXT: srli a0, a0, 31 ; RV32-NEXT: andi a3, a1, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a3 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: slli a0, a1, 29 +; RV32-NEXT: slli a3, a1, 29 ; RV32-NEXT: srli a0, a0, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: srli a3, a3, 
31 +; RV32-NEXT: vslide1down.vx v10, v10, a3 +; RV32-NEXT: slli a3, a1, 27 +; RV32-NEXT: srli a1, a1, 5 ; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: srli a3, a3, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: slli a0, a1, 27 -; RV32-NEXT: srli a0, a0, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: vslide1down.vx v10, v10, a3 ; RV32-NEXT: vslide1down.vx v10, v10, a1 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 @@ -381,21 +381,21 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: slli a0, a1, 62 -; RV64-NEXT: srli a0, a0, 63 ; RV64-NEXT: andi a3, a1, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a3 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: slli a0, a1, 61 +; RV64-NEXT: slli a3, a1, 61 ; RV64-NEXT: srli a0, a0, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: srli a3, a3, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a3 +; RV64-NEXT: slli a3, a1, 59 +; RV64-NEXT: srli a1, a1, 5 ; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: srli a3, a3, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: slli a0, a1, 59 -; RV64-NEXT: srli a0, a0, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: vslide1down.vx v10, v10, a3 ; RV64-NEXT: vslide1down.vx v10, v10, a1 ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll index ddf98bab78141..8fad3db55f9bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll @@ -11,13 +11,13 @@ define <8 x i7> @vssub_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vadd.vv v9, v9, v9 -; CHECK-NEXT: vsra.vi v9, v9, 1 ; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsra.vi v9, v9, 1 ; CHECK-NEXT: vsra.vi v8, v8, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t -; CHECK-NEXT: li a0, 63 -; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: vmin.vx v8, v8, a1, v0.t ; CHECK-NEXT: li a0, 192 ; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t ; CHECK-NEXT: ret @@ -386,12 +386,12 @@ define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: addi a3, a1, -128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a0) -; CHECK-NEXT: addi a0, a1, -128 -; CHECK-NEXT: sltu a3, a1, a0 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a3, a3, a0 +; CHECK-NEXT: sltu a0, a1, a3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a3, a0, a3 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll index 0728bcf0fda58..ca35aa6c4a94c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll @@ -381,12 +381,12 @@ define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: addi a3, a1, 
-128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a0) -; CHECK-NEXT: addi a0, a1, -128 -; CHECK-NEXT: sltu a3, a1, a0 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a3, a3, a0 +; CHECK-NEXT: sltu a0, a1, a3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a3, a0, a3 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll index d9028b293e60b..e6dfe5e78cdb4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll @@ -19,10 +19,10 @@ define @vp_floor_nxv1bf16( %va, @vp_floor_nxv1bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -69,10 +69,10 @@ define @vp_floor_nxv2bf16( %va, @vp_floor_nxv2bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -120,10 +120,10 @@ define @vp_floor_nxv4bf16( %va, @vp_floor_nxv4bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -172,10 +172,10 @@ define @vp_floor_nxv8bf16( %va, @vp_floor_nxv8bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -224,10 +224,10 @@ define @vp_floor_nxv16bf16( %va, @vp_floor_nxv16bf16_unmasked( @vp_floor_nxv32bf16( %va, @vp_floor_nxv32bf16( %va, @vp_floor_nxv32bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv32bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 
-; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: fmv.w.x fa5, a3 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v16, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24, v0.t -; CHECK-NEXT: lui a2, 307200 -; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vslidedown.vx v12, v24, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 2 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: bltu a0, a1, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 @@ -410,12 +392,6 @@ define @vp_floor_nxv32bf16_unmasked( @llvm.vp.floor.nxv32bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -444,10 +420,10 @@ define @vp_floor_nxv1f16( %va, @vp_floor_nxv1f16_unmasked( %va, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -526,10 +502,10 @@ define @vp_floor_nxv2f16( %va, @vp_floor_nxv2f16_unmasked( %va, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -609,10 +585,10 @@ define @vp_floor_nxv4f16( %va, @vp_floor_nxv4f16_unmasked( %va, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 
; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -673,9 +649,9 @@ declare @llvm.vp.floor.nxv8f16(, @vp_floor_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv8f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -695,10 +671,10 @@ define @vp_floor_nxv8f16( %va, @vp_floor_nxv8f16_unmasked( %va, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -759,9 +735,9 @@ declare @llvm.vp.floor.nxv16f16(, @vp_floor_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) -; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -781,10 +757,10 @@ define @vp_floor_nxv16f16( %va, @vp_floor_nxv16f16_unmasked( %va ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t @@ -845,9 +821,9 @@ declare @llvm.vp.floor.nxv32f16(, @vp_floor_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv32f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) -; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu @@ -870,62 +846,54 @@ define @vp_floor_nxv32f16( %va, @vp_floor_nxv32f16_unmasked( %va ; ; ZVFHMIN-LABEL: vp_floor_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: lui a3, 307200 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v16 +; ZVFHMIN-NEXT: fmv.w.x fa5, a3 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # 
Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t -; ZVFHMIN-NEXT: lui a2, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a2 +; ZVFHMIN-NEXT: vslidedown.vx v12, v24, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v16, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a2, 2 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a2 -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 @@ -1016,12 +974,6 @@ define @vp_floor_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v @@ -1290,9 +1242,9 @@ declare @llvm.vp.floor.nxv2f64(, @vp_floor_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv2f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI36_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -1334,9 +1286,9 @@ declare @llvm.vp.floor.nxv4f64(, @vp_floor_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI38_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -1378,9 +1330,9 @@ declare @llvm.vp.floor.nxv7f64(, @vp_floor_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv7f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI40_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1422,9 +1374,9 @@ declare 
@llvm.vp.floor.nxv8f64(, @vp_floor_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI42_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1475,12 +1427,12 @@ define @vp_floor_nxv16f64( %va, @vp_floor_nxv16f64( %va, @vp_floor_nxv16f64_unmasked( ; CHECK-LABEL: vp_floor_nxv16f64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI45_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3) -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: lui a2, %hi(.LCPI45_0) +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll index c2c2beda94a0b..734dd5e33c4fc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll @@ -144,55 +144,155 @@ define @vfmax_nxv16bf16_vv( %a, @llvm.maximum.nxv32bf16(, ) define @vfmax_nxv32bf16_vv( %a, %b) nounwind { -; CHECK-LABEL: vfmax_nxv32bf16_vv: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv8r.v v0, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v3, v24, v24 -; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 -; CHECK-NEXT: vmv1r.v v0, v3 -; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: vfmax.vv v8, v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v8, v8 -; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 -; CHECK-NEXT: vfmax.vv v16, v8, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfmax_nxv32bf16_vv: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi sp, sp, -16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add 
a0, a0, a1 +; ZVFH-NEXT: sub sp, sp, a0 +; ZVFH-NEXT: vmv8r.v v24, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vmv8r.v v0, v8 +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v0 +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v3, v16, v16 +; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vmv1r.v v0, v3 +; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v4 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfmax.vv v16, v0, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v7, v24, v24 +; ZVFH-NEXT: vmerge.vvm v16, v8, v24, v0 +; ZVFH-NEXT: vmv1r.v v0, v7 +; ZVFH-NEXT: vmerge.vvm v8, v24, v8, v0 +; ZVFH-NEXT: vfmax.vv v16, v8, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24 +; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: add sp, sp, a0 +; ZVFH-NEXT: addi sp, sp, 16 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmax_nxv32bf16_vv: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv8r.v v0, v8 +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 +; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16 +; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv1r.v v0, v3 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v8, v0 +; ZVFHMIN-NEXT: addi a0, sp, 
16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v4 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmax.vv v16, v0, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 +; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 +; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0 +; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret %v = call @llvm.maximum.nxv32bf16( %a, %b) ret %v } @@ -395,40 +495,62 @@ define @vfmax_nxv32f16_vv( %a, @vfmax_nxv32f16_vv( %a, @vfmax_nxv1f16_vv_nnana( %a, @vfmax_nxv1f16_vv_nnanb( %a, @vfmax_vv_nxv1bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 ; CHECK-NEXT: vmfeq.vv v8, v11, v11 ; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v0, v8 @@ -93,11 +91,9 @@ define @vfmax_vv_nxv2bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 ; CHECK-NEXT: vmfeq.vv v8, v11, v11 ; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0 ; CHECK-NEXT: vmv.v.v v0, v8 @@ -143,11 +139,9 @@ define @vfmax_vv_nxv4bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 ; CHECK-NEXT: vmfeq.vv v8, v12, v12 ; CHECK-NEXT: vmerge.vvm v14, v10, v12, v0 ; CHECK-NEXT: vmv1r.v v0, v8 @@ -193,11 +187,9 @@ define @vfmax_vv_nxv8bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: 
vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v12, v12 ; CHECK-NEXT: vmfeq.vv v8, v16, v16 ; CHECK-NEXT: vmerge.vvm v20, v12, v16, v0 ; CHECK-NEXT: vmv1r.v v0, v8 @@ -264,11 +256,9 @@ define @vfmax_vv_nxv16bf16_unmasked( @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16_unmasked( @vfmax_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -671,11 +685,9 @@ define @vfmax_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 ; ZVFHMIN-NEXT: vmv.v.v v0, v8 @@ -745,11 +757,9 @@ define @vfmax_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -821,11 +831,9 @@ define @vfmax_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 ; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -918,11 +926,9 @@ define @vfmax_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 ; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 ; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 @@ -986,58 +992,64 @@ define @vfmax_vv_nxv32f16( %va, @vfmax_vv_nxv32f16( %va, @vfmax_vv_nxv32f16_unmasked( %va ; 
ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: li a3, 24 +; ZVFHMIN-NEXT: mul a1, a1, a3 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v7, v24, a2 +; ZVFHMIN-NEXT: vslidedown.vx v12, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v13, v24, v24, v0.t +; ZVFHMIN-NEXT: vmv8r.v v0, v16 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 +; ZVFHMIN-NEXT: vmv1r.v v0, v13 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: li a4, 24 +; ZVFHMIN-NEXT: mul a3, a3, a4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v24, v24, v16, v0 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t -; ZVFHMIN-NEXT: vmv4r.v v8, v16 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmfeq.vv v13, v16, v16, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v13 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: li a3, 24 +; ZVFHMIN-NEXT: mul a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v12 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size 
Folded Spill -; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t -; ZVFHMIN-NEXT: vmv1r.v v0, v12 -; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16 +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 +; ZVFHMIN-NEXT: vmfeq.vv v7, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v3 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v0, v16 -; ZVFHMIN-NEXT: vmv8r.v v8, v0 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -1545,99 +1577,55 @@ define @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 +; CHECK-NEXT: sub a4, a2, a1 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v24, (a3) -; CHECK-NEXT: sub a3, a2, a1 -; CHECK-NEXT: sltu a4, a2, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: sltu a3, a2, a4 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a4 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: 
vmfeq.vv v7, v24, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll index 928171e1f9528..21251ee2f3c63 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll @@ -144,55 +144,155 @@ define @vfmin_nxv16bf16_vv( %a, @llvm.minimum.nxv32bf16(, ) define @vfmin_nxv32bf16_vv( %a, %b) nounwind { -; CHECK-LABEL: vfmin_nxv32bf16_vv: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv8r.v v0, v8 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v3, v24, v24 -; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 -; CHECK-NEXT: vmv1r.v v0, v3 -; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: vfmin.vv v8, v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v8, v8 -; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 -; CHECK-NEXT: vfmin.vv v16, v8, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; ZVFH-LABEL: vfmin_nxv32bf16_vv: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi sp, sp, -16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: sub sp, sp, a0 +; ZVFH-NEXT: vmv8r.v v24, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vmv8r.v v0, v8 +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v0 +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v3, v16, v16 +; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vmv1r.v v0, v3 +; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli 
zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v4 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfmin.vv v16, v0, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v7, v24, v24 +; ZVFH-NEXT: vmerge.vvm v16, v8, v24, v0 +; ZVFH-NEXT: vmv1r.v v0, v7 +; ZVFH-NEXT: vmerge.vvm v8, v24, v8, v0 +; ZVFH-NEXT: vfmin.vv v16, v8, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24 +; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: add sp, sp, a0 +; ZVFH-NEXT: addi sp, sp, 16 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_nxv32bf16_vv: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv8r.v v0, v8 +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 +; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16 +; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv1r.v v0, v3 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v8, v0 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v4 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmin.vv v16, v0, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 +; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 +; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0 +; 
ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0 +; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret %v = call @llvm.minimum.nxv32bf16( %a, %b) ret %v } @@ -395,40 +495,62 @@ define @vfmin_nxv32f16_vv( %a, @vfmin_nxv32f16_vv( %a, @vfmin_nxv1f16_vv_nnana( %a, @vfmin_nxv1f16_vv_nnanb( %a, @vfmin_vv_nxv1bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 ; CHECK-NEXT: vmfeq.vv v8, v11, v11 ; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v0, v8 @@ -93,11 +91,9 @@ define @vfmin_vv_nxv2bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 ; CHECK-NEXT: vmfeq.vv v8, v11, v11 ; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0 ; CHECK-NEXT: vmv.v.v v0, v8 @@ -143,11 +139,9 @@ define @vfmin_vv_nxv4bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 ; CHECK-NEXT: vmfeq.vv v8, v12, v12 ; CHECK-NEXT: vmerge.vvm v14, v10, v12, v0 ; CHECK-NEXT: vmv1r.v v0, v8 @@ -193,11 +187,9 @@ define @vfmin_vv_nxv8bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v12, v12 ; CHECK-NEXT: vmfeq.vv v8, v16, v16 ; CHECK-NEXT: vmerge.vvm v20, v12, v16, v0 ; CHECK-NEXT: vmv1r.v v0, v8 @@ -264,11 +256,9 @@ define @vfmin_vv_nxv16bf16_unmasked( @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16_unmasked( @vfmin_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 ; 
ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -671,11 +685,9 @@ define @vfmin_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 ; ZVFHMIN-NEXT: vmv.v.v v0, v8 @@ -745,11 +757,9 @@ define @vfmin_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -821,11 +831,9 @@ define @vfmin_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 ; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 @@ -918,11 +926,9 @@ define @vfmin_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 ; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 ; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 @@ -986,58 +992,64 @@ define @vfmin_vv_nxv32f16( %va, @vfmin_vv_nxv32f16( %va, @vfmin_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: li a3, 24 +; ZVFHMIN-NEXT: mul a1, a1, a3 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v7, v24, a2 +; 
ZVFHMIN-NEXT: vslidedown.vx v12, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v13, v24, v24, v0.t +; ZVFHMIN-NEXT: vmv8r.v v0, v16 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 +; ZVFHMIN-NEXT: vmv1r.v v0, v13 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: li a4, 24 +; ZVFHMIN-NEXT: mul a3, a3, a4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v24, v24, v16, v0 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t -; ZVFHMIN-NEXT: vmv4r.v v8, v16 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmfeq.vv v13, v16, v16, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v13 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: li a3, 24 +; ZVFHMIN-NEXT: mul a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v12 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t -; ZVFHMIN-NEXT: vmv1r.v v0, v12 -; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: 
vl8r.v v16, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16 +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 +; ZVFHMIN-NEXT: vmfeq.vv v7, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v3 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v0, v16 -; ZVFHMIN-NEXT: vmv8r.v v8, v0 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -1545,99 +1577,55 @@ define @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 +; CHECK-NEXT: sub a4, a2, a1 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v24, (a3) -; CHECK-NEXT: sub a3, a2, a1 -; CHECK-NEXT: sltu a4, a2, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: sltu a3, a2, a4 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a4 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll index f22cd77db7a40..7a4695d1c25c1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll @@ -155,10 +155,10 @@ define @nearbyint_nxv1f32( %v) strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -179,10 +179,10 @@ define @nearbyint_nxv2f32( %v) strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; 
CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -203,10 +203,10 @@ define @nearbyint_nxv4f32( %v) strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -227,10 +227,10 @@ define @nearbyint_nxv8f32( %v) strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -251,10 +251,10 @@ define @nearbyint_nxv16f32( %v) stric ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll index 7b3a2ef172790..807a3e460b153 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll @@ -17,9 +17,9 @@ define @nearbyint_nxv1bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: frflags a0 @@ -40,9 +40,9 @@ define @nearbyint_nxv2bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: frflags a0 @@ -63,9 +63,9 @@ define @nearbyint_nxv4bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: frflags a0 @@ -86,9 +86,9 @@ define @nearbyint_nxv8bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: frflags a0 @@ -109,9 +109,9 @@ define @nearbyint_nxv16bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, 
e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: frflags a0 @@ -138,23 +138,23 @@ define @nearbyint_nxv32bf16( %x) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24 +; CHECK-NEXT: vfabs.v v8, v16 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfabs.v v8, v24 +; CHECK-NEXT: vmflt.vf v7, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill @@ -199,9 +199,9 @@ define @nearbyint_nxv1f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 @@ -238,9 +238,9 @@ define @nearbyint_nxv2f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 @@ -277,9 +277,9 @@ define @nearbyint_nxv4f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 @@ -316,9 +316,9 @@ define @nearbyint_nxv8f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 @@ -355,9 +355,9 @@ define @nearbyint_nxv16f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, 
zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 @@ -400,23 +400,23 @@ define @nearbyint_nxv32f16( %x) { ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 -; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 ; ZVFHMIN-NEXT: frflags a0 -; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t -; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t -; ZVFHMIN-NEXT: fsflags a0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v24 +; ZVFHMIN-NEXT: vfabs.v v8, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfabs.v v8, v24 +; ZVFHMIN-NEXT: vmflt.vf v7, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: fsflags a0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: frflags a0 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll index 5bc1ab9820d6c..2fda344690bfc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll @@ -282,10 +282,10 @@ define float @reduce_fadd4(float %x, float %y, <4 x float> %v, <4 x float> %w) { ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.s.f v10, fa1 +; CHECK-NEXT: vfredusum.vs v9, v9, v10 ; CHECK-NEXT: vfmv.f.s fa5, v8 -; CHECK-NEXT: vfmv.s.f v8, fa1 -; CHECK-NEXT: vfredusum.vs v8, v9, v8 -; CHECK-NEXT: vfmv.f.s fa4, v8 +; CHECK-NEXT: vfmv.f.s fa4, v9 ; CHECK-NEXT: fdiv.s fa0, fa5, fa4 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll index 4aa26d6b79ca4..025874a1a74e2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll @@ -7,10 +7,10 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) { ; RV32-LABEL: test: ; RV32: # %bb.0: # %entry -; RV32-NEXT: th.lbib a3, (a1), -1, 0 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: addi a3, a2, 1 +; RV32-NEXT: th.lbib a4, (a1), -1, 0 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v8, a4 ; RV32-NEXT: vmv.s.x v9, zero ; RV32-NEXT: vsetvli zero, a3, e8, mf2, tu, ma ; RV32-NEXT: vslideup.vx v8, v9, a2 @@ -33,10 +33,10 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) { ; ; RV64-LABEL: test: ; RV64: # %bb.0: # %entry -; RV64-NEXT: th.lbib a3, (a1), -1, 0 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmv.v.x v8, a3 ; RV64-NEXT: addi a3, a2, 1 +; RV64-NEXT: th.lbib a4, (a1), -1, 0 +; 
RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v8, a4 ; RV64-NEXT: vmv.s.x v9, zero ; RV64-NEXT: vsetvli zero, a3, e8, mf2, tu, ma ; RV64-NEXT: vslideup.vx v8, v9, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll index 3cb1be1b7e71a..f6598606b09f1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll @@ -104,8 +104,8 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-NOV-NEXT: sgtz a2, a1 ; CHECK-NOV-NEXT: sgtz a3, a0 ; CHECK-NOV-NEXT: neg a3, a3 -; CHECK-NOV-NEXT: and a0, a3, a0 ; CHECK-NOV-NEXT: neg a2, a2 +; CHECK-NOV-NEXT: and a0, a3, a0 ; CHECK-NOV-NEXT: and a1, a2, a1 ; CHECK-NOV-NEXT: ret ; @@ -275,12 +275,12 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NOV-NEXT: sgtz a7, a3 ; CHECK-NOV-NEXT: sgtz t0, a5 ; CHECK-NOV-NEXT: negw t0, t0 -; CHECK-NOV-NEXT: and a5, t0, a5 ; CHECK-NOV-NEXT: negw a7, a7 -; CHECK-NOV-NEXT: and a3, a7, a3 ; CHECK-NOV-NEXT: negw a6, a6 -; CHECK-NOV-NEXT: and a2, a6, a2 ; CHECK-NOV-NEXT: negw a4, a4 +; CHECK-NOV-NEXT: and a5, t0, a5 +; CHECK-NOV-NEXT: and a3, a7, a3 +; CHECK-NOV-NEXT: and a2, a6, a2 ; CHECK-NOV-NEXT: and a1, a4, a1 ; CHECK-NOV-NEXT: sw a5, 0(a0) ; CHECK-NOV-NEXT: sw a3, 4(a0) @@ -756,12 +756,12 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: sgtz a5, a1 ; CHECK-NOV-NEXT: sgtz a6, a3 ; CHECK-NOV-NEXT: negw a6, a6 -; CHECK-NOV-NEXT: and a3, a6, a3 ; CHECK-NOV-NEXT: negw a5, a5 -; CHECK-NOV-NEXT: and a1, a5, a1 ; CHECK-NOV-NEXT: negw a4, a4 -; CHECK-NOV-NEXT: and a4, a4, s1 ; CHECK-NOV-NEXT: negw a2, a2 +; CHECK-NOV-NEXT: and a3, a6, a3 +; CHECK-NOV-NEXT: and a1, a5, a1 +; CHECK-NOV-NEXT: and a4, a4, s1 ; CHECK-NOV-NEXT: and a0, a2, a0 ; CHECK-NOV-NEXT: sw a3, 0(s0) ; CHECK-NOV-NEXT: sw a1, 4(s0) @@ -992,8 +992,8 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) { ; CHECK-NOV-NEXT: sgtz a2, a1 ; CHECK-NOV-NEXT: sgtz a3, a0 ; CHECK-NOV-NEXT: neg a3, a3 -; CHECK-NOV-NEXT: and a0, a3, a0 ; CHECK-NOV-NEXT: neg a2, a2 +; CHECK-NOV-NEXT: and a0, a3, a0 ; CHECK-NOV-NEXT: and a1, a2, a1 ; CHECK-NOV-NEXT: ret ; @@ -1167,12 +1167,12 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) { ; CHECK-NOV-NEXT: sgtz a7, a3 ; CHECK-NOV-NEXT: sgtz t0, a5 ; CHECK-NOV-NEXT: negw t0, t0 -; CHECK-NOV-NEXT: and a5, t0, a5 ; CHECK-NOV-NEXT: negw a7, a7 -; CHECK-NOV-NEXT: and a3, a7, a3 ; CHECK-NOV-NEXT: negw a6, a6 -; CHECK-NOV-NEXT: and a2, a6, a2 ; CHECK-NOV-NEXT: negw a4, a4 +; CHECK-NOV-NEXT: and a5, t0, a5 +; CHECK-NOV-NEXT: and a3, a7, a3 +; CHECK-NOV-NEXT: and a2, a6, a2 ; CHECK-NOV-NEXT: and a1, a4, a1 ; CHECK-NOV-NEXT: sh a5, 0(a0) ; CHECK-NOV-NEXT: sh a3, 2(a0) @@ -2006,56 +2006,56 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NOV-NEXT: lui a3, 16 -; CHECK-NOV-NEXT: addiw a3, a3, -1 -; CHECK-NOV-NEXT: bge a0, a3, .LBB17_10 +; CHECK-NOV-NEXT: lui a4, 16 +; CHECK-NOV-NEXT: addiw a4, a4, -1 +; CHECK-NOV-NEXT: bge a0, a4, .LBB17_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NOV-NEXT: bge s1, a3, .LBB17_11 +; CHECK-NOV-NEXT: bge s1, a4, .LBB17_11 ; CHECK-NOV-NEXT: .LBB17_2: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NOV-NEXT: bge a1, a3, .LBB17_12 +; CHECK-NOV-NEXT: bge a1, a4, .LBB17_12 ; CHECK-NOV-NEXT: .LBB17_3: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a4, fs3, rtz -; CHECK-NOV-NEXT: bge 
a2, a3, .LBB17_13 +; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz +; CHECK-NOV-NEXT: bge a2, a4, .LBB17_13 ; CHECK-NOV-NEXT: .LBB17_4: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz -; CHECK-NOV-NEXT: bge a4, a3, .LBB17_14 +; CHECK-NOV-NEXT: bge a3, a4, .LBB17_14 ; CHECK-NOV-NEXT: .LBB17_5: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a6, fs1, rtz -; CHECK-NOV-NEXT: bge a5, a3, .LBB17_15 +; CHECK-NOV-NEXT: bge a5, a4, .LBB17_15 ; CHECK-NOV-NEXT: .LBB17_6: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz -; CHECK-NOV-NEXT: bge a6, a3, .LBB17_16 +; CHECK-NOV-NEXT: bge a6, a4, .LBB17_16 ; CHECK-NOV-NEXT: .LBB17_7: # %entry -; CHECK-NOV-NEXT: blt a7, a3, .LBB17_9 +; CHECK-NOV-NEXT: blt a7, a4, .LBB17_9 ; CHECK-NOV-NEXT: .LBB17_8: # %entry -; CHECK-NOV-NEXT: mv a7, a3 +; CHECK-NOV-NEXT: mv a7, a4 ; CHECK-NOV-NEXT: .LBB17_9: # %entry -; CHECK-NOV-NEXT: sgtz a3, a0 +; CHECK-NOV-NEXT: sgtz a4, a0 ; CHECK-NOV-NEXT: sgtz t0, s1 ; CHECK-NOV-NEXT: sgtz t1, a1 ; CHECK-NOV-NEXT: sgtz t2, a2 -; CHECK-NOV-NEXT: sgtz t3, a4 +; CHECK-NOV-NEXT: sgtz t3, a3 ; CHECK-NOV-NEXT: sgtz t4, a5 ; CHECK-NOV-NEXT: sgtz t5, a6 ; CHECK-NOV-NEXT: sgtz t6, a7 ; CHECK-NOV-NEXT: negw t6, t6 -; CHECK-NOV-NEXT: and a7, t6, a7 ; CHECK-NOV-NEXT: negw t5, t5 -; CHECK-NOV-NEXT: and a6, t5, a6 ; CHECK-NOV-NEXT: negw t4, t4 -; CHECK-NOV-NEXT: and a5, t4, a5 ; CHECK-NOV-NEXT: negw t3, t3 -; CHECK-NOV-NEXT: and a4, t3, a4 ; CHECK-NOV-NEXT: negw t2, t2 -; CHECK-NOV-NEXT: and a2, t2, a2 ; CHECK-NOV-NEXT: negw t1, t1 -; CHECK-NOV-NEXT: and a1, t1, a1 ; CHECK-NOV-NEXT: negw t0, t0 +; CHECK-NOV-NEXT: negw a4, a4 +; CHECK-NOV-NEXT: and a7, t6, a7 +; CHECK-NOV-NEXT: and a6, t5, a6 +; CHECK-NOV-NEXT: and a5, t4, a5 +; CHECK-NOV-NEXT: and a3, t3, a3 +; CHECK-NOV-NEXT: and a2, t2, a2 +; CHECK-NOV-NEXT: and a1, t1, a1 ; CHECK-NOV-NEXT: and t0, t0, s1 -; CHECK-NOV-NEXT: negw a3, a3 -; CHECK-NOV-NEXT: and a0, a3, a0 +; CHECK-NOV-NEXT: and a0, a4, a0 ; CHECK-NOV-NEXT: sh a2, 8(s0) ; CHECK-NOV-NEXT: sh a1, 10(s0) ; CHECK-NOV-NEXT: sh t0, 12(s0) @@ -2063,7 +2063,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: sh a7, 0(s0) ; CHECK-NOV-NEXT: sh a6, 2(s0) ; CHECK-NOV-NEXT: sh a5, 4(s0) -; CHECK-NOV-NEXT: sh a4, 6(s0) +; CHECK-NOV-NEXT: sh a3, 6(s0) ; CHECK-NOV-NEXT: ld ra, 120(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 112(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 104(sp) # 8-byte Folded Reload @@ -2101,32 +2101,32 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB17_10: # %entry ; CHECK-NOV-NEXT: .cfi_restore_state -; CHECK-NOV-NEXT: mv a0, a3 +; CHECK-NOV-NEXT: mv a0, a4 ; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NOV-NEXT: blt s1, a3, .LBB17_2 +; CHECK-NOV-NEXT: blt s1, a4, .LBB17_2 ; CHECK-NOV-NEXT: .LBB17_11: # %entry -; CHECK-NOV-NEXT: mv s1, a3 +; CHECK-NOV-NEXT: mv s1, a4 ; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NOV-NEXT: blt a1, a3, .LBB17_3 +; CHECK-NOV-NEXT: blt a1, a4, .LBB17_3 ; CHECK-NOV-NEXT: .LBB17_12: # %entry -; CHECK-NOV-NEXT: mv a1, a3 -; CHECK-NOV-NEXT: fcvt.l.s a4, fs3, rtz -; CHECK-NOV-NEXT: blt a2, a3, .LBB17_4 +; CHECK-NOV-NEXT: mv a1, a4 +; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz +; CHECK-NOV-NEXT: blt a2, a4, .LBB17_4 ; CHECK-NOV-NEXT: .LBB17_13: # %entry -; CHECK-NOV-NEXT: mv a2, a3 +; CHECK-NOV-NEXT: mv a2, a4 ; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz -; CHECK-NOV-NEXT: blt a4, a3, .LBB17_5 +; CHECK-NOV-NEXT: blt a3, a4, .LBB17_5 ; CHECK-NOV-NEXT: .LBB17_14: # %entry -; CHECK-NOV-NEXT: mv a4, a3 +; CHECK-NOV-NEXT: mv a3, a4 ; CHECK-NOV-NEXT: 
fcvt.l.s a6, fs1, rtz -; CHECK-NOV-NEXT: blt a5, a3, .LBB17_6 +; CHECK-NOV-NEXT: blt a5, a4, .LBB17_6 ; CHECK-NOV-NEXT: .LBB17_15: # %entry -; CHECK-NOV-NEXT: mv a5, a3 +; CHECK-NOV-NEXT: mv a5, a4 ; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz -; CHECK-NOV-NEXT: blt a6, a3, .LBB17_7 +; CHECK-NOV-NEXT: blt a6, a4, .LBB17_7 ; CHECK-NOV-NEXT: .LBB17_16: # %entry -; CHECK-NOV-NEXT: mv a6, a3 -; CHECK-NOV-NEXT: bge a7, a3, .LBB17_8 +; CHECK-NOV-NEXT: mv a6, a4 +; CHECK-NOV-NEXT: bge a7, a4, .LBB17_8 ; CHECK-NOV-NEXT: j .LBB17_9 ; ; CHECK-V-LABEL: ustest_f16i16: @@ -2510,8 +2510,8 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) { ; CHECK-NOV-NEXT: snez a1, a1 ; CHECK-NOV-NEXT: snez a2, s1 ; CHECK-NOV-NEXT: addi a2, a2, -1 -; CHECK-NOV-NEXT: and a2, a2, s0 ; CHECK-NOV-NEXT: addi a1, a1, -1 +; CHECK-NOV-NEXT: and a2, a2, s0 ; CHECK-NOV-NEXT: and a1, a1, a0 ; CHECK-NOV-NEXT: mv a0, a2 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -2552,8 +2552,8 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) { ; CHECK-V-NEXT: snez a1, a1 ; CHECK-V-NEXT: snez a2, s1 ; CHECK-V-NEXT: addi a2, a2, -1 -; CHECK-V-NEXT: and a2, a2, s0 ; CHECK-V-NEXT: addi a1, a1, -1 +; CHECK-V-NEXT: and a2, a2, s0 ; CHECK-V-NEXT: and a0, a1, a0 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 @@ -2614,23 +2614,23 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) { ; CHECK-NOV-NEXT: and a3, a3, a0 ; CHECK-NOV-NEXT: beqz a1, .LBB20_7 ; CHECK-NOV-NEXT: # %bb.5: # %entry -; CHECK-NOV-NEXT: sgtz a1, a1 -; CHECK-NOV-NEXT: and a4, a4, s0 +; CHECK-NOV-NEXT: sgtz a0, a1 +; CHECK-NOV-NEXT: and a1, a4, s0 ; CHECK-NOV-NEXT: bnez a2, .LBB20_8 ; CHECK-NOV-NEXT: .LBB20_6: -; CHECK-NOV-NEXT: snez a0, a4 +; CHECK-NOV-NEXT: snez a2, a1 ; CHECK-NOV-NEXT: j .LBB20_9 ; CHECK-NOV-NEXT: .LBB20_7: -; CHECK-NOV-NEXT: snez a1, a3 -; CHECK-NOV-NEXT: and a4, a4, s0 +; CHECK-NOV-NEXT: snez a0, a3 +; CHECK-NOV-NEXT: and a1, a4, s0 ; CHECK-NOV-NEXT: beqz a2, .LBB20_6 ; CHECK-NOV-NEXT: .LBB20_8: # %entry -; CHECK-NOV-NEXT: sgtz a0, a2 +; CHECK-NOV-NEXT: sgtz a2, a2 ; CHECK-NOV-NEXT: .LBB20_9: # %entry -; CHECK-NOV-NEXT: neg a0, a0 -; CHECK-NOV-NEXT: and a0, a0, a4 -; CHECK-NOV-NEXT: neg a1, a1 -; CHECK-NOV-NEXT: and a1, a1, a3 +; CHECK-NOV-NEXT: neg a2, a2 +; CHECK-NOV-NEXT: neg a4, a0 +; CHECK-NOV-NEXT: and a0, a2, a1 +; CHECK-NOV-NEXT: and a1, a4, a3 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -2696,8 +2696,8 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) { ; CHECK-V-NEXT: sgtz a2, a2 ; CHECK-V-NEXT: .LBB20_9: # %entry ; CHECK-V-NEXT: neg a2, a2 -; CHECK-V-NEXT: and a2, a2, a3 ; CHECK-V-NEXT: neg a1, a1 +; CHECK-V-NEXT: and a2, a2, a3 ; CHECK-V-NEXT: and a0, a1, a0 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 @@ -2932,8 +2932,8 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) { ; CHECK-NOV-NEXT: snez a1, a1 ; CHECK-NOV-NEXT: snez a2, s1 ; CHECK-NOV-NEXT: addi a2, a2, -1 -; CHECK-NOV-NEXT: and a2, a2, s0 ; CHECK-NOV-NEXT: addi a1, a1, -1 +; CHECK-NOV-NEXT: and a2, a2, s0 ; CHECK-NOV-NEXT: and a1, a1, a0 ; CHECK-NOV-NEXT: mv a0, a2 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -2974,8 +2974,8 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) { ; CHECK-V-NEXT: snez a1, a1 ; CHECK-V-NEXT: snez a2, s1 ; CHECK-V-NEXT: addi a2, a2, -1 -; CHECK-V-NEXT: and a2, a2, s0 ; CHECK-V-NEXT: addi a1, a1, -1 +; CHECK-V-NEXT: and a2, a2, s0 ; CHECK-V-NEXT: and a0, 
a1, a0 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 @@ -3036,23 +3036,23 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) { ; CHECK-NOV-NEXT: and a3, a3, a0 ; CHECK-NOV-NEXT: beqz a1, .LBB23_7 ; CHECK-NOV-NEXT: # %bb.5: # %entry -; CHECK-NOV-NEXT: sgtz a1, a1 -; CHECK-NOV-NEXT: and a4, a4, s0 +; CHECK-NOV-NEXT: sgtz a0, a1 +; CHECK-NOV-NEXT: and a1, a4, s0 ; CHECK-NOV-NEXT: bnez a2, .LBB23_8 ; CHECK-NOV-NEXT: .LBB23_6: -; CHECK-NOV-NEXT: snez a0, a4 +; CHECK-NOV-NEXT: snez a2, a1 ; CHECK-NOV-NEXT: j .LBB23_9 ; CHECK-NOV-NEXT: .LBB23_7: -; CHECK-NOV-NEXT: snez a1, a3 -; CHECK-NOV-NEXT: and a4, a4, s0 +; CHECK-NOV-NEXT: snez a0, a3 +; CHECK-NOV-NEXT: and a1, a4, s0 ; CHECK-NOV-NEXT: beqz a2, .LBB23_6 ; CHECK-NOV-NEXT: .LBB23_8: # %entry -; CHECK-NOV-NEXT: sgtz a0, a2 +; CHECK-NOV-NEXT: sgtz a2, a2 ; CHECK-NOV-NEXT: .LBB23_9: # %entry -; CHECK-NOV-NEXT: neg a0, a0 -; CHECK-NOV-NEXT: and a0, a0, a4 -; CHECK-NOV-NEXT: neg a1, a1 -; CHECK-NOV-NEXT: and a1, a1, a3 +; CHECK-NOV-NEXT: neg a2, a2 +; CHECK-NOV-NEXT: neg a4, a0 +; CHECK-NOV-NEXT: and a0, a2, a1 +; CHECK-NOV-NEXT: and a1, a4, a3 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -3118,8 +3118,8 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) { ; CHECK-V-NEXT: sgtz a2, a2 ; CHECK-V-NEXT: .LBB23_9: # %entry ; CHECK-V-NEXT: neg a2, a2 -; CHECK-V-NEXT: and a2, a2, a3 ; CHECK-V-NEXT: neg a1, a1 +; CHECK-V-NEXT: and a2, a2, a3 ; CHECK-V-NEXT: and a0, a1, a0 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 @@ -3357,8 +3357,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) { ; CHECK-NOV-NEXT: snez a1, a1 ; CHECK-NOV-NEXT: snez a2, s2 ; CHECK-NOV-NEXT: addi a2, a2, -1 -; CHECK-NOV-NEXT: and a2, a2, s1 ; CHECK-NOV-NEXT: addi a1, a1, -1 +; CHECK-NOV-NEXT: and a2, a2, s1 ; CHECK-NOV-NEXT: and a1, a1, a0 ; CHECK-NOV-NEXT: mv a0, a2 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -3397,8 +3397,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) { ; CHECK-V-NEXT: snez a1, a1 ; CHECK-V-NEXT: snez a2, s2 ; CHECK-V-NEXT: addi a2, a2, -1 -; CHECK-V-NEXT: and a2, a2, s1 ; CHECK-V-NEXT: addi a1, a1, -1 +; CHECK-V-NEXT: and a2, a2, s1 ; CHECK-V-NEXT: and a0, a1, a0 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v9, a0 @@ -3461,23 +3461,23 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) { ; CHECK-NOV-NEXT: and a3, a3, a0 ; CHECK-NOV-NEXT: beqz a1, .LBB26_7 ; CHECK-NOV-NEXT: # %bb.5: # %entry -; CHECK-NOV-NEXT: sgtz a1, a1 -; CHECK-NOV-NEXT: and a4, a4, s0 +; CHECK-NOV-NEXT: sgtz a0, a1 +; CHECK-NOV-NEXT: and a1, a4, s0 ; CHECK-NOV-NEXT: bnez a2, .LBB26_8 ; CHECK-NOV-NEXT: .LBB26_6: -; CHECK-NOV-NEXT: snez a0, a4 +; CHECK-NOV-NEXT: snez a2, a1 ; CHECK-NOV-NEXT: j .LBB26_9 ; CHECK-NOV-NEXT: .LBB26_7: -; CHECK-NOV-NEXT: snez a1, a3 -; CHECK-NOV-NEXT: and a4, a4, s0 +; CHECK-NOV-NEXT: snez a0, a3 +; CHECK-NOV-NEXT: and a1, a4, s0 ; CHECK-NOV-NEXT: beqz a2, .LBB26_6 ; CHECK-NOV-NEXT: .LBB26_8: # %entry -; CHECK-NOV-NEXT: sgtz a0, a2 +; CHECK-NOV-NEXT: sgtz a2, a2 ; CHECK-NOV-NEXT: .LBB26_9: # %entry -; CHECK-NOV-NEXT: neg a0, a0 -; CHECK-NOV-NEXT: and a0, a0, a4 -; CHECK-NOV-NEXT: neg a1, a1 -; CHECK-NOV-NEXT: and a1, a1, a3 +; CHECK-NOV-NEXT: neg a2, a2 +; CHECK-NOV-NEXT: neg a4, a0 +; CHECK-NOV-NEXT: and a0, a2, a1 +; CHECK-NOV-NEXT: and a1, a4, a3 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 
8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -3541,8 +3541,8 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) { ; CHECK-V-NEXT: sgtz a2, a2 ; CHECK-V-NEXT: .LBB26_9: # %entry ; CHECK-V-NEXT: neg a2, a2 -; CHECK-V-NEXT: and a2, a2, a3 ; CHECK-V-NEXT: neg a1, a1 +; CHECK-V-NEXT: and a2, a2, a3 ; CHECK-V-NEXT: and a0, a1, a0 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v9, a0 @@ -6050,11 +6050,11 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) { ; CHECK-NOV-NEXT: fmv.d fa0, fs0 ; CHECK-NOV-NEXT: call __fixunsdfti ; CHECK-NOV-NEXT: snez a1, a1 +; CHECK-NOV-NEXT: snez a2, s1 ; CHECK-NOV-NEXT: addi a1, a1, -1 +; CHECK-NOV-NEXT: addi a2, a2, -1 ; CHECK-NOV-NEXT: and a0, a1, a0 -; CHECK-NOV-NEXT: snez a1, s1 -; CHECK-NOV-NEXT: addi a1, a1, -1 -; CHECK-NOV-NEXT: and a1, a1, s0 +; CHECK-NOV-NEXT: and a1, a2, s0 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -6094,13 +6094,13 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) { ; CHECK-V-NEXT: vfmv.f.s fa0, v8 ; CHECK-V-NEXT: call __fixunsdfti ; CHECK-V-NEXT: snez a1, a1 +; CHECK-V-NEXT: snez a2, s1 ; CHECK-V-NEXT: addi a1, a1, -1 +; CHECK-V-NEXT: addi a2, a2, -1 ; CHECK-V-NEXT: and a0, a1, a0 -; CHECK-V-NEXT: snez a1, s1 -; CHECK-V-NEXT: addi a1, a1, -1 -; CHECK-V-NEXT: and a1, a1, s0 +; CHECK-V-NEXT: and a2, a2, s0 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a1 +; CHECK-V-NEXT: vmv.s.x v8, a2 ; CHECK-V-NEXT: vmv.s.x v9, a0 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb @@ -6152,16 +6152,16 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) { ; CHECK-NOV-NEXT: li a3, 1 ; CHECK-NOV-NEXT: .LBB47_4: # %entry ; CHECK-NOV-NEXT: slti a1, a1, 1 -; CHECK-NOV-NEXT: neg a1, a1 -; CHECK-NOV-NEXT: and a1, a1, a0 -; CHECK-NOV-NEXT: slti a0, s1, 1 -; CHECK-NOV-NEXT: neg a0, a0 -; CHECK-NOV-NEXT: and a0, a0, s0 +; CHECK-NOV-NEXT: slti a4, s1, 1 ; CHECK-NOV-NEXT: slti a3, a3, 0 -; CHECK-NOV-NEXT: addi a3, a3, -1 -; CHECK-NOV-NEXT: and a0, a3, a0 ; CHECK-NOV-NEXT: slti a2, a2, 0 +; CHECK-NOV-NEXT: neg a1, a1 +; CHECK-NOV-NEXT: neg a4, a4 +; CHECK-NOV-NEXT: addi a3, a3, -1 ; CHECK-NOV-NEXT: addi a2, a2, -1 +; CHECK-NOV-NEXT: and a1, a1, a0 +; CHECK-NOV-NEXT: and a0, a4, s0 +; CHECK-NOV-NEXT: and a0, a3, a0 ; CHECK-NOV-NEXT: and a1, a2, a1 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -6209,20 +6209,20 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) { ; CHECK-V-NEXT: li a3, 1 ; CHECK-V-NEXT: .LBB47_4: # %entry ; CHECK-V-NEXT: slti a1, a1, 1 -; CHECK-V-NEXT: neg a1, a1 -; CHECK-V-NEXT: and a0, a1, a0 -; CHECK-V-NEXT: slti a1, s1, 1 -; CHECK-V-NEXT: neg a1, a1 -; CHECK-V-NEXT: and a1, a1, s0 +; CHECK-V-NEXT: slti a4, s1, 1 ; CHECK-V-NEXT: slti a3, a3, 0 -; CHECK-V-NEXT: addi a3, a3, -1 -; CHECK-V-NEXT: and a1, a3, a1 ; CHECK-V-NEXT: slti a2, a2, 0 +; CHECK-V-NEXT: neg a1, a1 +; CHECK-V-NEXT: neg a4, a4 +; CHECK-V-NEXT: addi a3, a3, -1 ; CHECK-V-NEXT: addi a2, a2, -1 +; CHECK-V-NEXT: and a0, a1, a0 +; CHECK-V-NEXT: and a4, a4, s0 +; CHECK-V-NEXT: and a3, a3, a4 ; CHECK-V-NEXT: and a0, a2, a0 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: vmv.s.x v9, a1 +; CHECK-V-NEXT: vmv.s.x v9, a3 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: add sp, sp, a0 @@ -6454,11 
+6454,11 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) { ; CHECK-NOV-NEXT: fmv.s fa0, fs0 ; CHECK-NOV-NEXT: call __fixunssfti ; CHECK-NOV-NEXT: snez a1, a1 +; CHECK-NOV-NEXT: snez a2, s1 ; CHECK-NOV-NEXT: addi a1, a1, -1 +; CHECK-NOV-NEXT: addi a2, a2, -1 ; CHECK-NOV-NEXT: and a0, a1, a0 -; CHECK-NOV-NEXT: snez a1, s1 -; CHECK-NOV-NEXT: addi a1, a1, -1 -; CHECK-NOV-NEXT: and a1, a1, s0 +; CHECK-NOV-NEXT: and a1, a2, s0 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -6498,13 +6498,13 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) { ; CHECK-V-NEXT: vfmv.f.s fa0, v8 ; CHECK-V-NEXT: call __fixunssfti ; CHECK-V-NEXT: snez a1, a1 +; CHECK-V-NEXT: snez a2, s1 ; CHECK-V-NEXT: addi a1, a1, -1 +; CHECK-V-NEXT: addi a2, a2, -1 ; CHECK-V-NEXT: and a0, a1, a0 -; CHECK-V-NEXT: snez a1, s1 -; CHECK-V-NEXT: addi a1, a1, -1 -; CHECK-V-NEXT: and a1, a1, s0 +; CHECK-V-NEXT: and a2, a2, s0 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a1 +; CHECK-V-NEXT: vmv.s.x v8, a2 ; CHECK-V-NEXT: vmv.s.x v9, a0 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb @@ -6556,16 +6556,16 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) { ; CHECK-NOV-NEXT: li a3, 1 ; CHECK-NOV-NEXT: .LBB50_4: # %entry ; CHECK-NOV-NEXT: slti a1, a1, 1 -; CHECK-NOV-NEXT: neg a1, a1 -; CHECK-NOV-NEXT: and a1, a1, a0 -; CHECK-NOV-NEXT: slti a0, s1, 1 -; CHECK-NOV-NEXT: neg a0, a0 -; CHECK-NOV-NEXT: and a0, a0, s0 +; CHECK-NOV-NEXT: slti a4, s1, 1 ; CHECK-NOV-NEXT: slti a3, a3, 0 -; CHECK-NOV-NEXT: addi a3, a3, -1 -; CHECK-NOV-NEXT: and a0, a3, a0 ; CHECK-NOV-NEXT: slti a2, a2, 0 +; CHECK-NOV-NEXT: neg a1, a1 +; CHECK-NOV-NEXT: neg a4, a4 +; CHECK-NOV-NEXT: addi a3, a3, -1 ; CHECK-NOV-NEXT: addi a2, a2, -1 +; CHECK-NOV-NEXT: and a1, a1, a0 +; CHECK-NOV-NEXT: and a0, a4, s0 +; CHECK-NOV-NEXT: and a0, a3, a0 ; CHECK-NOV-NEXT: and a1, a2, a1 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -6613,20 +6613,20 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) { ; CHECK-V-NEXT: li a3, 1 ; CHECK-V-NEXT: .LBB50_4: # %entry ; CHECK-V-NEXT: slti a1, a1, 1 -; CHECK-V-NEXT: neg a1, a1 -; CHECK-V-NEXT: and a0, a1, a0 -; CHECK-V-NEXT: slti a1, s1, 1 -; CHECK-V-NEXT: neg a1, a1 -; CHECK-V-NEXT: and a1, a1, s0 +; CHECK-V-NEXT: slti a4, s1, 1 ; CHECK-V-NEXT: slti a3, a3, 0 -; CHECK-V-NEXT: addi a3, a3, -1 -; CHECK-V-NEXT: and a1, a3, a1 ; CHECK-V-NEXT: slti a2, a2, 0 +; CHECK-V-NEXT: neg a1, a1 +; CHECK-V-NEXT: neg a4, a4 +; CHECK-V-NEXT: addi a3, a3, -1 ; CHECK-V-NEXT: addi a2, a2, -1 +; CHECK-V-NEXT: and a0, a1, a0 +; CHECK-V-NEXT: and a4, a4, s0 +; CHECK-V-NEXT: and a3, a3, a4 ; CHECK-V-NEXT: and a0, a2, a0 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: vmv.s.x v9, a1 +; CHECK-V-NEXT: vmv.s.x v9, a3 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: add sp, sp, a0 @@ -6860,11 +6860,11 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) { ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: call __fixunssfti ; CHECK-NOV-NEXT: snez a1, a1 +; CHECK-NOV-NEXT: snez a2, s2 ; CHECK-NOV-NEXT: addi a1, a1, -1 +; CHECK-NOV-NEXT: addi a2, a2, -1 ; CHECK-NOV-NEXT: and a0, a1, a0 -; CHECK-NOV-NEXT: snez a1, s2 -; CHECK-NOV-NEXT: addi a1, a1, -1 -; CHECK-NOV-NEXT: and a1, a1, s1 +; CHECK-NOV-NEXT: and a1, a2, s1 ; CHECK-NOV-NEXT: ld ra, 
24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -6899,13 +6899,13 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) { ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: call __fixunssfti ; CHECK-V-NEXT: snez a1, a1 +; CHECK-V-NEXT: snez a2, s2 ; CHECK-V-NEXT: addi a1, a1, -1 +; CHECK-V-NEXT: addi a2, a2, -1 ; CHECK-V-NEXT: and a0, a1, a0 -; CHECK-V-NEXT: snez a1, s2 -; CHECK-V-NEXT: addi a1, a1, -1 -; CHECK-V-NEXT: and a1, a1, s1 +; CHECK-V-NEXT: and a2, a2, s1 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v9, a1 +; CHECK-V-NEXT: vmv.s.x v9, a2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -6959,16 +6959,16 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) { ; CHECK-NOV-NEXT: li a3, 1 ; CHECK-NOV-NEXT: .LBB53_4: # %entry ; CHECK-NOV-NEXT: slti a1, a1, 1 -; CHECK-NOV-NEXT: neg a1, a1 -; CHECK-NOV-NEXT: and a1, a1, a0 -; CHECK-NOV-NEXT: slti a0, s1, 1 -; CHECK-NOV-NEXT: neg a0, a0 -; CHECK-NOV-NEXT: and a0, a0, s0 +; CHECK-NOV-NEXT: slti a4, s1, 1 ; CHECK-NOV-NEXT: slti a3, a3, 0 -; CHECK-NOV-NEXT: addi a3, a3, -1 -; CHECK-NOV-NEXT: and a0, a3, a0 ; CHECK-NOV-NEXT: slti a2, a2, 0 +; CHECK-NOV-NEXT: neg a1, a1 +; CHECK-NOV-NEXT: neg a4, a4 +; CHECK-NOV-NEXT: addi a3, a3, -1 ; CHECK-NOV-NEXT: addi a2, a2, -1 +; CHECK-NOV-NEXT: and a1, a1, a0 +; CHECK-NOV-NEXT: and a0, a4, s0 +; CHECK-NOV-NEXT: and a0, a3, a0 ; CHECK-NOV-NEXT: and a1, a2, a1 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -7014,20 +7014,20 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) { ; CHECK-V-NEXT: li a3, 1 ; CHECK-V-NEXT: .LBB53_4: # %entry ; CHECK-V-NEXT: slti a1, a1, 1 -; CHECK-V-NEXT: neg a1, a1 -; CHECK-V-NEXT: and a0, a1, a0 -; CHECK-V-NEXT: slti a1, s1, 1 -; CHECK-V-NEXT: neg a1, a1 -; CHECK-V-NEXT: and a1, a1, s0 +; CHECK-V-NEXT: slti a4, s1, 1 ; CHECK-V-NEXT: slti a3, a3, 0 -; CHECK-V-NEXT: addi a3, a3, -1 -; CHECK-V-NEXT: and a1, a3, a1 ; CHECK-V-NEXT: slti a2, a2, 0 +; CHECK-V-NEXT: neg a1, a1 +; CHECK-V-NEXT: neg a4, a4 +; CHECK-V-NEXT: addi a3, a3, -1 ; CHECK-V-NEXT: addi a2, a2, -1 +; CHECK-V-NEXT: and a0, a1, a0 +; CHECK-V-NEXT: and a4, a4, s0 +; CHECK-V-NEXT: and a3, a3, a4 ; CHECK-V-NEXT: and a0, a2, a0 ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v9, a0 -; CHECK-V-NEXT: vmv.s.x v8, a1 +; CHECK-V-NEXT: vmv.s.x v8, a3 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 16(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll index 3fd37384ada9b..5fe59f3b3933d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll @@ -17,9 +17,9 @@ define @rint_nxv1bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -38,9 +38,9 @@ define @rint_nxv2bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, 
m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -59,9 +59,9 @@ define @rint_nxv4bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -80,9 +80,9 @@ define @rint_nxv8bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -101,9 +101,9 @@ define @rint_nxv16bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t @@ -122,9 +122,9 @@ define @rint_nxv32bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t @@ -168,9 +168,9 @@ define @rint_nxv1f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -203,9 +203,9 @@ define @rint_nxv2f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -238,9 +238,9 @@ define @rint_nxv4f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -273,9 +273,9 @@ define @rint_nxv8f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -308,9 +308,9 @@ define @rint_nxv16f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t @@ -343,9 +343,9 @@ define @rint_nxv32f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll index 54f56eadf0034..195ffc50594c3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll @@ -459,12 +459,12 @@ define @test5( %0, ; CHECK-NEXT: fsrmi a2, 0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vfadd.vv v8, v8, v9 +; CHECK-NEXT: lui a0, 66 ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: frrm a0 -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: lui a2, 66 -; CHECK-NEXT: addiw a2, a2, 769 -; CHECK-NEXT: srl a0, a2, a0 +; CHECK-NEXT: addiw a0, a0, 769 +; CHECK-NEXT: frrm a2 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: srl a0, a0, a2 ; CHECK-NEXT: andi a0, a0, 7 ; CHECK-NEXT: vfadd.vv v8, v8, v8 ; CHECK-NEXT: sw a0, 0(a1) @@ -475,12 +475,12 @@ define @test5( %0, ; UNOPT-NEXT: fsrmi a2, 0 ; UNOPT-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; UNOPT-NEXT: vfadd.vv v8, v8, v9 +; UNOPT-NEXT: lui a0, 66 ; UNOPT-NEXT: fsrm a2 -; UNOPT-NEXT: frrm a0 -; UNOPT-NEXT: slli a0, a0, 2 -; UNOPT-NEXT: lui a2, 66 -; UNOPT-NEXT: addiw a2, a2, 769 -; UNOPT-NEXT: srl a0, a2, a0 +; UNOPT-NEXT: addiw a0, a0, 769 +; UNOPT-NEXT: frrm a2 +; UNOPT-NEXT: slli a2, a2, 2 +; UNOPT-NEXT: srl a0, a0, a2 ; UNOPT-NEXT: andi a0, a0, 7 ; UNOPT-NEXT: vfadd.vv v8, v8, v8 ; UNOPT-NEXT: sw a0, 0(a1) @@ -588,8 +588,8 @@ define @after_fsrm4( %0, @after_fsrm4( %0, @round_nxv1f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -172,10 +172,10 @@ define @round_nxv2f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -195,10 +195,10 @@ define @round_nxv4f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -218,10 +218,10 @@ define @round_nxv8f32( %x) strictfp 
{ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -241,10 +241,10 @@ define @round_nxv16f32( %x) strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll index fd834e9eb5275..f7422b279149f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll @@ -19,9 +19,9 @@ define @round_nxv1bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 @@ -42,9 +42,9 @@ define @round_nxv2bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 @@ -65,9 +65,9 @@ define @round_nxv4bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 @@ -88,9 +88,9 @@ define @round_nxv8bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 @@ -111,9 +111,9 @@ define @round_nxv16bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 @@ -134,9 +134,9 @@ define @round_nxv32bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 4 @@ -146,19 +146,21 @@ define @round_nxv32bf16( %x) { ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu 
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: vfabs.v v8, v24 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %a = call @llvm.round.nxv32bf16( %x) ret %a @@ -184,9 +186,9 @@ define @round_nxv1f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -223,9 +225,9 @@ define @round_nxv2f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -262,9 +264,9 @@ define @round_nxv4f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -301,9 +303,9 @@ define @round_nxv8f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -340,9 +342,9 @@ define @round_nxv16f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -379,9 +381,9 @@ define @round_nxv32f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 @@ -391,19 +393,21 @@ define @round_nxv32f16( %x) { 
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: vfabs.v v8, v24 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: fsrmi a0, 4 -; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t ; ZVFHMIN-NEXT: fsrm a0 -; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: ret %a = call @llvm.round.nxv32f16( %x) ret %a diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll index 051939d988f85..c293ac91b63bf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll @@ -149,10 +149,10 @@ define @roundeven_nxv1f32( %x) strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -172,10 +172,10 @@ define @roundeven_nxv2f32( %x) strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -195,10 +195,10 @@ define @roundeven_nxv4f32( %x) strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -218,10 +218,10 @@ define @roundeven_nxv8f32( %x) strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -241,10 +241,10 @@ define @roundeven_nxv16f32( %x) stric ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; 
CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll index 8514658824678..865531b77eb29 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll @@ -18,9 +18,9 @@ define @roundeven_nxv1bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 0 @@ -41,9 +41,9 @@ define @roundeven_nxv2bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 0 @@ -64,9 +64,9 @@ define @roundeven_nxv4bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 0 @@ -87,9 +87,9 @@ define @roundeven_nxv8bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 0 @@ -110,9 +110,9 @@ define @roundeven_nxv16bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 0 @@ -133,9 +133,9 @@ define @roundeven_nxv32bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 0 @@ -145,19 +145,21 @@ define @roundeven_nxv32bf16( %x) { ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: vfabs.v v8, v24 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: 
vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %a = call @llvm.roundeven.nxv32bf16( %x) ret %a @@ -183,9 +185,9 @@ define @roundeven_nxv1f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -222,9 +224,9 @@ define @roundeven_nxv2f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -261,9 +263,9 @@ define @roundeven_nxv4f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -300,9 +302,9 @@ define @roundeven_nxv8f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -339,9 +341,9 @@ define @roundeven_nxv16f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -378,9 +380,9 @@ define @roundeven_nxv32f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 @@ -390,19 +392,21 @@ define @roundeven_nxv32f16( %x) { ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: vfabs.v v8, v24 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vsetvli zero, zero, 
e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: fsrmi a0, 0 -; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t ; ZVFHMIN-NEXT: fsrm a0 -; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: ret %a = call @llvm.roundeven.nxv32f16( %x) ret %a diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll index cf35e9c40b8a7..c7e3c8cb51982 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll @@ -216,15 +216,16 @@ define @fshr_v64i8( %a, ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vl8r.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vsll.vi v16, v8, 1, v0.t ; CHECK-NEXT: vnot.v v8, v24, v0.t ; CHECK-NEXT: vand.vi v8, v8, 7, v0.t ; CHECK-NEXT: vsll.vv v8, v16, v8, v0.t ; CHECK-NEXT: vand.vi v16, v24, 7, v0.t +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsrl.vv v16, v24, v16, v0.t ; CHECK-NEXT: vor.vv v8, v8, v16, v0.t @@ -249,15 +250,16 @@ define @fshl_v64i8( %a, ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vl8r.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vsrl.vi v16, v16, 1, v0.t ; CHECK-NEXT: vnot.v v8, v24, v0.t ; CHECK-NEXT: vand.vi v8, v8, 7, v0.t ; CHECK-NEXT: vsrl.vv v8, v16, v8, v0.t ; CHECK-NEXT: vand.vi v16, v24, 7, v0.t +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsll.vv v16, v24, v16, v0.t ; CHECK-NEXT: vor.vv v8, v16, v8, v0.t @@ -452,15 +454,16 @@ define @fshr_v32i16( %a, @fshl_v32i16( %a, @fshr_v16i32( %a, @fshl_v16i32( %a, @fshr_v7i64( %a, ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vl8re64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vand.vx v8, v24, a0, v0.t @@ -876,10 +880,10 @@ define @fshl_v7i64( %a, ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: 
vl8re64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vand.vx v8, v24, a0, v0.t @@ -912,9 +916,9 @@ define @fshr_v8i64( %a, ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vl8re64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vand.vx v8, v24, a0, v0.t @@ -947,10 +951,10 @@ define @fshl_v8i64( %a, ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vl8re64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: li a0, 63 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vand.vx v8, v24, a0, v0.t @@ -998,45 +1002,61 @@ define @fshr_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, @fshr_v1i4( %a, %b, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vand.vi v10, v10, 15, v0.t +; CHECK-NEXT: li a0, 4 ; CHECK-NEXT: vand.vi v9, v9, 15, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: li a0, 4 ; CHECK-NEXT: vremu.vx v9, v10, a0, v0.t ; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t @@ -1376,10 +1400,10 @@ define @fshl_v1i4( %a, %b, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vand.vi v10, v10, 15, v0.t +; CHECK-NEXT: li a0, 4 ; CHECK-NEXT: vand.vi v9, v9, 15, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: li a0, 4 ; CHECK-NEXT: vremu.vx v9, v10, a0, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v9, v0.t ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl.ll index 8452848c467d3..eae21a76f3f00 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl.ll @@ -7,12 +7,12 @@ define @fshr( %a, %b, @llvm.fshr.v4i32( %a, %b, %c) @@ -24,12 +24,12 @@ define @fshl( %a, %b, @llvm.fshl.v4i32( %a, %b, %c) diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll index d07bc2c6bf74d..8a5f118d8f6ac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll @@ -135,10 +135,10 @@ define @trunc_nxv1f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; 
CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -156,10 +156,10 @@ define @trunc_nxv2f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -177,10 +177,10 @@ define @trunc_nxv4f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t @@ -198,10 +198,10 @@ define @trunc_nxv8f32( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t @@ -219,10 +219,10 @@ define @trunc_nxv16f32( %x) strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll index 2b3c952679eac..d597e166be4ee 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll @@ -17,9 +17,9 @@ define @trunc_nxv1bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t @@ -38,9 +38,9 @@ define @trunc_nxv2bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t @@ -59,9 +59,9 @@ define @trunc_nxv4bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t @@ -80,9 
+80,9 @@ define @trunc_nxv8bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v12, v0.t @@ -101,9 +101,9 @@ define @trunc_nxv16bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v16, v0.t @@ -122,9 +122,9 @@ define @trunc_nxv32bf16( %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v24, v16, v0.t @@ -168,9 +168,9 @@ define @trunc_nxv1f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t @@ -203,9 +203,9 @@ define @trunc_nxv2f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t @@ -238,9 +238,9 @@ define @trunc_nxv4f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t @@ -273,9 +273,9 @@ define @trunc_nxv8f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v12, v0.t @@ -308,9 +308,9 @@ define @trunc_nxv16f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v16, v0.t @@ -343,9 +343,9 @@ define @trunc_nxv32f16( %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; 
ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 ; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v24, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll index e2298774a9b8d..8925a9e0cee32 100644 --- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll @@ -419,9 +419,9 @@ define @insert_nxv4i1_nxv1i1_0( %v, @insert_nxv4i1_nxv1i1_2( %v, @insertelt_idx_nxv1i1( %x, i1 %elt, i6 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: addi a2, a1, 1 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a2, e8, mf8, tu, ma ; CHECK-NEXT: vslideup.vx v8, v9, a1 @@ -59,8 +59,8 @@ define @insertelt_idx_nxv2i1( %x, i1 %elt, i6 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: addi a2, a1, 1 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a2, e8, mf4, tu, ma ; CHECK-NEXT: vslideup.vx v8, v9, a1 @@ -94,8 +94,8 @@ define @insertelt_idx_nxv4i1( %x, i1 %elt, i6 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: addi a2, a1, 1 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a2, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vx v8, v9, a1 @@ -129,8 +129,8 @@ define @insertelt_idx_nxv8i1( %x, i1 %elt, i6 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: addi a2, a1, 1 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a2, e8, m1, tu, ma ; CHECK-NEXT: vslideup.vx v8, v9, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll index e9c78ed08f72a..7e2ec46339b33 100644 --- a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll @@ -9,8 +9,8 @@ define void @interleave256(ptr %agg.result, ptr %0, ptr %1) { ; RV64-1024-NEXT: vsetvli zero, a3, e16, m2, ta, ma ; RV64-1024-NEXT: vle16.v v8, (a1) ; RV64-1024-NEXT: vle16.v v10, (a2) -; RV64-1024-NEXT: vwaddu.vv v12, v8, v10 ; RV64-1024-NEXT: li a1, -1 +; RV64-1024-NEXT: vwaddu.vv v12, v8, v10 ; RV64-1024-NEXT: vwmaccu.vx v12, a1, v10 ; RV64-1024-NEXT: li a1, 256 ; RV64-1024-NEXT: vsetvli zero, a1, e16, m4, ta, ma @@ -23,8 +23,8 @@ define void @interleave256(ptr %agg.result, ptr %0, ptr %1) { ; RV64-2048-NEXT: vsetvli zero, a3, e16, m1, ta, ma ; RV64-2048-NEXT: vle16.v v8, (a1) ; RV64-2048-NEXT: vle16.v v9, (a2) -; RV64-2048-NEXT: vwaddu.vv v10, v8, v9 ; RV64-2048-NEXT: li a1, -1 +; RV64-2048-NEXT: vwaddu.vv v10, v8, v9 ; RV64-2048-NEXT: vwmaccu.vx v10, a1, v9 ; RV64-2048-NEXT: li a1, 256 ; RV64-2048-NEXT: vsetvli zero, a1, e16, m2, ta, ma @@ -47,8 +47,8 @@ define void @interleave512(ptr %agg.result, ptr %0, ptr %1) local_unnamed_addr { ; RV64-1024-NEXT: vsetvli zero, a3, e16, m4, ta, ma ; RV64-1024-NEXT: vle16.v v8, (a1) ; RV64-1024-NEXT: vle16.v v12, (a2) -; RV64-1024-NEXT: vwaddu.vv v16, v8, v12 ; RV64-1024-NEXT: li a1, -1 +; RV64-1024-NEXT: vwaddu.vv v16, v8, v12 ; RV64-1024-NEXT: vwmaccu.vx v16, a1, v12 ; RV64-1024-NEXT: li 
a1, 512 ; RV64-1024-NEXT: vsetvli zero, a1, e16, m8, ta, ma @@ -61,8 +61,8 @@ define void @interleave512(ptr %agg.result, ptr %0, ptr %1) local_unnamed_addr { ; RV64-2048-NEXT: vsetvli zero, a3, e16, m2, ta, ma ; RV64-2048-NEXT: vle16.v v8, (a1) ; RV64-2048-NEXT: vle16.v v10, (a2) -; RV64-2048-NEXT: vwaddu.vv v12, v8, v10 ; RV64-2048-NEXT: li a1, -1 +; RV64-2048-NEXT: vwaddu.vv v12, v8, v10 ; RV64-2048-NEXT: vwmaccu.vx v12, a1, v10 ; RV64-2048-NEXT: li a1, 512 ; RV64-2048-NEXT: vsetvli zero, a1, e16, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll index e34b4a81b631b..ffb9bf76fb4fa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll @@ -58,12 +58,12 @@ define @llrint_nxv16i64_nxv16f32( %x, < ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 -; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 ; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma ; CHECK-NEXT: vfwcvt.x.f.v v16, v12, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB4_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll index c9f91bf9def2c..9991bbc9725ba 100644 --- a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll @@ -120,12 +120,12 @@ define @lrint_nxv16f32( %x, @mgather_baseidx_nxv32i8(ptr %base, ; RV64-NEXT: vmv1r.v v16, v0 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v12, (a0), v24, v0.t -; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: srli a2, a1, 3 ; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vx v0, v0, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll index 9ae470c789896..ac26a014aaa64 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll @@ -12,8 +12,8 @@ define void @complex_gep(ptr %p, %vec.ind, ; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32-NEXT: vnsrl.wi v10, v8, 0 ; RV32-NEXT: li a1, 48 -; RV32-NEXT: vmul.vx v8, v10, a1 ; RV32-NEXT: addi a0, a0, 28 +; RV32-NEXT: vmul.vx v8, v10, a1 ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: vsoxei32.v v9, (a0), v8, v0.t ; RV32-NEXT: ret @@ -21,9 +21,9 @@ define void @complex_gep(ptr %p, %vec.ind, ; RV64-LABEL: complex_gep: ; RV64: # %bb.0: ; RV64-NEXT: li a1, 56 +; RV64-NEXT: addi a0, a0, 32 ; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: addi a0, a0, 32 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: vsoxei64.v v10, (a0), v8, v0.t @@ -38,14 +38,14 @@ define void @strided_store_zero_start(i64 %n, ptr %p) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32-NEXT: vid.v v8 +; RV32-NEXT: li a0, 48 +; RV32-NEXT: addi a1, a2, 32 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV32-NEXT: vnsrl.wi v8, v8, 0 -; RV32-NEXT: li a0, 48 ; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: addi a0, a2, 32 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; RV32-NEXT: vmv.v.i v9, 0 -; 
RV32-NEXT: vsoxei32.v v9, (a0), v8 +; RV32-NEXT: vsoxei32.v v9, (a1), v8 ; RV32-NEXT: ret ; ; RV64-LABEL: strided_store_zero_start: @@ -68,25 +68,25 @@ define void @strided_store_offset_start(i64 %n, ptr %p) { ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; RV32-NEXT: vid.v v8 ; RV32-NEXT: vadd.vx v8, v8, a0 +; RV32-NEXT: li a0, 48 +; RV32-NEXT: addi a1, a2, 32 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV32-NEXT: vnsrl.wi v8, v8, 0 -; RV32-NEXT: li a0, 48 ; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: addi a0, a2, 32 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; RV32-NEXT: vmv.v.i v9, 0 -; RV32-NEXT: vsoxei32.v v9, (a0), v8 +; RV32-NEXT: vsoxei32.v v9, (a1), v8 ; RV32-NEXT: ret ; ; RV64-LABEL: strided_store_offset_start: ; RV64: # %bb.0: ; RV64-NEXT: slli a2, a0, 3 ; RV64-NEXT: slli a0, a0, 6 +; RV64-NEXT: vsetvli a3, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: sub a0, a0, a2 ; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: addi a0, a0, 36 -; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: li a1, 56 ; RV64-NEXT: vsse64.v v8, (a0), a1 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll index c7c7dbafd630a..72c251ce985cb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll @@ -1894,36 +1894,57 @@ define void @mscatter_nxv16f64( %val0, %val0, %val0, %val1, ptr %base, %idxs, %m) { ; RV32-LABEL: mscatter_baseidx_nxv16i8_nxv16f64: ; RV32: # %bb.0: -; RV32-NEXT: vl2r.v v6, (a1) -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vsext.vf4 v24, v6 -; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vl2r.v v4, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vx v7, v0, a1 +; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32-NEXT: vsext.vf4 v24, v4 +; RV32-NEXT: vsll.vi v24, v24, 3 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: vmv1r.v v0, v7 @@ -1961,12 +1982,12 @@ define void @mscatter_baseidx_nxv16i8_nxv16f64( %val0, %val0, %val0, %val1, ptr %base, %idxs, %m) { ; RV32-LABEL: mscatter_baseidx_nxv16i16_nxv16f64: ; RV32: # %bb.0: -; RV32-NEXT: vl4re16.v v4, (a1) -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vsext.vf2 v24, v4 -; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vmv8r.v v16, v8 +; RV32-NEXT: vl4re16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vx v7, v0, a1 +; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32-NEXT: vsext.vf2 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t ; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v16, (a0), v12, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; 
RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_nxv16i16_nxv16f64: @@ -2001,12 +2039,12 @@ define void @mscatter_baseidx_nxv16i16_nxv16f64( %val0, @reverse_nxv2i1( %a) { ; RV32-BITS-UNKNOWN: # %bb.0: ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb +; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: srli a0, a0, 2 ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -37,8 +37,8 @@ define @reverse_nxv2i1( %a) { ; RV32-BITS-256: # %bb.0: ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; RV32-BITS-256-NEXT: vmv.v.i v8, 0 -; RV32-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-256-NEXT: csrr a0, vlenb +; RV32-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-256-NEXT: srli a0, a0, 2 ; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vid.v v9 @@ -51,8 +51,8 @@ define @reverse_nxv2i1( %a) { ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; RV32-BITS-512-NEXT: vmv.v.i v8, 0 -; RV32-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-512-NEXT: csrr a0, vlenb +; RV32-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-512-NEXT: srli a0, a0, 2 ; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vid.v v9 @@ -65,8 +65,8 @@ define @reverse_nxv2i1( %a) { ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb +; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: srli a0, a0, 2 ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -81,8 +81,8 @@ define @reverse_nxv2i1( %a) { ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; RV64-BITS-256-NEXT: vmv.v.i v8, 0 -; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-256-NEXT: csrr a0, vlenb +; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-256-NEXT: srli a0, a0, 2 ; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vid.v v9 @@ -95,8 +95,8 @@ define @reverse_nxv2i1( %a) { ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; RV64-BITS-512-NEXT: vmv.v.i v8, 0 -; RV64-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-512-NEXT: csrr a0, vlenb +; RV64-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-512-NEXT: srli a0, a0, 2 ; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vid.v v9 @@ -113,8 +113,8 @@ define @reverse_nxv4i1( %a) { ; RV32-BITS-UNKNOWN: # %bb.0: ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb +; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: srli a0, a0, 1 ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -129,8 +129,8 @@ define @reverse_nxv4i1( %a) { ; RV32-BITS-256: # %bb.0: ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; RV32-BITS-256-NEXT: vmv.v.i v8, 0 -; RV32-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-256-NEXT: csrr a0, vlenb +; RV32-BITS-256-NEXT: 
vmerge.vim v8, v8, 1, v0 ; RV32-BITS-256-NEXT: srli a0, a0, 1 ; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vid.v v9 @@ -143,8 +143,8 @@ define @reverse_nxv4i1( %a) { ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; RV32-BITS-512-NEXT: vmv.v.i v8, 0 -; RV32-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-512-NEXT: csrr a0, vlenb +; RV32-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-512-NEXT: srli a0, a0, 1 ; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vid.v v9 @@ -157,8 +157,8 @@ define @reverse_nxv4i1( %a) { ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb +; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: srli a0, a0, 1 ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -173,8 +173,8 @@ define @reverse_nxv4i1( %a) { ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; RV64-BITS-256-NEXT: vmv.v.i v8, 0 -; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-256-NEXT: csrr a0, vlenb +; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-256-NEXT: srli a0, a0, 1 ; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vid.v v9 @@ -187,8 +187,8 @@ define @reverse_nxv4i1( %a) { ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; RV64-BITS-512-NEXT: vmv.v.i v8, 0 -; RV64-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-512-NEXT: csrr a0, vlenb +; RV64-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-512-NEXT: srli a0, a0, 1 ; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vid.v v9 @@ -205,8 +205,8 @@ define @reverse_nxv8i1( %a) { ; RV32-BITS-UNKNOWN: # %bb.0: ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb +; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vid.v v10 @@ -220,8 +220,8 @@ define @reverse_nxv8i1( %a) { ; RV32-BITS-256: # %bb.0: ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vmv.v.i v8, 0 -; RV32-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-256-NEXT: csrr a0, vlenb +; RV32-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vid.v v9 ; RV32-BITS-256-NEXT: vrsub.vx v9, v9, a0 @@ -233,8 +233,8 @@ define @reverse_nxv8i1( %a) { ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vmv.v.i v8, 0 -; RV32-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-512-NEXT: csrr a0, vlenb +; RV32-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vid.v v9 ; RV32-BITS-512-NEXT: vrsub.vx v9, v9, a0 @@ -246,8 +246,8 @@ define @reverse_nxv8i1( %a) { ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb +; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; 
RV64-BITS-UNKNOWN-NEXT: vid.v v10 @@ -261,8 +261,8 @@ define @reverse_nxv8i1( %a) { ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vmv.v.i v8, 0 -; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-256-NEXT: csrr a0, vlenb +; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vid.v v9 ; RV64-BITS-256-NEXT: vrsub.vx v9, v9, a0 @@ -274,8 +274,8 @@ define @reverse_nxv8i1( %a) { ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vmv.v.i v8, 0 -; RV64-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-512-NEXT: csrr a0, vlenb +; RV64-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vid.v v9 ; RV64-BITS-512-NEXT: vrsub.vx v9, v9, a0 @@ -290,12 +290,14 @@ define @reverse_nxv16i1( %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv16i1: ; RV32-BITS-UNKNOWN: # %bb.0: ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vid.v v8 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v10, 0 +; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v8, v8, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v10, 0 ; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v10, v10, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v8 @@ -307,12 +309,14 @@ define @reverse_nxv16i1( %a) { ; RV32-BITS-256-LABEL: reverse_nxv16i1: ; RV32-BITS-256: # %bb.0: ; RV32-BITS-256-NEXT: csrr a0, vlenb -; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vid.v v8 +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-BITS-256-NEXT: vmv.v.i v10, 0 +; RV32-BITS-256-NEXT: addi a0, a0, -1 +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vrsub.vx v8, v8, a0 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-256-NEXT: vmv.v.i v10, 0 ; RV32-BITS-256-NEXT: vmerge.vim v10, v10, 1, v0 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vrgather.vv v13, v10, v8 @@ -324,12 +328,14 @@ define @reverse_nxv16i1( %a) { ; RV32-BITS-512-LABEL: reverse_nxv16i1: ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: csrr a0, vlenb -; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vid.v v8 +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-BITS-512-NEXT: vmv.v.i v10, 0 +; RV32-BITS-512-NEXT: addi a0, a0, -1 +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vrsub.vx v8, v8, a0 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-512-NEXT: vmv.v.i v10, 0 ; RV32-BITS-512-NEXT: vmerge.vim v10, v10, 1, v0 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vrgather.vv v13, v10, v8 @@ -341,12 +347,14 @@ define @reverse_nxv16i1( %a) { ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv16i1: ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vid.v v8 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m2, 
ta, ma +; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v10, 0 +; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v8, v8, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v10, 0 ; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v10, v10, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v8 @@ -358,12 +366,14 @@ define @reverse_nxv16i1( %a) { ; RV64-BITS-256-LABEL: reverse_nxv16i1: ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: csrr a0, vlenb -; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vid.v v8 +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-BITS-256-NEXT: vmv.v.i v10, 0 +; RV64-BITS-256-NEXT: addi a0, a0, -1 +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vrsub.vx v8, v8, a0 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-256-NEXT: vmv.v.i v10, 0 ; RV64-BITS-256-NEXT: vmerge.vim v10, v10, 1, v0 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vrgather.vv v13, v10, v8 @@ -375,12 +385,14 @@ define @reverse_nxv16i1( %a) { ; RV64-BITS-512-LABEL: reverse_nxv16i1: ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: csrr a0, vlenb -; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vid.v v8 +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-BITS-512-NEXT: vmv.v.i v10, 0 +; RV64-BITS-512-NEXT: addi a0, a0, -1 +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vrsub.vx v8, v8, a0 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-512-NEXT: vmv.v.i v10, 0 ; RV64-BITS-512-NEXT: vmerge.vim v10, v10, 1, v0 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vrgather.vv v13, v10, v8 @@ -396,18 +408,20 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv32i1: ; RV32-BITS-UNKNOWN: # %bb.0: ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vid.v v8 -; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v12, v8, a0 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v12, 0 +; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v16, v8, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v16, v8, 1, v0 +; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v12, v12, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v16, v12 -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v17, v12 -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v18, v12 -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v19, v12 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v12, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v13, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v14, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v15, v16 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-UNKNOWN-NEXT: ret @@ -415,18 +429,20 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-256-LABEL: reverse_nxv32i1: ; RV32-BITS-256: # 
%bb.0: ; RV32-BITS-256-NEXT: csrr a0, vlenb -; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vid.v v8 -; RV32-BITS-256-NEXT: vrsub.vx v12, v8, a0 +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; RV32-BITS-256-NEXT: vmv.v.i v12, 0 +; RV32-BITS-256-NEXT: addi a0, a0, -1 +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-256-NEXT: vrsub.vx v16, v8, a0 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-256-NEXT: vmv.v.i v8, 0 -; RV32-BITS-256-NEXT: vmerge.vim v16, v8, 1, v0 +; RV32-BITS-256-NEXT: vmerge.vim v12, v12, 1, v0 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV32-BITS-256-NEXT: vrgather.vv v11, v16, v12 -; RV32-BITS-256-NEXT: vrgather.vv v10, v17, v12 -; RV32-BITS-256-NEXT: vrgather.vv v9, v18, v12 -; RV32-BITS-256-NEXT: vrgather.vv v8, v19, v12 +; RV32-BITS-256-NEXT: vrgather.vv v11, v12, v16 +; RV32-BITS-256-NEXT: vrgather.vv v10, v13, v16 +; RV32-BITS-256-NEXT: vrgather.vv v9, v14, v16 +; RV32-BITS-256-NEXT: vrgather.vv v8, v15, v16 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV32-BITS-256-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-256-NEXT: ret @@ -434,18 +450,20 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-512-LABEL: reverse_nxv32i1: ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: csrr a0, vlenb -; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vid.v v8 -; RV32-BITS-512-NEXT: vrsub.vx v12, v8, a0 +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; RV32-BITS-512-NEXT: vmv.v.i v12, 0 +; RV32-BITS-512-NEXT: addi a0, a0, -1 +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-512-NEXT: vrsub.vx v16, v8, a0 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-512-NEXT: vmv.v.i v8, 0 -; RV32-BITS-512-NEXT: vmerge.vim v16, v8, 1, v0 +; RV32-BITS-512-NEXT: vmerge.vim v12, v12, 1, v0 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV32-BITS-512-NEXT: vrgather.vv v11, v16, v12 -; RV32-BITS-512-NEXT: vrgather.vv v10, v17, v12 -; RV32-BITS-512-NEXT: vrgather.vv v9, v18, v12 -; RV32-BITS-512-NEXT: vrgather.vv v8, v19, v12 +; RV32-BITS-512-NEXT: vrgather.vv v11, v12, v16 +; RV32-BITS-512-NEXT: vrgather.vv v10, v13, v16 +; RV32-BITS-512-NEXT: vrgather.vv v9, v14, v16 +; RV32-BITS-512-NEXT: vrgather.vv v8, v15, v16 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-512-NEXT: ret @@ -453,18 +471,20 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv32i1: ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vid.v v8 -; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v12, v8, a0 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v12, 0 +; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v16, v8, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v16, v8, 1, v0 +; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v12, v12, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v16, v12 -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v17, v12 -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, 
v18, v12 -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v19, v12 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v12, v16 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v13, v16 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v14, v16 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v15, v16 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-UNKNOWN-NEXT: ret @@ -472,18 +492,20 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-256-LABEL: reverse_nxv32i1: ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: csrr a0, vlenb -; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vid.v v8 -; RV64-BITS-256-NEXT: vrsub.vx v12, v8, a0 +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; RV64-BITS-256-NEXT: vmv.v.i v12, 0 +; RV64-BITS-256-NEXT: addi a0, a0, -1 +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-256-NEXT: vrsub.vx v16, v8, a0 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV64-BITS-256-NEXT: vmv.v.i v8, 0 -; RV64-BITS-256-NEXT: vmerge.vim v16, v8, 1, v0 +; RV64-BITS-256-NEXT: vmerge.vim v12, v12, 1, v0 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV64-BITS-256-NEXT: vrgather.vv v11, v16, v12 -; RV64-BITS-256-NEXT: vrgather.vv v10, v17, v12 -; RV64-BITS-256-NEXT: vrgather.vv v9, v18, v12 -; RV64-BITS-256-NEXT: vrgather.vv v8, v19, v12 +; RV64-BITS-256-NEXT: vrgather.vv v11, v12, v16 +; RV64-BITS-256-NEXT: vrgather.vv v10, v13, v16 +; RV64-BITS-256-NEXT: vrgather.vv v9, v14, v16 +; RV64-BITS-256-NEXT: vrgather.vv v8, v15, v16 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV64-BITS-256-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-256-NEXT: ret @@ -491,18 +513,20 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-512-LABEL: reverse_nxv32i1: ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: csrr a0, vlenb -; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vid.v v8 -; RV64-BITS-512-NEXT: vrsub.vx v12, v8, a0 +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; RV64-BITS-512-NEXT: vmv.v.i v12, 0 +; RV64-BITS-512-NEXT: addi a0, a0, -1 +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-512-NEXT: vrsub.vx v16, v8, a0 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV64-BITS-512-NEXT: vmv.v.i v8, 0 -; RV64-BITS-512-NEXT: vmerge.vim v16, v8, 1, v0 +; RV64-BITS-512-NEXT: vmerge.vim v12, v12, 1, v0 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV64-BITS-512-NEXT: vrgather.vv v11, v16, v12 -; RV64-BITS-512-NEXT: vrgather.vv v10, v17, v12 -; RV64-BITS-512-NEXT: vrgather.vv v9, v18, v12 -; RV64-BITS-512-NEXT: vrgather.vv v8, v19, v12 +; RV64-BITS-512-NEXT: vrgather.vv v11, v12, v16 +; RV64-BITS-512-NEXT: vrgather.vv v10, v13, v16 +; RV64-BITS-512-NEXT: vrgather.vv v9, v14, v16 +; RV64-BITS-512-NEXT: vrgather.vv v8, v15, v16 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-512-NEXT: ret @@ -514,13 +538,15 @@ define @reverse_nxv64i1( %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv64i1: ; RV32-BITS-UNKNOWN: # %bb.0: ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vid.v v8 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v24, 0 +; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, 
m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v16, v8, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v24, v8, 1, v0 +; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v24, v24, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v24, v16 ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v25, v16 @@ -537,13 +563,15 @@ define @reverse_nxv64i1( %a) { ; RV32-BITS-256-LABEL: reverse_nxv64i1: ; RV32-BITS-256: # %bb.0: ; RV32-BITS-256-NEXT: csrr a0, vlenb -; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vid.v v8 +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; RV32-BITS-256-NEXT: vmv.v.i v16, 0 +; RV32-BITS-256-NEXT: addi a0, a0, -1 +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vrsub.vx v24, v8, a0 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV32-BITS-256-NEXT: vmv.v.i v8, 0 -; RV32-BITS-256-NEXT: vmerge.vim v16, v8, 1, v0 +; RV32-BITS-256-NEXT: vmerge.vim v16, v16, 1, v0 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vrgather.vv v15, v16, v24 ; RV32-BITS-256-NEXT: vrgather.vv v14, v17, v24 @@ -560,13 +588,15 @@ define @reverse_nxv64i1( %a) { ; RV32-BITS-512-LABEL: reverse_nxv64i1: ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: csrr a0, vlenb -; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vid.v v8 +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; RV32-BITS-512-NEXT: vmv.v.i v16, 0 +; RV32-BITS-512-NEXT: addi a0, a0, -1 +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vrsub.vx v24, v8, a0 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV32-BITS-512-NEXT: vmv.v.i v8, 0 -; RV32-BITS-512-NEXT: vmerge.vim v16, v8, 1, v0 +; RV32-BITS-512-NEXT: vmerge.vim v16, v16, 1, v0 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vrgather.vv v15, v16, v24 ; RV32-BITS-512-NEXT: vrgather.vv v14, v17, v24 @@ -583,13 +613,15 @@ define @reverse_nxv64i1( %a) { ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv64i1: ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vid.v v8 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v24, 0 +; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v16, v8, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v24, v8, 1, v0 +; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v24, v24, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v24, v16 ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v25, v16 @@ -606,13 +638,15 @@ define @reverse_nxv64i1( %a) { ; RV64-BITS-256-LABEL: reverse_nxv64i1: ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: csrr a0, vlenb -; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vid.v v8 +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; RV64-BITS-256-NEXT: vmv.v.i v16, 0 +; RV64-BITS-256-NEXT: addi a0, a0, -1 +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; 
RV64-BITS-256-NEXT: vrsub.vx v24, v8, a0 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV64-BITS-256-NEXT: vmv.v.i v8, 0 -; RV64-BITS-256-NEXT: vmerge.vim v16, v8, 1, v0 +; RV64-BITS-256-NEXT: vmerge.vim v16, v16, 1, v0 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vrgather.vv v15, v16, v24 ; RV64-BITS-256-NEXT: vrgather.vv v14, v17, v24 @@ -629,13 +663,15 @@ define @reverse_nxv64i1( %a) { ; RV64-BITS-512-LABEL: reverse_nxv64i1: ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: csrr a0, vlenb -; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vid.v v8 +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; RV64-BITS-512-NEXT: vmv.v.i v16, 0 +; RV64-BITS-512-NEXT: addi a0, a0, -1 +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vrsub.vx v24, v8, a0 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV64-BITS-512-NEXT: vmv.v.i v8, 0 -; RV64-BITS-512-NEXT: vmerge.vim v16, v8, 1, v0 +; RV64-BITS-512-NEXT: vmerge.vim v16, v16, 1, v0 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vrgather.vv v15, v16, v24 ; RV64-BITS-512-NEXT: vrgather.vv v14, v17, v24 @@ -1929,10 +1965,10 @@ define @reverse_nxv6i64( %a) { ; CHECK-LABEL: reverse_nxv6i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v22, v16, a0 ; CHECK-NEXT: vrgather.vv v21, v10, v22 ; CHECK-NEXT: vrgather.vv v19, v12, v22 @@ -1967,27 +2003,27 @@ define @reverse_nxv12i64( %a) { ; RV32-NEXT: sub sp, sp, a0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vid.v v20 ; RV32-NEXT: srli a1, a0, 3 ; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma -; RV32-NEXT: vid.v v20 ; RV32-NEXT: vrsub.vx v20, v20, a1 ; RV32-NEXT: vrgather.vv v31, v12, v20 +; RV32-NEXT: vrgather.vv v7, v8, v20 ; RV32-NEXT: vrgather.vv v30, v13, v20 +; RV32-NEXT: vrgather.vv v6, v9, v20 ; RV32-NEXT: vrgather.vv v29, v14, v20 +; RV32-NEXT: vrgather.vv v5, v10, v20 ; RV32-NEXT: vrgather.vv v28, v15, v20 +; RV32-NEXT: vrgather.vv v4, v11, v20 +; RV32-NEXT: addi a1, sp, 64 +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: vrgather.vv v27, v16, v20 +; RV32-NEXT: vs4r.v v4, (a0) ; RV32-NEXT: vrgather.vv v26, v17, v20 ; RV32-NEXT: vrgather.vv v25, v18, v20 ; RV32-NEXT: vrgather.vv v24, v19, v20 -; RV32-NEXT: vrgather.vv v15, v8, v20 -; RV32-NEXT: vrgather.vv v14, v9, v20 -; RV32-NEXT: vrgather.vv v13, v10, v20 -; RV32-NEXT: vrgather.vv v12, v11, v20 -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: addi a1, sp, 64 -; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: vs4r.v v12, (a0) ; RV32-NEXT: vs8r.v v24, (a1) ; RV32-NEXT: vl8re64.v v16, (a0) ; RV32-NEXT: vl8re64.v v8, (a1) @@ -2016,27 +2052,27 @@ define @reverse_nxv12i64( %a) { ; RV64-NEXT: sub sp, sp, a0 ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV64-NEXT: vid.v v20 ; RV64-NEXT: srli a1, a0, 3 ; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma -; RV64-NEXT: vid.v v20 ; RV64-NEXT: vrsub.vx v20, v20, a1 ; RV64-NEXT: vrgather.vv v31, v12, v20 +; RV64-NEXT: vrgather.vv v7, v8, v20 ; RV64-NEXT: vrgather.vv v30, v13, v20 +; RV64-NEXT: vrgather.vv v6, v9, v20 ; 
RV64-NEXT: vrgather.vv v29, v14, v20 +; RV64-NEXT: vrgather.vv v5, v10, v20 ; RV64-NEXT: vrgather.vv v28, v15, v20 +; RV64-NEXT: vrgather.vv v4, v11, v20 +; RV64-NEXT: addi a1, sp, 64 +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: vrgather.vv v27, v16, v20 +; RV64-NEXT: vs4r.v v4, (a0) ; RV64-NEXT: vrgather.vv v26, v17, v20 ; RV64-NEXT: vrgather.vv v25, v18, v20 ; RV64-NEXT: vrgather.vv v24, v19, v20 -; RV64-NEXT: vrgather.vv v15, v8, v20 -; RV64-NEXT: vrgather.vv v14, v9, v20 -; RV64-NEXT: vrgather.vv v13, v10, v20 -; RV64-NEXT: vrgather.vv v12, v11, v20 -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: addi a1, sp, 64 -; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: vs4r.v v12, (a0) ; RV64-NEXT: vs8r.v v24, (a1) ; RV64-NEXT: vl8re64.v v16, (a0) ; RV64-NEXT: vl8re64.v v8, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll index b43655283b975..94fce80ad3b8e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll @@ -19,10 +19,10 @@ define @vp_nearbyint_nxv1bf16( %va, < ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t ; CHECK-NEXT: frflags a0 @@ -44,10 +44,10 @@ define @vp_nearbyint_nxv1bf16_unmasked( @vp_nearbyint_nxv2bf16( %va, < ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t ; CHECK-NEXT: frflags a0 @@ -94,10 +94,10 @@ define @vp_nearbyint_nxv2bf16_unmasked( @vp_nearbyint_nxv4bf16( %va, < ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v10, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t ; CHECK-NEXT: frflags a0 @@ -146,10 +146,10 @@ define @vp_nearbyint_nxv4bf16_unmasked( @vp_nearbyint_nxv8bf16( %va, < ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v12, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t ; CHECK-NEXT: frflags a0 @@ -198,10 +198,10 @@ define @vp_nearbyint_nxv8bf16_unmasked( @vp_nearbyint_nxv16bf16( %va ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli 
zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t ; CHECK-NEXT: frflags a0 @@ -250,10 +250,10 @@ define @vp_nearbyint_nxv16bf16_unmasked( @llvm.vp.nearbyint.nxv32bf16( @vp_nearbyint_nxv32bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv32bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: fmv.w.x fa5, a3 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v17, v0, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vmv1r.v v0, v17 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24, v0.t -; CHECK-NEXT: lui a2, 307200 -; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vslidedown.vx v12, v0, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t ; CHECK-NEXT: frflags a2 -; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a2 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: bltu a0, a1, .LBB10_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB10_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v24, v0.t @@ -332,12 +322,6 @@ define @vp_nearbyint_nxv32bf16( %va ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv32bf16( %va, %m, i32 %evl) ret %v @@ -346,51 +330,41 @@ define @vp_nearbyint_nxv32bf16( %va define @vp_nearbyint_nxv32bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv32bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: 
slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: fmv.w.x fa5, a3 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v16, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24, v0.t -; CHECK-NEXT: lui a2, 307200 -; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vslidedown.vx v12, v24, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t ; CHECK-NEXT: frflags a2 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: fsflags a2 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: bltu a0, a1, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 @@ -402,12 +376,6 @@ define @vp_nearbyint_nxv32bf16_unmasked( @llvm.vp.nearbyint.nxv32bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -436,10 +404,10 @@ define @vp_nearbyint_nxv1f16( %va, @vp_nearbyint_nxv1f16_unmasked( %v ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -518,10 +486,10 @@ define @vp_nearbyint_nxv2f16( %va, @vp_nearbyint_nxv2f16_unmasked( %v ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; 
ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -601,10 +569,10 @@ define @vp_nearbyint_nxv4f16( %va, @vp_nearbyint_nxv4f16_unmasked( %v ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -665,9 +633,9 @@ declare @llvm.vp.nearbyint.nxv8f16(, @vp_nearbyint_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv8f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -687,10 +655,10 @@ define @vp_nearbyint_nxv8f16( %va, @vp_nearbyint_nxv8f16_unmasked( %v ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -751,9 +719,9 @@ declare @llvm.vp.nearbyint.nxv16f16(, < define @vp_nearbyint_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) -; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -773,10 +741,10 @@ define @vp_nearbyint_nxv16f16( %va, @vp_nearbyint_nxv16f16_unmasked( ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t @@ -837,9 +805,9 @@ declare @llvm.vp.nearbyint.nxv32f16(, < define @vp_nearbyint_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv32f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) -; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu @@ -856,50 +824,40 @@ define @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16( %va, @llvm.vp.nearbyint.nxv32f16( %va, %m, i32 %evl) ret %v @@ -944,51 +896,41 @@ define @vp_nearbyint_nxv32f16_unmasked( ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 
0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: lui a3, 307200 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v16 +; ZVFHMIN-NEXT: fmv.w.x fa5, a3 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t -; ZVFHMIN-NEXT: lui a2, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a2 +; ZVFHMIN-NEXT: vslidedown.vx v12, v24, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v16, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t ; ZVFHMIN-NEXT: frflags a2 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t ; ZVFHMIN-NEXT: fsflags a2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 @@ -1000,12 +942,6 @@ define @vp_nearbyint_nxv32f16_unmasked( ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: fsflags a0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v @@ -1274,9 +1210,9 @@ declare @llvm.vp.nearbyint.nxv2f64(, define @vp_nearbyint_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv2f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI36_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -1318,9 +1254,9 @@ declare @llvm.vp.nearbyint.nxv4f64(, define @vp_nearbyint_nxv4f64( %va, 
%m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI38_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -1362,9 +1298,9 @@ declare @llvm.vp.nearbyint.nxv7f64(, define @vp_nearbyint_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv7f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI40_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1406,9 +1342,9 @@ declare @llvm.vp.nearbyint.nxv8f64(, define @vp_nearbyint_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI42_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1453,12 +1389,12 @@ define @vp_nearbyint_nxv16f64( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 3 -; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v6, v0, a2 +; CHECK-NEXT: lui a2, %hi(.LCPI44_0) +; CHECK-NEXT: srli a3, a1, 3 +; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI44_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3) +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v6, v0, a3 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 @@ -1501,12 +1437,12 @@ define @vp_nearbyint_nxv16f64_unmasked( @test_mulhs_expand( %broadcast.splat ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a1, a0, 1365 +; CHECK-NEXT: addi a1, sp, 8 +; CHECK-NEXT: addi a2, a0, 1365 ; CHECK-NEXT: addi a0, a0, 1366 ; CHECK-NEXT: sw a0, 8(sp) -; CHECK-NEXT: sw a1, 12(sp) -; CHECK-NEXT: addi a0, sp, 8 -; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; CHECK-NEXT: vlse64.v v12, (a0), zero +; CHECK-NEXT: sw a2, 12(sp) +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vlse64.v v12, (a1), zero ; CHECK-NEXT: vrgather.vi v16, v8, 0 -; CHECK-NEXT: vmulh.vv v8, v16, v12 ; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmulh.vv v8, v16, v12 ; CHECK-NEXT: vsrl.vx v12, v8, a0 ; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: li a0, 3 @@ -73,13 +73,13 @@ define @test_mulhu_expand( %broadcast.splat ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: lui a0, 699051 -; CHECK-NEXT: addi a1, a0, -1366 +; CHECK-NEXT: addi a1, sp, 8 +; CHECK-NEXT: addi a2, a0, -1366 ; CHECK-NEXT: addi a0, a0, -1365 ; CHECK-NEXT: sw a0, 8(sp) -; CHECK-NEXT: sw a1, 12(sp) -; CHECK-NEXT: addi a0, sp, 8 -; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; CHECK-NEXT: vlse64.v v12, (a0), zero +; CHECK-NEXT: sw a2, 12(sp) +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vlse64.v v12, (a1), zero ; CHECK-NEXT: vrgather.vi v16, v8, 0 ; CHECK-NEXT: vmulhu.vv v8, v16, v12 ; CHECK-NEXT: vsrl.vi v8, v8, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/pr52475.ll b/llvm/test/CodeGen/RISCV/rvv/pr52475.ll index 
a885f23eef39e..bca4d9c24d600 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr52475.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr52475.ll @@ -9,8 +9,8 @@ define <128 x i32> @ret_split_v128i32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a2, a1, 448 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: vle32.v v8, (a2) ; CHECK-NEXT: addi a2, a0, 448 ; CHECK-NEXT: vse32.v v8, (a2) diff --git a/llvm/test/CodeGen/RISCV/rvv/pr61561.ll b/llvm/test/CodeGen/RISCV/rvv/pr61561.ll index c5fd6943e51be..6b08c4409fb63 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr61561.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr61561.ll @@ -5,14 +5,15 @@ define @foo(ptr %p) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: vl1re16.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vsll.vi v8, v8, 3 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: li a0, 248 -; CHECK-NEXT: vand.vx v8, v10, a0 ; CHECK-NEXT: lui a0, 4 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: li a0, 248 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsll.vi v8, v8, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v8 +; CHECK-NEXT: vand.vx v8, v12, a0 ; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: addi a0, a0, -361 ; CHECK-NEXT: vmacc.vx v10, a0, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/pr88576.ll b/llvm/test/CodeGen/RISCV/rvv/pr88576.ll index e8a8d9e422ac1..37c67b9ff2f6a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr88576.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr88576.ll @@ -23,13 +23,13 @@ define i1 @foo( %x, i64 %y) { ; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: andi sp, sp, -64 +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: addi a2, sp, 64 -; CHECK-NEXT: add a0, a2, a0 ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: vsetvli a3, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v16, 0 -; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: vmerge.vim v24, v16, 1, v0 ; CHECK-NEXT: vs8r.v v24, (a1) ; CHECK-NEXT: vmv1r.v v0, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll index fd3c4593462bf..06a357eeaeb61 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll @@ -35,8 +35,8 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, This Loop Header: Depth=2 @@ -66,8 +66,8 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, This Loop Header: Depth=4 ; CHECK-NEXT: # Child Loop BB0_5 Depth 5 -; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: .LBB0_5: # %vector.body.i ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 ; CHECK-NEXT: # Parent Loop BB0_2 Depth=2 ; CHECK-NEXT: # Parent Loop BB0_3 Depth=3 ; CHECK-NEXT: # Parent Loop BB0_4 Depth=4 ; CHECK-NEXT: # => This Inner Loop Header: Depth=5 -; CHECK-NEXT: addi s1, a5, 4 -; CHECK-NEXT: add a1, a4, a5 +; CHECK-NEXT: addi a5, a1, 4 +; CHECK-NEXT: add a4, s8, a1 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: vse32.v v8, (a4), v0.t ; CHECK-NEXT: vse32.v v8, (a1), v0.t -; CHECK-NEXT: add a5, a5, a3 -; CHECK-NEXT: vse32.v v8, (a5), v0.t -; CHECK-NEXT: mv a5, s1 -; CHECK-NEXT: bne s1, s0, .LBB0_5 +; CHECK-NEXT: mv a1, a5 +; CHECK-NEXT: bne a5, s0, .LBB0_5 ; CHECK-NEXT: # %bb.6: # %for.cond.cleanup15.i ; CHECK-NEXT: # in Loop: Header=BB0_4 
Depth=4 -; CHECK-NEXT: addi s9, s9, 4 -; CHECK-NEXT: addi a4, a4, 4 +; CHECK-NEXT: addi s1, s1, 4 +; CHECK-NEXT: addi s8, s8, 4 ; CHECK-NEXT: addi ra, ra, 4 ; CHECK-NEXT: addi a3, a3, 4 ; CHECK-NEXT: andi s10, a0, 1 @@ -110,8 +110,8 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, @vp_rint_nxv1bf16( %va, @vp_rint_nxv1bf16_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -65,10 +65,10 @@ define @vp_rint_nxv2bf16( %va, @vp_rint_nxv2bf16_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -112,10 +112,10 @@ define @vp_rint_nxv4bf16( %va, @vp_rint_nxv4bf16_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -160,10 +160,10 @@ define @vp_rint_nxv8bf16( %va, @vp_rint_nxv8bf16_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -208,10 +208,10 @@ define @vp_rint_nxv16bf16( %va, @vp_rint_nxv16bf16_unmasked( ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -259,53 +259,47 @@ define @vp_rint_nxv32bf16( %va, @vp_rint_nxv32bf16( %va, @vp_rint_nxv32bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv32bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub 
a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: fmv.w.x fa5, a3 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v16, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24, v0.t -; CHECK-NEXT: lui a2, 307200 -; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vslidedown.vx v12, v24, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: bltu a0, a1, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 @@ -379,12 +363,6 @@ define @vp_rint_nxv32bf16_unmasked( ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.rint.nxv32bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -411,10 +389,10 @@ define @vp_rint_nxv1f16( %va, @vp_rint_nxv1f16_unmasked( %va, i3 ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -485,10 +463,10 @@ define @vp_rint_nxv2f16( %va, @vp_rint_nxv2f16_unmasked( %va, i3 ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v 
v8, v8, v0.t @@ -560,10 +538,10 @@ define @vp_rint_nxv4f16( %va, @vp_rint_nxv4f16_unmasked( %va, i3 ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -618,9 +596,9 @@ declare @llvm.vp.rint.nxv8f16(, @vp_rint_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv8f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -638,10 +616,10 @@ define @vp_rint_nxv8f16( %va, @vp_rint_nxv8f16_unmasked( %va, i3 ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -696,9 +674,9 @@ declare @llvm.vp.rint.nxv16f16(, @vp_rint_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) -; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -716,10 +694,10 @@ define @vp_rint_nxv16f16( %va, @vp_rint_nxv16f16_unmasked( %va, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -774,9 +752,9 @@ declare @llvm.vp.rint.nxv32f16(, @vp_rint_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv32f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) -; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu @@ -797,53 +775,47 @@ define @vp_rint_nxv32f16( %va, @vp_rint_nxv32f16_unmasked( %va, ; ; ZVFHMIN-LABEL: vp_rint_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: lui a3, 307200 ; ZVFHMIN-NEXT: 
slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v16 +; ZVFHMIN-NEXT: fmv.w.x fa5, a3 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t -; ZVFHMIN-NEXT: lui a2, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a2 +; ZVFHMIN-NEXT: vslidedown.vx v12, v24, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v16, v8, fa5, v0.t -; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 @@ -930,12 +892,6 @@ define @vp_rint_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v @@ -1180,9 +1136,9 @@ declare @llvm.vp.rint.nxv2f64(, @vp_rint_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv2f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI36_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -1220,9 +1176,9 @@ declare @llvm.vp.rint.nxv4f64(, @vp_rint_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI38_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -1260,9 +1216,9 @@ declare 
@llvm.vp.rint.nxv7f64(, @vp_rint_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv7f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI40_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1300,9 +1256,9 @@ declare @llvm.vp.rint.nxv8f64(, @vp_rint_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI42_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1349,12 +1305,12 @@ define @vp_rint_nxv16f64( %va, @vp_rint_nxv16f64_unmasked( ; CHECK-LABEL: vp_rint_nxv16f64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI45_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3) -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: lui a2, %hi(.LCPI45_0) +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll index 4c5835afd49e6..3bbdd1a257fdb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll @@ -8,17 +8,17 @@ define float @reduce_fadd(ptr %f) { ; CHECK-LABEL: reduce_fadd: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a1, a2, 1 -; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: srli a1, a2, 1 ; CHECK-NEXT: slli a2, a2, 1 ; CHECK-NEXT: li a3, 1024 ; CHECK-NEXT: .LBB0_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v10, (a0) +; CHECK-NEXT: sub a3, a3, a1 ; CHECK-NEXT: vsetvli a4, zero, e32, m2, ta, ma ; CHECK-NEXT: vfredosum.vs v8, v10, v8 -; CHECK-NEXT: sub a3, a3, a1 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: bnez a3, .LBB0_1 ; CHECK-NEXT: # %bb.2: # %exit diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll index 9ed28248e0cc1..8a10e75333ad0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll @@ -19,10 +19,10 @@ define @vp_round_nxv1bf16( %va, @vp_round_nxv1bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -69,10 +69,10 @@ define @vp_round_nxv2bf16( %va, @vp_round_nxv2bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v 
v8, v9 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -120,10 +120,10 @@ define @vp_round_nxv4bf16( %va, @vp_round_nxv4bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -172,10 +172,10 @@ define @vp_round_nxv8bf16( %va, @vp_round_nxv8bf16_unmasked( % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -224,10 +224,10 @@ define @vp_round_nxv16bf16( %va, @vp_round_nxv16bf16_unmasked( @vp_round_nxv32bf16( %va, @vp_round_nxv32bf16( %va, @vp_round_nxv32bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv32bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: fmv.w.x fa5, a3 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v16, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24, v0.t -; CHECK-NEXT: lui a2, 307200 -; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vslidedown.vx v12, v24, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 4 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv 
v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: bltu a0, a1, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 @@ -410,12 +392,6 @@ define @vp_round_nxv32bf16_unmasked( @llvm.vp.round.nxv32bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -444,10 +420,10 @@ define @vp_round_nxv1f16( %va, @vp_round_nxv1f16_unmasked( %va, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -526,10 +502,10 @@ define @vp_round_nxv2f16( %va, @vp_round_nxv2f16_unmasked( %va, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -609,10 +585,10 @@ define @vp_round_nxv4f16( %va, @vp_round_nxv4f16_unmasked( %va, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -673,9 +649,9 @@ declare @llvm.vp.round.nxv8f16(, @vp_round_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv8f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -695,10 +671,10 @@ define @vp_round_nxv8f16( %va, @vp_round_nxv8f16_unmasked( %va, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -759,9 +735,9 @@ declare @llvm.vp.round.nxv16f16(, @vp_round_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) -; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, 
ta, mu @@ -781,10 +757,10 @@ define @vp_round_nxv16f16( %va, @vp_round_nxv16f16_unmasked( %va ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t @@ -845,9 +821,9 @@ declare @llvm.vp.round.nxv32f16(, @vp_round_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv32f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) -; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu @@ -870,62 +846,54 @@ define @vp_round_nxv32f16( %va, @vp_round_nxv32f16_unmasked( %va ; ; ZVFHMIN-LABEL: vp_round_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: lui a3, 307200 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v16 +; ZVFHMIN-NEXT: fmv.w.x fa5, a3 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t -; ZVFHMIN-NEXT: lui a2, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a2 +; ZVFHMIN-NEXT: vslidedown.vx v12, v24, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v16, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a2, 4 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a2 -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; 
ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 @@ -1016,12 +974,6 @@ define @vp_round_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v @@ -1290,9 +1242,9 @@ declare @llvm.vp.round.nxv2f64(, @vp_round_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv2f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI36_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -1334,9 +1286,9 @@ declare @llvm.vp.round.nxv4f64(, @vp_round_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI38_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -1378,9 +1330,9 @@ declare @llvm.vp.round.nxv7f64(, @vp_round_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv7f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI40_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1422,9 +1374,9 @@ declare @llvm.vp.round.nxv8f64(, @vp_round_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI42_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1475,12 +1427,12 @@ define @vp_round_nxv16f64( %va, @vp_round_nxv16f64( %va, @vp_round_nxv16f64_unmasked( ; CHECK-LABEL: vp_round_nxv16f64_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI45_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3) -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: lui a2, %hi(.LCPI45_0) +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll index 3fdb354bff94b..4cd909e4b0a63 100644 --- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll @@ -19,10 +19,10 @@ define @vp_roundeven_nxv1bf16( %va, < ; 
CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 @@ -44,10 +44,10 @@ define @vp_roundeven_nxv1bf16_unmasked( @vp_roundeven_nxv2bf16( %va, < ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 @@ -94,10 +94,10 @@ define @vp_roundeven_nxv2bf16_unmasked( @vp_roundeven_nxv4bf16( %va, < ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v10, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 @@ -146,10 +146,10 @@ define @vp_roundeven_nxv4bf16_unmasked( @vp_roundeven_nxv8bf16( %va, < ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v12, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 @@ -198,10 +198,10 @@ define @vp_roundeven_nxv8bf16_unmasked( @vp_roundeven_nxv16bf16( %va ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 @@ -250,10 +250,10 @@ define @vp_roundeven_nxv16bf16_unmasked( @vp_roundeven_nxv32bf16( %va ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: fmv.w.x fa5, a3 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v17, v0, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 
-; CHECK-NEXT: vmv1r.v v0, v17 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24, v0.t -; CHECK-NEXT: lui a2, 307200 -; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vslidedown.vx v12, v0, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 0 -; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: bltu a0, a1, .LBB10_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB10_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 -; CHECK-NEXT: vmv1r.v v8, v16 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v24, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t @@ -354,51 +346,41 @@ define @vp_roundeven_nxv32bf16( %va define @vp_roundeven_nxv32bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv32bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: fmv.w.x fa5, 
a3 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v16, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24, v0.t -; CHECK-NEXT: lui a2, 307200 -; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vslidedown.vx v12, v24, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 0 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: bltu a0, a1, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 @@ -410,12 +392,6 @@ define @vp_roundeven_nxv32bf16_unmasked( @llvm.vp.roundeven.nxv32bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -444,10 +420,10 @@ define @vp_roundeven_nxv1f16( %va, @vp_roundeven_nxv1f16_unmasked( %v ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -526,10 +502,10 @@ define @vp_roundeven_nxv2f16( %va, @vp_roundeven_nxv2f16_unmasked( %v ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -609,10 +585,10 @@ define @vp_roundeven_nxv4f16( %va, @vp_roundeven_nxv4f16_unmasked( %v ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -673,9 
+649,9 @@ declare @llvm.vp.roundeven.nxv8f16(, @vp_roundeven_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv8f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -695,10 +671,10 @@ define @vp_roundeven_nxv8f16( %va, @vp_roundeven_nxv8f16_unmasked( %v ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t @@ -759,9 +735,9 @@ declare @llvm.vp.roundeven.nxv16f16(, < define @vp_roundeven_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) -; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -781,10 +757,10 @@ define @vp_roundeven_nxv16f16( %va, @vp_roundeven_nxv16f16_unmasked( ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t @@ -845,9 +821,9 @@ declare @llvm.vp.roundeven.nxv32f16(, < define @vp_roundeven_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv32f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) -; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu @@ -870,62 +846,54 @@ define @vp_roundeven_nxv32f16( %va, @vp_roundeven_nxv32f16_unmasked( ; ; ZVFHMIN-LABEL: vp_roundeven_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: lui a3, 307200 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v16 +; ZVFHMIN-NEXT: fmv.w.x fa5, a3 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size 
Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t -; ZVFHMIN-NEXT: lui a2, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a2 +; ZVFHMIN-NEXT: vslidedown.vx v12, v24, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v16, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a2, 0 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a2 -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB23_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 @@ -1016,12 +974,6 @@ define @vp_roundeven_nxv32f16_unmasked( ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v @@ -1290,9 +1242,9 @@ declare @llvm.vp.roundeven.nxv2f64(, define @vp_roundeven_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv2f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI36_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -1334,9 +1286,9 @@ declare @llvm.vp.roundeven.nxv4f64(, define @vp_roundeven_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI38_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -1378,9 +1330,9 @@ declare @llvm.vp.roundeven.nxv7f64(, define @vp_roundeven_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv7f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI40_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, 
e64, m8, ta, mu @@ -1422,9 +1374,9 @@ declare @llvm.vp.roundeven.nxv8f64(, define @vp_roundeven_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI42_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1475,12 +1427,12 @@ define @vp_roundeven_nxv16f64( %va, ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 3 -; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v6, v0, a2 +; CHECK-NEXT: lui a2, %hi(.LCPI44_0) +; CHECK-NEXT: srli a3, a1, 3 +; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI44_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3) +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v6, v0, a3 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 @@ -1501,23 +1453,26 @@ define @vp_roundeven_nxv16f64( %va, ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -1533,12 +1488,12 @@ define @vp_roundeven_nxv16f64_unmasked( @vp_roundtozero_nxv1bf16( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 @@ -44,10 +44,10 @@ define @vp_roundtozero_nxv1bf16_unmasked( @vp_roundtozero_nxv2bf16( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 @@ -94,10 +94,10 
@@ define @vp_roundtozero_nxv2bf16_unmasked( @vp_roundtozero_nxv4bf16( %va, ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v10, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 @@ -146,10 +146,10 @@ define @vp_roundtozero_nxv4bf16_unmasked( @vp_roundtozero_nxv8bf16( %va, ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v12, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 @@ -198,10 +198,10 @@ define @vp_roundtozero_nxv8bf16_unmasked( @vp_roundtozero_nxv16bf16( % ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: lui a0, 307200 -; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fmv.w.x fa5, a1 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 @@ -250,10 +250,10 @@ define @vp_roundtozero_nxv16bf16_unmasked( @vp_roundtozero_nxv32bf16( % ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: fmv.w.x fa5, a3 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v17, v0, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vmv1r.v v0, v17 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24, v0.t -; CHECK-NEXT: lui a2, 307200 -; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vslidedown.vx v12, v0, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 1 -; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: 
addi a2, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: bltu a0, a1, .LBB10_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB10_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 -; CHECK-NEXT: vmv1r.v v8, v16 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v24, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t @@ -354,51 +346,41 @@ define @vp_roundtozero_nxv32bf16( % define @vp_roundtozero_nxv32bf16_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv32bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: lui a3, 307200 ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: fmv.w.x fa5, a3 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v16, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v24, v0.t -; CHECK-NEXT: lui a2, 307200 -; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vslidedown.vx v12, v24, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 1 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmv1r.v 
v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: bltu a0, a1, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 @@ -410,12 +392,6 @@ define @vp_roundtozero_nxv32bf16_unmasked( @llvm.vp.roundtozero.nxv32bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -444,10 +420,10 @@ define @vp_roundtozero_nxv1f16( %va, @vp_roundtozero_nxv1f16_unmasked( ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -526,10 +502,10 @@ define @vp_roundtozero_nxv2f16( %va, @vp_roundtozero_nxv2f16_unmasked( ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t @@ -609,10 +585,10 @@ define @vp_roundtozero_nxv4f16( %va, @vp_roundtozero_nxv4f16_unmasked( ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t @@ -673,9 +649,9 @@ declare @llvm.vp.roundtozero.nxv8f16(, @vp_roundtozero_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv8f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) -; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu @@ -695,10 +671,10 @@ define @vp_roundtozero_nxv8f16( %va, @vp_roundtozero_nxv8f16_unmasked( ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t 
@@ -759,9 +735,9 @@ declare @llvm.vp.roundtozero.nxv16f16(, define @vp_roundtozero_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv16f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) -; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu @@ -781,10 +757,10 @@ define @vp_roundtozero_nxv16f16( %va, < ; ZVFHMIN-NEXT: vmv1r.v v12, v0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t -; ZVFHMIN-NEXT: lui a0, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v12, v24, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 @@ -822,10 +798,10 @@ define @vp_roundtozero_nxv16f16_unmasked( @llvm.vp.roundtozero.nxv32f16(, define @vp_roundtozero_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv32f16: ; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) -; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu @@ -870,62 +846,54 @@ define @vp_roundtozero_nxv32f16( %va, < ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFHMIN-NEXT: vmv1r.v v7, v0 ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: lui a3, 307200 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vmv1r.v v16, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a3 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v17, v0, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vmv1r.v v0, v17 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t -; ZVFHMIN-NEXT: lui a2, 307200 -; ZVFHMIN-NEXT: fmv.w.x fa5, a2 +; ZVFHMIN-NEXT: vslidedown.vx v12, v0, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v17, v8, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a2, 1 -; ZVFHMIN-NEXT: vmv1r.v v0, v17 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: fsrm a2 -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload 
+; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB22_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 -; ZVFHMIN-NEXT: vmv1r.v v8, v16 -; ZVFHMIN-NEXT: vmv1r.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t +; ZVFHMIN-NEXT: vmflt.vf v7, v16, fa5, v0.t ; ZVFHMIN-NEXT: fsrmi a0, 1 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmv1r.v v0, v7 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: fsrm a0 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t @@ -960,51 +928,41 @@ define @vp_roundtozero_nxv32f16_unmasked( @vp_roundtozero_nxv32f16_unmasked( @llvm.vp.roundtozero.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v @@ -1290,9 +1242,9 @@ declare @llvm.vp.roundtozero.nxv2f64( define @vp_roundtozero_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv2f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI36_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) -; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu @@ -1334,9 +1286,9 @@ declare @llvm.vp.roundtozero.nxv4f64( define @vp_roundtozero_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI38_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) -; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu @@ -1378,9 +1330,9 @@ declare @llvm.vp.roundtozero.nxv7f64( define @vp_roundtozero_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv7f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI40_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu @@ -1422,9 +1374,9 @@ declare @llvm.vp.roundtozero.nxv8f64( define @vp_roundtozero_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI42_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) -; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: vsetvli zero, 
zero, e64, m8, ta, mu @@ -1475,12 +1427,12 @@ define @vp_roundtozero_nxv16f64( %v ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 3 -; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v6, v0, a2 +; CHECK-NEXT: lui a2, %hi(.LCPI44_0) +; CHECK-NEXT: srli a3, a1, 3 +; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI44_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3) +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v6, v0, a3 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 @@ -1501,23 +1453,26 @@ define @vp_roundtozero_nxv16f64( %v ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -1533,12 +1488,12 @@ define @vp_roundtozero_nxv16f64_unmasked( @foo(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, ; CHECK-NEXT: mv s1, sp ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: addi t0, s1, 64 -; CHECK-NEXT: vs8r.v v8, (t0) ; CHECK-NEXT: csrr t1, vlenb ; CHECK-NEXT: slli t1, t1, 3 ; CHECK-NEXT: add t1, s1, t1 ; CHECK-NEXT: addi t1, t1, 64 +; CHECK-NEXT: vs8r.v v8, (t0) ; CHECK-NEXT: vs8r.v v8, (t1) ; CHECK-NEXT: sd t1, 0(sp) ; CHECK-NEXT: sd t0, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll index ab64459944885..b9432bc568d9c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll @@ -24,8 +24,8 @@ define void @rvv_vla(i64 %n, i64 %i) nounwind { ; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: sub a2, s0, a2 ; CHECK-NEXT: addi a2, a2, -32 -; CHECK-NEXT: vl2re64.v v8, (a2) ; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: vl2re64.v v8, (a2) ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: lw zero, 0(a0) ; CHECK-NEXT: addi sp, s0, -32 @@ -108,9 +108,9 @@ define void @rvv_vla_and_overaligned(i64 %n, i64 %i) nounwind { ; CHECK-NEXT: addi a2, a2, 112 ; CHECK-NEXT: vl1re64.v v8, (a2) ; CHECK-NEXT: addi a2, s1, 112 +; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: vl2re64.v v8, (a2) ; CHECK-NEXT: lw zero, 64(s1) -; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: lw zero, 0(a0) ; CHECK-NEXT: addi sp, s0, -144 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 24b86b28e9a2c..6c11e9413525e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -269,13 +269,13 @@ define @fcmp_one_vf_nxv1bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmflt.vv v9, v10, v8, v0.t -; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t +; CHECK-NEXT: vmflt.vv v9, v10, v9, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -288,13 +288,13 @@ define @fcmp_one_vf_swap_nxv1bf16( %va, b ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmflt.vv v9, v8, v10, v0.t -; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vmflt.vv v8, v10, v9, v0.t +; CHECK-NEXT: vmflt.vv v9, v9, v10, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -307,13 +307,11 @@ define @fcmp_ord_vv_nxv1bf16( %va, @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ord", %m, i32 %evl) ret %v @@ -324,15 +322,15 @@ define @fcmp_ord_vf_nxv1bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t +; CHECK-NEXT: vmfeq.vv v9, v9, v9, v0.t ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v9, v10, v10, v0.t -; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t +; CHECK-NEXT: vmand.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -345,15 +343,15 @@ define @fcmp_ord_vf_swap_nxv1bf16( %va, b ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t +; CHECK-NEXT: vmfeq.vv v9, v9, v9, v0.t ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v9, v10, v10, v0.t -; CHECK-NEXT: vmand.mm v0, v9, v8 +; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t +; CHECK-NEXT: vmand.mm v0, v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement 
poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -381,13 +379,13 @@ define @fcmp_ueq_vf_nxv1bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmflt.vv v9, v10, v8, v0.t -; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t +; CHECK-NEXT: vmflt.vv v9, v10, v9, v0.t +; CHECK-NEXT: vmnor.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -400,13 +398,13 @@ define @fcmp_ueq_vf_swap_nxv1bf16( %va, b ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmflt.vv v9, v8, v10, v0.t -; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmflt.vv v8, v10, v9, v0.t +; CHECK-NEXT: vmflt.vv v9, v9, v10, v0.t +; CHECK-NEXT: vmnor.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -666,13 +664,11 @@ define @fcmp_uno_vv_nxv1bf16( %va, @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"uno", %m, i32 %evl) ret %v @@ -683,15 +679,15 @@ define @fcmp_uno_vf_nxv1bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t +; CHECK-NEXT: vmfne.vv v9, v9, v9, v0.t ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmfne.vv v9, v10, v10, v0.t -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -704,15 +700,15 @@ define @fcmp_uno_vf_swap_nxv1bf16( %va, b ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t +; CHECK-NEXT: vmfne.vv v9, v9, v9, v0.t ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmfne.vv v9, v10, v10, v0.t -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1008,9 +1004,9 @@ define @fcmp_one_vf_nxv8bf16( %va, bfloat ; 
CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmflt.vv v9, v16, v12, v0.t @@ -1027,9 +1023,9 @@ define @fcmp_one_vf_swap_nxv8bf16( %va, b ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t @@ -1063,15 +1059,15 @@ define @fcmp_ord_vf_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t +; CHECK-NEXT: vmfeq.vv v10, v12, v12, v0.t ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v9, v12, v12, v0.t -; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t +; CHECK-NEXT: vmand.mm v0, v10, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1084,15 +1080,15 @@ define @fcmp_ord_vf_swap_nxv8bf16( %va, b ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t +; CHECK-NEXT: vmfeq.vv v10, v12, v12, v0.t ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v9, v12, v12, v0.t -; CHECK-NEXT: vmand.mm v0, v9, v8 +; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t +; CHECK-NEXT: vmand.mm v0, v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1120,9 +1116,9 @@ define @fcmp_ueq_vf_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t ; CHECK-NEXT: vmflt.vv v9, v16, v12, v0.t @@ -1139,9 +1135,9 @@ define @fcmp_ueq_vf_swap_nxv8bf16( %va, b ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t ; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t @@ -1425,15 
+1421,15 @@ define @fcmp_uno_vf_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t +; CHECK-NEXT: vmfne.vv v10, v12, v12, v0.t ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmfne.vv v9, v12, v12, v0.t -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1446,15 +1442,15 @@ define @fcmp_uno_vf_swap_nxv8bf16( %va, b ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t +; CHECK-NEXT: vmfne.vv v10, v12, v12, v0.t ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmfne.vv v9, v12, v12, v0.t -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t +; CHECK-NEXT: vmor.mm v0, v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1470,188 +1466,177 @@ define @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_one_vf_nxv1f16( %va, half %b, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmflt.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vmflt.vv v8, v8, v10, v0.t -; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmflt.vv v8, v9, v10, v0.t +; ZVFHMIN-NEXT: vmflt.vv v9, v10, v9, v0.t +; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2052,13 +2037,13 @@ define @fcmp_one_vf_swap_nxv1f16( %va, half ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmflt.vv v9, v8, v10, v0.t -; ZVFHMIN-NEXT: vmflt.vv v8, v10, v8, v0.t -; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmflt.vv v8, v10, v9, v0.t +; ZVFHMIN-NEXT: vmflt.vv v9, v9, v10, v0.t +; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2079,13 +2064,11 @@ define @fcmp_ord_vv_nxv1f16( %va, @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"ord", %m, i32 %evl) ret %v @@ -2106,15 +2089,15 @@ define @fcmp_ord_vf_nxv1f16( %va, half %b, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, 
ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v9, v9, v9, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10, v0.t -; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmand.mm v0, v9, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2137,15 +2120,15 @@ define @fcmp_ord_vf_swap_nxv1f16( %va, half ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v9, v9, v9, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10, v0.t -; ZVFHMIN-NEXT: vmand.mm v0, v9, v8 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2189,13 +2172,13 @@ define @fcmp_ueq_vf_nxv1f16( %va, half %b, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmflt.vv v9, v10, v8, v0.t -; ZVFHMIN-NEXT: vmflt.vv v8, v8, v10, v0.t -; ZVFHMIN-NEXT: vmnor.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmflt.vv v8, v9, v10, v0.t +; ZVFHMIN-NEXT: vmflt.vv v9, v10, v9, v0.t +; ZVFHMIN-NEXT: vmnor.mm v0, v9, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2216,13 +2199,13 @@ define @fcmp_ueq_vf_swap_nxv1f16( %va, half ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmflt.vv v9, v8, v10, v0.t -; ZVFHMIN-NEXT: vmflt.vv v8, v10, v8, v0.t -; ZVFHMIN-NEXT: vmnor.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmflt.vv v8, v10, v9, v0.t +; ZVFHMIN-NEXT: vmflt.vv v9, v9, v10, v0.t +; ZVFHMIN-NEXT: vmnor.mm v0, v9, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2592,13 +2575,11 @@ define @fcmp_uno_vv_nxv1f16( %va, @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"uno", %m, i32 %evl) ret %v @@ -2619,15 +2600,15 @@ define @fcmp_uno_vf_nxv1f16( %va, half %b, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmfne.vv v9, v9, v9, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10, v0.t -; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2650,15 +2631,15 @@ define @fcmp_uno_vf_swap_nxv1f16( %va, half ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmfne.vv v9, v9, v9, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10, v0.t -; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10, v0.t +; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3081,9 +3062,9 @@ define @fcmp_one_vf_nxv8f16( %va, half %b, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v12, v16, v0.t ; ZVFHMIN-NEXT: vmflt.vv v9, v16, v12, v0.t @@ -3108,9 +3089,9 @@ define @fcmp_one_vf_swap_nxv8f16( %va, half ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v16, v12, v0.t ; ZVFHMIN-NEXT: vmflt.vv v9, v12, v16, v0.t @@ -3162,15 +3143,15 @@ define @fcmp_ord_vf_nxv8f16( %va, half %b, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v10, v12, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v12, v12, v0.t -; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vmand.mm v0, v10, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3193,15 +3174,15 @@ define @fcmp_ord_vf_swap_nxv8f16( %va, half ; ZVFHMIN: # %bb.0: ; 
ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v10, v12, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v12, v12, v0.t -; ZVFHMIN-NEXT: vmand.mm v0, v9, v8 +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vmand.mm v0, v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3245,9 +3226,9 @@ define @fcmp_ueq_vf_nxv8f16( %va, half %b, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v12, v16, v0.t ; ZVFHMIN-NEXT: vmflt.vv v9, v16, v12, v0.t @@ -3272,9 +3253,9 @@ define @fcmp_ueq_vf_swap_nxv8f16( %va, half ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v16, v12, v0.t ; ZVFHMIN-NEXT: vmflt.vv v9, v12, v16, v0.t @@ -3681,15 +3662,15 @@ define @fcmp_uno_vf_nxv8f16( %va, half %b, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vmfne.vv v10, v12, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v12, v12, v0.t -; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfne.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vmor.mm v0, v10, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3712,15 +3693,15 @@ define @fcmp_uno_vf_swap_nxv8f16( %va, half ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vmfne.vv v10, v12, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v12, v12, v0.t -; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; ZVFHMIN-NEXT: vmfne.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vmor.mm v0, v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3748,18 +3729,18 @@ define 
@fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_one_vf_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v12, v16 ; CHECK-NEXT: vmflt.vv v9, v16, v12 @@ -439,9 +439,9 @@ define @fcmp_one_fv_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12 ; CHECK-NEXT: vmflt.vv v9, v12, v16 @@ -505,15 +505,15 @@ define @fcmp_ord_vf_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vmfeq.vv v10, v12, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v9, v12, v12 -; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vmand.mm v0, v10, v8 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -526,15 +526,15 @@ define @fcmp_ord_fv_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vmfeq.vv v10, v12, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v9, v12, v12 -; CHECK-NEXT: vmand.mm v0, v9, v8 +; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vmand.mm v0, v8, v10 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -564,15 +564,15 @@ define @fcmp_ord_vf_nxv8bf16_nonans( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vmfeq.vv v10, v12, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v9, v12, v12 -; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vmand.mm v0, v10, v8 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -600,9 +600,9 @@ define @fcmp_ueq_vf_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a0, fa0 ; 
CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v12, v16 ; CHECK-NEXT: vmflt.vv v9, v16, v12 @@ -619,9 +619,9 @@ define @fcmp_ueq_fv_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v8, v16, v12 ; CHECK-NEXT: vmflt.vv v9, v12, v16 @@ -1082,15 +1082,15 @@ define @fcmp_uno_vf_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfne.vv v8, v12, v12 +; CHECK-NEXT: vmfne.vv v10, v12, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfne.vv v9, v12, v12 -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vmfne.vv v8, v12, v12 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1103,15 +1103,15 @@ define @fcmp_uno_fv_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfne.vv v8, v12, v12 +; CHECK-NEXT: vmfne.vv v10, v12, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfne.vv v9, v12, v12 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmfne.vv v8, v12, v12 +; CHECK-NEXT: vmor.mm v0, v8, v10 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1141,15 +1141,15 @@ define @fcmp_uno_vf_nxv8bf16_nonans( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfne.vv v8, v12, v12 +; CHECK-NEXT: vmfne.vv v10, v12, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfne.vv v9, v12, v12 -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vmfne.vv v8, v12, v12 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1728,9 +1728,9 @@ define @fcmp_one_vf_nxv8f16( %va, half %b) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v12, v16 ; ZVFHMIN-NEXT: vmflt.vv v9, v16, v12 @@ -1755,9 +1755,9 @@ define @fcmp_one_fv_nxv8f16( %va, half %b) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v16, v12 ; ZVFHMIN-NEXT: vmflt.vv v9, v12, v16 @@ -1850,15 +1850,15 @@ define @fcmp_ord_vf_nxv8f16( %va, half %b) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmfeq.vv v10, v12, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v12, v12 -; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmand.mm v0, v10, v8 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1880,15 +1880,15 @@ define @fcmp_ord_fv_nxv8f16( %va, half %b) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmfeq.vv v10, v12, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v12, v12 -; ZVFHMIN-NEXT: vmand.mm v0, v9, v8 +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmand.mm v0, v8, v10 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1935,15 +1935,15 @@ define @fcmp_ord_vf_nxv8f16_nonans( %va, ha ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmfeq.vv v10, v12, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v12, v12 -; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmand.mm v0, v10, v8 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -1987,9 +1987,9 @@ define @fcmp_ueq_vf_nxv8f16( %va, half %b) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: 
vmv.v.x v8, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v12, v16 ; ZVFHMIN-NEXT: vmflt.vv v9, v16, v12 @@ -2014,9 +2014,9 @@ define @fcmp_ueq_fv_nxv8f16( %va, half %b) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v16, v12 ; ZVFHMIN-NEXT: vmflt.vv v9, v12, v16 @@ -2668,15 +2668,15 @@ define @fcmp_uno_vf_nxv8f16( %va, half %b) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmfne.vv v10, v12, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v12, v12 -; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfne.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmor.mm v0, v10, v8 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -2698,15 +2698,15 @@ define @fcmp_uno_fv_nxv8f16( %va, half %b) ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmfne.vv v10, v12, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v12, v12 -; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; ZVFHMIN-NEXT: vmfne.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmor.mm v0, v8, v10 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -2753,15 +2753,15 @@ define @fcmp_uno_vf_nxv8f16_nonans( %va, ha ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmfne.vv v10, v12, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v12, v12 -; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfne.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmor.mm v0, v10, v8 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -4443,10 +4443,10 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; RV32-LABEL: fcmp_oeq_vf_nx16f64: ; RV32: # %bb.0: ; RV32-NEXT: fcvt.d.w fa5, zero -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-NEXT: vmfeq.vf v24, v16, fa5 ; RV32-NEXT: vmfeq.vf v0, v8, fa5 
-; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: srli a0, a0, 3 ; RV32-NEXT: add a1, a0, a0 ; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma @@ -4456,10 +4456,10 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; RV64-LABEL: fcmp_oeq_vf_nx16f64: ; RV64: # %bb.0: ; RV64-NEXT: fmv.d.x fa5, zero -; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV64-NEXT: vmfeq.vf v24, v16, fa5 ; RV64-NEXT: vmfeq.vf v0, v8, fa5 -; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: srli a0, a0, 3 ; RV64-NEXT: add a1, a0, a0 ; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma @@ -4469,10 +4469,10 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; ZVFHMIN32-LABEL: fcmp_oeq_vf_nx16f64: ; ZVFHMIN32: # %bb.0: ; ZVFHMIN32-NEXT: fcvt.d.w fa5, zero -; ZVFHMIN32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; ZVFHMIN32-NEXT: vmfeq.vf v24, v16, fa5 ; ZVFHMIN32-NEXT: vmfeq.vf v0, v8, fa5 -; ZVFHMIN32-NEXT: csrr a0, vlenb ; ZVFHMIN32-NEXT: srli a0, a0, 3 ; ZVFHMIN32-NEXT: add a1, a0, a0 ; ZVFHMIN32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma @@ -4482,10 +4482,10 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; ZVFHMIN64-LABEL: fcmp_oeq_vf_nx16f64: ; ZVFHMIN64: # %bb.0: ; ZVFHMIN64-NEXT: fmv.d.x fa5, zero -; ZVFHMIN64-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; ZVFHMIN64-NEXT: vmfeq.vf v24, v16, fa5 ; ZVFHMIN64-NEXT: vmfeq.vf v0, v8, fa5 -; ZVFHMIN64-NEXT: csrr a0, vlenb ; ZVFHMIN64-NEXT: srli a0, a0, 3 ; ZVFHMIN64-NEXT: add a1, a0, a0 ; ZVFHMIN64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll index 8039aa368b7cc..e8099c2d08a5f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll @@ -558,8 +558,8 @@ define @icmp_eq_vx_nxv8i7( %va, i7 %b, @icmp_eq_vx_swap_nxv8i7( %va, i7 %b, < ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma ; CHECK-NEXT: vmseq.vv v0, v9, v8, v0.t @@ -1099,18 +1099,18 @@ define @icmp_eq_vv_nxv128i8( %va, @icmp_eq_vv_nxv32i32( %va, @icmp_eq_vx_nxv32i32( %va, i32 %b, ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: srli a2, a3, 2 +; CHECK-NEXT: slli a3, a3, 1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: slli a3, a3, 1 ; CHECK-NEXT: sub a4, a1, a3 ; CHECK-NEXT: sltu a5, a1, a4 ; CHECK-NEXT: addi a5, a5, -1 @@ -2335,9 +2335,9 @@ define @icmp_eq_vx_swap_nxv32i32( %va, i32 ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: srli a2, a3, 2 +; CHECK-NEXT: slli a3, a3, 1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: slli a3, a3, 1 ; CHECK-NEXT: sub a4, a1, a3 ; CHECK-NEXT: sltu a5, a1, a4 ; CHECK-NEXT: addi a5, a5, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll index 1910953307e5d..bd3c29b0c6efc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll @@ -2998,10 +2998,10 @@ define @icmp_eq_vi_nx16i64( %va) { ; CHECK-LABEL: icmp_eq_vi_nx16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: 
vsetvli a1, zero, e64, m8, ta, ma +; CHECK-NEXT: vmseq.vi v24, v16, 0 ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma -; CHECK-NEXT: vmseq.vi v24, v16, 0 ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; CHECK-NEXT: vslideup.vx v0, v24, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index abf89361cdea5..c91b02e8f15e4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -244,34 +244,34 @@ define void @sink_splat_mul_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_mul_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB7_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB7_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB7_5 ; CHECK-NEXT: .LBB7_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB7_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vmul.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB7_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB7_7 ; CHECK-NEXT: .LBB7_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB7_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -335,34 +335,34 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_add_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB8_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB8_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB8_5 ; CHECK-NEXT: .LBB8_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB8_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB8_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB8_7 ; CHECK-NEXT: .LBB8_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; 
CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB8_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -426,34 +426,34 @@ define void @sink_splat_sub_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_sub_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB9_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB9_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB9_5 ; CHECK-NEXT: .LBB9_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB9_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vsub.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB9_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB9_7 ; CHECK-NEXT: .LBB9_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB9_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -517,34 +517,34 @@ define void @sink_splat_rsub_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_rsub_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB10_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB10_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB10_5 ; CHECK-NEXT: .LBB10_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB10_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vrsub.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB10_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB10_7 ; CHECK-NEXT: .LBB10_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB10_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -608,34 +608,34 @@ define void @sink_splat_and_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_and_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB11_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, 
.LBB11_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB11_5 ; CHECK-NEXT: .LBB11_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB11_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vand.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB11_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB11_7 ; CHECK-NEXT: .LBB11_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB11_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -699,34 +699,34 @@ define void @sink_splat_or_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_or_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB12_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB12_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB12_5 ; CHECK-NEXT: .LBB12_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB12_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vor.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB12_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB12_7 ; CHECK-NEXT: .LBB12_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB12_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -790,34 +790,34 @@ define void @sink_splat_xor_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_xor_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB13_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB13_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB13_5 ; CHECK-NEXT: .LBB13_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, 
ta, ma ; CHECK-NEXT: .LBB13_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vxor.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB13_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB13_7 ; CHECK-NEXT: .LBB13_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB13_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -983,34 +983,34 @@ define void @sink_splat_shl_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_shl_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB17_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB17_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB17_5 ; CHECK-NEXT: .LBB17_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB17_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vsll.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB17_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB17_7 ; CHECK-NEXT: .LBB17_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB17_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1074,34 +1074,34 @@ define void @sink_splat_lshr_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_lshr_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB18_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB18_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB18_5 ; CHECK-NEXT: .LBB18_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB18_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vsrl.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB18_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB18_7 ; CHECK-NEXT: .LBB18_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; 
CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB18_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1182,17 +1182,17 @@ define void @sink_splat_ashr_scalable(ptr nocapture %a) { ; CHECK-NEXT: .LBB19_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a5) +; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: vsra.vi v8, v8, 2 ; CHECK-NEXT: vs2r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a4 ; CHECK-NEXT: bnez a6, .LBB19_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a3, .LBB19_7 ; CHECK-NEXT: .LBB19_5: # %for.body.preheader ; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB19_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1458,33 +1458,33 @@ define void @sink_splat_fmul_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fmul_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB26_2 +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB26_5 ; CHECK-NEXT: .LBB26_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a3 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB26_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) +; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfmul.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB26_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB26_7 ; CHECK-NEXT: .LBB26_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a3, 2 -; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: slli a1, a2, 2 ; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB26_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1548,33 +1548,33 @@ define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fdiv_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB27_2 +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB27_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB27_5 ; CHECK-NEXT: .LBB27_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a3 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB27_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) +; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfdiv.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: 
add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB27_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB27_7 ; CHECK-NEXT: .LBB27_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a3, 2 -; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: slli a1, a2, 2 ; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB27_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1638,33 +1638,33 @@ define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frdiv_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB28_2 +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB28_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB28_5 ; CHECK-NEXT: .LBB28_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a3 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB28_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) +; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfrdiv.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB28_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB28_7 ; CHECK-NEXT: .LBB28_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a3, 2 -; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: slli a1, a2, 2 ; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB28_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1728,33 +1728,33 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fadd_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB29_2 +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB29_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB29_5 ; CHECK-NEXT: .LBB29_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a3 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB29_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) +; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfadd.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB29_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB29_7 ; CHECK-NEXT: .LBB29_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a3, 2 -; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: slli a1, a2, 2 ; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB29_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1818,33 +1818,33 @@ define void @sink_splat_fsub_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: 
sink_splat_fsub_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB30_2 +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB30_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB30_5 ; CHECK-NEXT: .LBB30_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a3 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB30_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) +; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfsub.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB30_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB30_7 ; CHECK-NEXT: .LBB30_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a3, 2 -; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: slli a1, a2, 2 ; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB30_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1908,33 +1908,33 @@ define void @sink_splat_frsub_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frsub_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB31_2 +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB31_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB31_5 ; CHECK-NEXT: .LBB31_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a3 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB31_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) +; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfrsub.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a2 ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB31_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB31_7 ; CHECK-NEXT: .LBB31_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a3, 2 -; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: slli a1, a2, 2 ; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB31_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2004,9 +2004,9 @@ define void @sink_splat_fma(ptr noalias nocapture %a, ptr nocapture readonly %b, ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 ; CHECK-NEXT: vse32.v v9, (a0) -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a1, a2, .LBB32_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -2042,9 +2042,9 @@ define void @sink_splat_fma_commute(ptr noalias nocapture %a, ptr nocapture read ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; 
CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 ; CHECK-NEXT: vse32.v v9, (a0) -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a1, a2, .LBB33_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -2074,45 +2074,45 @@ define void @sink_splat_fma_scalable(ptr noalias nocapture %a, ptr noalias nocap ; CHECK-LABEL: sink_splat_fma_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a4, 1024 -; CHECK-NEXT: bgeu a4, a3, .LBB34_2 +; CHECK-NEXT: srli a4, a2, 2 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a4, .LBB34_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB34_5 ; CHECK-NEXT: .LBB34_2: # %vector.ph -; CHECK-NEXT: addi a4, a3, -1 -; CHECK-NEXT: andi a5, a4, 1024 -; CHECK-NEXT: xori a4, a5, 1024 +; CHECK-NEXT: addi a3, a4, -1 +; CHECK-NEXT: andi a5, a3, 1024 +; CHECK-NEXT: xori a3, a5, 1024 ; CHECK-NEXT: mv a6, a0 ; CHECK-NEXT: mv a7, a1 -; CHECK-NEXT: mv t0, a4 +; CHECK-NEXT: mv t0, a3 ; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB34_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a6) ; CHECK-NEXT: vl1re32.v v9, (a7) +; CHECK-NEXT: sub t0, t0, a4 +; CHECK-NEXT: add a7, a7, a2 ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 ; CHECK-NEXT: vs1r.v v9, (a6) -; CHECK-NEXT: sub t0, t0, a3 -; CHECK-NEXT: add a7, a7, a2 ; CHECK-NEXT: add a6, a6, a2 ; CHECK-NEXT: bnez t0, .LBB34_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a5, .LBB34_7 ; CHECK-NEXT: .LBB34_5: # %for.body.preheader -; CHECK-NEXT: slli a4, a4, 2 -; CHECK-NEXT: add a2, a1, a4 -; CHECK-NEXT: add a0, a0, a4 +; CHECK-NEXT: slli a2, a3, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: add a2, a1, a2 ; CHECK-NEXT: add a1, a1, a3 ; CHECK-NEXT: .LBB34_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: flw fa4, 0(a2) +; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: fmadd.s fa5, fa5, fa0, fa4 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, 4 ; CHECK-NEXT: bne a2, a1, .LBB34_6 ; CHECK-NEXT: .LBB34_7: # %for.cond.cleanup @@ -2174,45 +2174,45 @@ define void @sink_splat_fma_commute_scalable(ptr noalias nocapture %a, ptr noali ; CHECK-LABEL: sink_splat_fma_commute_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a4, 1024 -; CHECK-NEXT: bgeu a4, a3, .LBB35_2 +; CHECK-NEXT: srli a4, a2, 2 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a4, .LBB35_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB35_5 ; CHECK-NEXT: .LBB35_2: # %vector.ph -; CHECK-NEXT: addi a4, a3, -1 -; CHECK-NEXT: andi a5, a4, 1024 -; CHECK-NEXT: xori a4, a5, 1024 +; CHECK-NEXT: addi a3, a4, -1 +; CHECK-NEXT: andi a5, a3, 1024 +; CHECK-NEXT: xori a3, a5, 1024 ; CHECK-NEXT: mv a6, a0 ; CHECK-NEXT: mv a7, a1 -; CHECK-NEXT: mv t0, a4 +; CHECK-NEXT: mv t0, a3 ; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB35_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a6) ; CHECK-NEXT: vl1re32.v v9, (a7) +; CHECK-NEXT: sub t0, t0, a4 +; CHECK-NEXT: add a7, a7, a2 ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 ; CHECK-NEXT: vs1r.v v9, (a6) -; CHECK-NEXT: sub t0, t0, a3 -; CHECK-NEXT: add a7, a7, a2 ; CHECK-NEXT: add a6, a6, a2 ; CHECK-NEXT: bnez t0, .LBB35_3 ; CHECK-NEXT: # 
%bb.4: # %middle.block ; CHECK-NEXT: beqz a5, .LBB35_7 ; CHECK-NEXT: .LBB35_5: # %for.body.preheader -; CHECK-NEXT: slli a4, a4, 2 -; CHECK-NEXT: add a2, a1, a4 -; CHECK-NEXT: add a0, a0, a4 +; CHECK-NEXT: slli a2, a3, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: add a2, a1, a2 ; CHECK-NEXT: add a1, a1, a3 ; CHECK-NEXT: .LBB35_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: flw fa4, 0(a2) +; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: fmadd.s fa5, fa0, fa5, fa4 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, 4 ; CHECK-NEXT: bne a2, a1, .LBB35_6 ; CHECK-NEXT: .LBB35_7: # %for.cond.cleanup @@ -2487,34 +2487,34 @@ define void @sink_splat_udiv_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_udiv_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB42_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB42_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB42_5 ; CHECK-NEXT: .LBB42_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB42_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vdivu.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB42_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB42_7 ; CHECK-NEXT: .LBB42_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB42_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2578,34 +2578,34 @@ define void @sink_splat_sdiv_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_sdiv_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB43_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB43_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB43_5 ; CHECK-NEXT: .LBB43_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB43_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vdiv.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB43_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB43_7 ; CHECK-NEXT: .LBB43_5: # %for.body.preheader -; 
CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB43_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2669,34 +2669,34 @@ define void @sink_splat_urem_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_urem_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB44_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB44_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB44_5 ; CHECK-NEXT: .LBB44_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB44_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vremu.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB44_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB44_7 ; CHECK-NEXT: .LBB44_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB44_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2760,34 +2760,34 @@ define void @sink_splat_srem_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_srem_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a2, a5, 1 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a2, .LBB45_2 +; CHECK-NEXT: srli a3, a5, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB45_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB45_5 ; CHECK-NEXT: .LBB45_2: # %vector.ph -; CHECK-NEXT: addi a3, a2, -1 -; CHECK-NEXT: andi a4, a3, 1024 -; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: addi a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: slli a5, a5, 1 ; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a3 +; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: vsetvli t0, zero, e32, m2, ta, ma ; CHECK-NEXT: .LBB45_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vrem.vx v8, v8, a1 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a2 ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB45_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB45_7 ; CHECK-NEXT: .LBB45_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB45_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2857,9 +2857,9 @@ define void @sink_splat_min(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB46_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; 
CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: vmin.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a2, .LBB46_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -2891,9 +2891,9 @@ define void @sink_splat_min_commute(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB47_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: vmin.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a2, .LBB47_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -2927,9 +2927,9 @@ define void @sink_splat_max(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB48_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: vmax.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a2, .LBB48_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -2961,9 +2961,9 @@ define void @sink_splat_max_commute(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB49_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: vmax.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a2, .LBB49_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -2997,9 +2997,9 @@ define void @sink_splat_umin(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB50_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: vminu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a2, .LBB50_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -3031,9 +3031,9 @@ define void @sink_splat_umin_commute(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB51_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: vminu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a2, .LBB51_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -3067,9 +3067,9 @@ define void @sink_splat_umax(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB52_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: vmaxu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a2, .LBB52_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -3101,9 +3101,9 @@ define void @sink_splat_umax_commute(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB53_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: vmaxu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a2, .LBB53_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -3207,9 +3207,9 @@ define void @sink_splat_ssub_sat(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB56_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi a2, 
a2, 4 ; CHECK-NEXT: vssub.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a2, .LBB56_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -3313,9 +3313,9 @@ define void @sink_splat_usub_sat(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB59_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: vssubu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a2, .LBB59_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -4068,11 +4068,11 @@ define void @sink_splat_vp_fma(ptr noalias nocapture %a, ptr nocapture readonly ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a1, a3, .LBB79_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -4108,11 +4108,11 @@ define void @sink_splat_vp_fma_commute(ptr noalias nocapture %a, ptr nocapture r ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a1, a3, .LBB80_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -5284,11 +5284,11 @@ define void @sink_splat_vp_ssub_sat(ptr nocapture %a, i32 signext %x, <4 x i1> % ; CHECK-NEXT: .LBB113_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, 4 ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a3, a3, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a3, .LBB113_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -5396,11 +5396,11 @@ define void @sink_splat_vp_usub_sat(ptr nocapture %a, i32 signext %x, <4 x i1> % ; CHECK-NEXT: .LBB116_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, 4 ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a3, a3, 4 ; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: bnez a3, .LBB116_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup diff --git a/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll index b526822975c87..26325328e5671 100644 --- a/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll @@ -12,68 +12,68 @@ define i32 @splat_vector_split_i64() { ; CHECK-NEXT: vmv.v.i v10, 3 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: lui a1, 1044480 +; CHECK-NEXT: li a2, 56 +; CHECK-NEXT: li a3, 40 +; CHECK-NEXT: lui a4, 16 +; CHECK-NEXT: lui a0, 4080 +; CHECK-NEXT: addi a5, sp, 8 +; CHECK-NEXT: sw 
a1, 8(sp) +; CHECK-NEXT: sw zero, 12(sp) +; CHECK-NEXT: addi a1, a4, -256 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, ma ; CHECK-NEXT: vslideup.vi v8, v10, 3 -; CHECK-NEXT: lui a0, 1044480 -; CHECK-NEXT: sw a0, 8(sp) -; CHECK-NEXT: sw zero, 12(sp) -; CHECK-NEXT: li a0, 56 +; CHECK-NEXT: vsetvli a4, zero, e64, m2, ta, ma +; CHECK-NEXT: vlse64.v v10, (a5), zero +; CHECK-NEXT: vsrl.vx v12, v8, a2 +; CHECK-NEXT: vsrl.vx v14, v8, a3 +; CHECK-NEXT: vsrl.vi v16, v8, 24 +; CHECK-NEXT: vsll.vx v18, v8, a2 +; CHECK-NEXT: vand.vx v14, v14, a1 +; CHECK-NEXT: vor.vv v14, v14, v12 +; CHECK-NEXT: vand.vx v12, v8, a1 +; CHECK-NEXT: vsll.vx v12, v12, a3 +; CHECK-NEXT: vor.vv v12, v18, v12 +; CHECK-NEXT: vsrl.vi v18, v8, 8 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vand.vv v18, v18, v10 +; CHECK-NEXT: vor.vv v16, v18, v16 +; CHECK-NEXT: lui a1, 61681 +; CHECK-NEXT: lui a2, 209715 +; CHECK-NEXT: lui a3, 349525 +; CHECK-NEXT: addi a1, a1, -241 +; CHECK-NEXT: addi a2, a2, 819 +; CHECK-NEXT: addi a3, a3, 1365 +; CHECK-NEXT: vor.vv v14, v16, v14 +; CHECK-NEXT: vsetvli a4, zero, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.x v16, a1 ; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; CHECK-NEXT: vsrl.vx v10, v8, a0 -; CHECK-NEXT: li a1, 40 -; CHECK-NEXT: vsrl.vx v12, v8, a1 -; CHECK-NEXT: lui a2, 16 -; CHECK-NEXT: addi a2, a2, -256 -; CHECK-NEXT: vand.vx v12, v12, a2 -; CHECK-NEXT: vor.vv v10, v12, v10 -; CHECK-NEXT: vsrl.vi v12, v8, 24 -; CHECK-NEXT: addi a3, sp, 8 -; CHECK-NEXT: vlse64.v v14, (a3), zero -; CHECK-NEXT: lui a3, 4080 -; CHECK-NEXT: vand.vx v12, v12, a3 -; CHECK-NEXT: vsrl.vi v16, v8, 8 -; CHECK-NEXT: vand.vv v16, v16, v14 -; CHECK-NEXT: vor.vv v12, v16, v12 -; CHECK-NEXT: vor.vv v10, v12, v10 -; CHECK-NEXT: vand.vv v12, v8, v14 -; CHECK-NEXT: vsll.vi v12, v12, 8 -; CHECK-NEXT: vand.vx v14, v8, a3 -; CHECK-NEXT: vsll.vi v14, v14, 24 -; CHECK-NEXT: vor.vv v12, v14, v12 -; CHECK-NEXT: vsll.vx v14, v8, a0 -; CHECK-NEXT: vand.vx v8, v8, a2 -; CHECK-NEXT: vsll.vx v8, v8, a1 -; CHECK-NEXT: vor.vv v8, v14, v8 -; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vand.vv v10, v8, v10 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vsll.vi v8, v8, 24 +; CHECK-NEXT: vsll.vi v10, v10, 8 ; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 -; CHECK-NEXT: lui a0, 61681 -; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a2 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vand.vv v10, v10, v12 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: lui a0, 209715 -; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.x v12, a3 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vand.vv v10, v10, v12 -; CHECK-NEXT: vand.vv v8, v8, v12 +; CHECK-NEXT: vor.vv v8, v8, v14 +; CHECK-NEXT: vsrl.vi v14, v8, 4 +; CHECK-NEXT: vand.vv v8, v8, v16 +; CHECK-NEXT: vand.vv v14, v14, v16 +; CHECK-NEXT: vsll.vi v8, v8, 4 +; CHECK-NEXT: vor.vv v8, v14, v8 +; CHECK-NEXT: vsrl.vi v14, v8, 2 +; CHECK-NEXT: vand.vv v8, v8, v10 +; CHECK-NEXT: vand.vv v10, v14, v10 ; CHECK-NEXT: vsll.vi v8, v8, 2 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: lui a0, 349525 -; CHECK-NEXT: addi a0, 
a0, 1365 -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.x v12, a0 -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vand.vv v10, v10, v12 ; CHECK-NEXT: vand.vv v8, v8, v12 +; CHECK-NEXT: vand.vv v10, v10, v12 ; CHECK-NEXT: vadd.vv v8, v8, v8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vslidedown.vi v8, v8, 3 diff --git a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll index 9d0234d2ec2fb..371055704c090 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll @@ -12,8 +12,8 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: vsll.vv v10, v8, v9 +; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: vsra.vv v9, v10, v9 ; CHECK-NEXT: vmsne.vv v8, v8, v9 ; CHECK-NEXT: vmv.v.x v9, a1 @@ -32,14 +32,14 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vsll.vv v10, v8, v9 ; CHECK-NEXT: addi a1, a0, -1 -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vsll.vv v11, v8, v9 -; CHECK-NEXT: vsra.vv v9, v11, v9 +; CHECK-NEXT: vsra.vv v9, v10, v9 ; CHECK-NEXT: vmsne.vv v8, v8, v9 -; CHECK-NEXT: vmerge.vxm v9, v10, a0, v0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0 ; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v11, v9, v0 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp = call <4 x i32> @llvm.sshl.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %tmp @@ -51,14 +51,14 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: vsll.vv v10, v8, v9 ; CHECK-NEXT: addi a1, a0, -1 -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vsll.vv v11, v8, v9 -; CHECK-NEXT: vsra.vv v9, v11, v9 +; CHECK-NEXT: vsra.vv v9, v10, v9 ; CHECK-NEXT: vmsne.vv v8, v8, v9 -; CHECK-NEXT: vmerge.vxm v9, v10, a0, v0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0 ; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v11, v9, v0 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp = call <8 x i16> @llvm.sshl.sat.v8i16(<8 x i16> %x, <8 x i16> %y) ret <8 x i16> %tmp @@ -70,14 +70,14 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vmv.v.x v10, a0 -; CHECK-NEXT: li a0, 128 -; CHECK-NEXT: vsll.vv v11, v8, v9 -; CHECK-NEXT: vsra.vv v9, v11, v9 +; CHECK-NEXT: vsll.vv v10, v8, v9 +; CHECK-NEXT: vsra.vv v9, v10, v9 ; CHECK-NEXT: vmsne.vv v8, v8, v9 -; CHECK-NEXT: vmerge.vxm v9, v10, a0, v0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: li a0, 128 +; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0 ; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v11, v9, v0 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp = call <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8> %x, <16 x i8> %y) ret <16 x i8> %tmp @@ -94,8 +94,8 @@ define @vec_nxv2i64( %x, ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: vsll.vv v12, v8, v10 +; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: 
vsra.vv v14, v12, v10 ; CHECK-NEXT: vmsne.vv v10, v8, v14 ; CHECK-NEXT: vmv.v.x v8, a1 @@ -114,14 +114,14 @@ define @vec_nxv4i32( %x, ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vsll.vv v12, v8, v10 ; CHECK-NEXT: addi a1, a0, -1 -; CHECK-NEXT: vmv.v.x v12, a1 -; CHECK-NEXT: vsll.vv v14, v8, v10 -; CHECK-NEXT: vsra.vv v16, v14, v10 -; CHECK-NEXT: vmsne.vv v10, v8, v16 -; CHECK-NEXT: vmerge.vxm v8, v12, a0, v0 +; CHECK-NEXT: vsra.vv v14, v12, v10 +; CHECK-NEXT: vmsne.vv v10, v8, v14 +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vmerge.vvm v8, v14, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %tmp = call @llvm.sshl.sat.nxv4i32( %x, %y) ret %tmp @@ -133,14 +133,14 @@ define @vec_nxv8i16( %x, ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: vsll.vv v12, v8, v10 ; CHECK-NEXT: addi a1, a0, -1 -; CHECK-NEXT: vmv.v.x v12, a1 -; CHECK-NEXT: vsll.vv v14, v8, v10 -; CHECK-NEXT: vsra.vv v16, v14, v10 -; CHECK-NEXT: vmsne.vv v10, v8, v16 -; CHECK-NEXT: vmerge.vxm v8, v12, a0, v0 +; CHECK-NEXT: vsra.vv v14, v12, v10 +; CHECK-NEXT: vmsne.vv v10, v8, v14 +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vmerge.vvm v8, v14, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %tmp = call @llvm.sshl.sat.nxv8i16( %x, %y) ret %tmp @@ -152,14 +152,14 @@ define @vec_nxv16i8( %x, ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vsll.vv v12, v8, v10 +; CHECK-NEXT: vsra.vv v14, v12, v10 +; CHECK-NEXT: vmsne.vv v10, v8, v14 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: li a0, 128 -; CHECK-NEXT: vsll.vv v14, v8, v10 -; CHECK-NEXT: vsra.vv v16, v14, v10 -; CHECK-NEXT: vmsne.vv v10, v8, v16 -; CHECK-NEXT: vmerge.vxm v8, v12, a0, v0 +; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vmerge.vvm v8, v14, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %tmp = call @llvm.sshl.sat.nxv16i8( %x, %y) ret %tmp diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll index 8515b2566cd0a..62339130678d0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll @@ -577,9 +577,9 @@ define @add_stepvector_nxv16i64() { ; RV64-LABEL: add_stepvector_nxv16i64: ; RV64: # %bb.0: # %entry ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 1 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV64-NEXT: vid.v v8 +; RV64-NEXT: slli a0, a0, 1 ; RV64-NEXT: vadd.vv v8, v8, v8 ; RV64-NEXT: vadd.vx v16, v8, a0 ; RV64-NEXT: ret @@ -616,10 +616,10 @@ define @mul_stepvector_nxv16i64() { ; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV64-NEXT: vid.v v8 ; RV64-NEXT: li a0, 3 +; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a1, a0, 1 -; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: slli a0, a1, 1 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vadd.vx v16, v8, a0 ; RV64-NEXT: ret entry: @@ -635,26 +635,26 @@ define @mul_bigimm_stepvector_nxv16i64() { ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: li a0, 7 ; RV32-NEXT: lui a1, 797989 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: lui a3, 11557 +; RV32-NEXT: lui a4, 92455 ; RV32-NEXT: addi a1, a1, 
-683 +; RV32-NEXT: addi a3, a3, -683 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: lui a1, 11557 -; RV32-NEXT: addi a1, a1, -683 -; RV32-NEXT: mul a1, a0, a1 -; RV32-NEXT: srli a0, a0, 3 -; RV32-NEXT: lui a2, 92455 -; RV32-NEXT: addi a2, a2, -1368 -; RV32-NEXT: mulhu a2, a0, a2 +; RV32-NEXT: srli a0, a2, 3 +; RV32-NEXT: addi a1, a4, -1368 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: mulhu a1, a0, a1 ; RV32-NEXT: slli a3, a0, 1 ; RV32-NEXT: slli a0, a0, 6 ; RV32-NEXT: sub a0, a0, a3 -; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: sw a2, 0(sp) ; RV32-NEXT: sw a0, 4(sp) -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a1), zero ; RV32-NEXT: mv a0, sp ; RV32-NEXT: vlse64.v v16, (a0), zero ; RV32-NEXT: vid.v v24 @@ -668,12 +668,12 @@ define @mul_bigimm_stepvector_nxv16i64() { ; RV64: # %bb.0: # %entry ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: lui a1, 1987 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vid.v v8 ; RV64-NEXT: addiw a1, a1, -731 ; RV64-NEXT: slli a1, a1, 12 ; RV64-NEXT: addi a1, a1, -683 ; RV64-NEXT: mul a0, a0, a1 -; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma -; RV64-NEXT: vid.v v8 ; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: vadd.vx v16, v8, a0 ; RV64-NEXT: ret @@ -705,9 +705,9 @@ define @shl_stepvector_nxv16i64() { ; RV64-LABEL: shl_stepvector_nxv16i64: ; RV64: # %bb.0: # %entry ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 2 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV64-NEXT: vid.v v8 +; RV64-NEXT: slli a0, a0, 2 ; RV64-NEXT: vsll.vi v8, v8, 2 ; RV64-NEXT: vadd.vx v16, v8, a0 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll index 0e76518f67e16..f8315de324e42 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll @@ -765,9 +765,9 @@ declare @llvm.experimental.vp.strided.load.nxv16f64.p0.i6 define @strided_load_nxv17f64(ptr %ptr, i64 %stride, %mask, i32 zeroext %evl, ptr %hi_ptr) { ; CHECK-RV32-LABEL: strided_load_nxv17f64: ; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vmv1r.v v8, v0 ; CHECK-RV32-NEXT: csrr a2, vlenb ; CHECK-RV32-NEXT: slli a7, a2, 1 -; CHECK-RV32-NEXT: vmv1r.v v8, v0 ; CHECK-RV32-NEXT: mv a6, a3 ; CHECK-RV32-NEXT: bltu a3, a7, .LBB57_2 ; CHECK-RV32-NEXT: # %bb.1: @@ -784,12 +784,12 @@ define @strided_load_nxv17f64(ptr %ptr, i64 %stride, @strided_load_nxv17f64(ptr %ptr, i64 %stride, @strided_load_nxv17f64(ptr %ptr, i64 %stride, %v, ptr %ptr, i32 sig ; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t ; CHECK-NEXT: sub a5, a2, a3 -; CHECK-NEXT: sltu a2, a2, a5 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a5 ; CHECK-NEXT: mul a4, a4, a1 ; CHECK-NEXT: srli a3, a3, 3 -; CHECK-NEXT: vsetvli a5, zero, e8, mf4, ta, ma +; CHECK-NEXT: sltu a2, a2, a5 +; CHECK-NEXT: vsetvli a6, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a5 ; CHECK-NEXT: add a0, a0, a4 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vsse64.v v16, (a0), a1, v0.t @@ -582,19 +582,19 @@ define void @strided_store_nxv16f64( %v, ptr %ptr, i32 sig define void @strided_store_nxv16f64_allones_mask( %v, ptr %ptr, i32 signext %stride, i32 zeroext %evl) { ; 
CHECK-LABEL: strided_store_nxv16f64_allones_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: bltu a2, a3, .LBB47_2 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: bltu a2, a4, .LBB47_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a4, a3 +; CHECK-NEXT: mv a3, a4 ; CHECK-NEXT: .LBB47_2: -; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vsse64.v v8, (a0), a1 -; CHECK-NEXT: sub a3, a2, a3 -; CHECK-NEXT: sltu a2, a2, a3 +; CHECK-NEXT: sub a4, a2, a4 +; CHECK-NEXT: mul a3, a3, a1 +; CHECK-NEXT: sltu a2, a2, a4 ; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: mul a3, a4, a1 +; CHECK-NEXT: and a2, a2, a4 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vsse64.v v16, (a0), a1 @@ -609,9 +609,17 @@ declare void @llvm.experimental.vp.strided.store.nxv16f64.p0.i32( %v, ptr %ptr, i32 signext %stride, %mask, i32 zeroext %evl) { ; CHECK-LABEL: strided_store_nxv17f64: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a6, a4, 1 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: sub sp, sp, a4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a6, a4, 1 ; CHECK-NEXT: mv a5, a3 ; CHECK-NEXT: bltu a3, a6, .LBB48_2 ; CHECK-NEXT: # %bb.1: @@ -622,33 +630,27 @@ define void @strided_store_nxv17f64( %v, ptr %ptr, i32 sig ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a7, a4 ; CHECK-NEXT: .LBB48_4: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr t0, vlenb -; CHECK-NEXT: slli t0, t0, 3 -; CHECK-NEXT: sub sp, sp, t0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vl8re64.v v0, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vl8re64.v v16, (a0) ; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, ma ; CHECK-NEXT: vsse64.v v8, (a1), a2, v0.t ; CHECK-NEXT: sub a0, a5, a4 +; CHECK-NEXT: mul a7, a7, a2 +; CHECK-NEXT: srli t0, a4, 3 +; CHECK-NEXT: sub a6, a3, a6 +; CHECK-NEXT: vsetvli t1, zero, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v24, t0 ; CHECK-NEXT: sltu t0, a5, a0 +; CHECK-NEXT: add a7, a1, a7 +; CHECK-NEXT: sltu a3, a3, a6 ; CHECK-NEXT: addi t0, t0, -1 -; CHECK-NEXT: and t0, t0, a0 -; CHECK-NEXT: mul a0, a7, a2 -; CHECK-NEXT: add a7, a1, a0 -; CHECK-NEXT: srli a0, a4, 3 -; CHECK-NEXT: vsetvli t1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: sub a0, a3, a6 -; CHECK-NEXT: sltu a3, a3, a0 ; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a0, a3, a0 +; CHECK-NEXT: and t0, t0, a0 +; CHECK-NEXT: and a0, a3, a6 +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma -; CHECK-NEXT: vsse64.v v16, (a7), a2, v0.t +; CHECK-NEXT: vsse64.v v8, (a7), a2, v0.t ; CHECK-NEXT: bltu a0, a4, .LBB48_6 ; CHECK-NEXT: # %bb.5: ; CHECK-NEXT: mv a0, a4 @@ -658,10 +660,8 @@ define void @strided_store_nxv17f64( %v, ptr %ptr, i32 sig ; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma ; 
CHECK-NEXT: vslidedown.vx v0, v24, a4 ; CHECK-NEXT: add a1, a1, a3 -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vsse64.v v8, (a1), a2, v0.t +; CHECK-NEXT: vsse64.v v16, (a1), a2, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll index c81e678900ab0..ab13c78da05e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll +++ b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll @@ -41,34 +41,30 @@ define internal void @SubRegLivenessUndefInPhi(i64 %cond) { ; CHECK-NEXT: # %bb.1: # %Cond1 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vi v10, v8, 1 -; CHECK-NEXT: vadd.vi v12, v8, 3 +; CHECK-NEXT: vadd.vi v12, v8, 1 +; CHECK-NEXT: vadd.vi v10, v8, 3 ; CHECK-NEXT: j .LBB2_3 ; CHECK-NEXT: .LBB2_2: # %Cond2 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vadd.vi v10, v9, 1 +; CHECK-NEXT: vadd.vi v11, v9, 3 ; CHECK-NEXT: add a1, a0, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v9, a0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vadd.vi v11, v9, 1 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v12, v10, a0 ; CHECK-NEXT: vslideup.vx v10, v11, a0 -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vadd.vi v9, v9, 3 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v12, v9, a0 ; CHECK-NEXT: .LBB2_3: # %UseSR ; CHECK-NEXT: vl1r.v v14, (zero) ; CHECK-NEXT: vsetivli zero, 4, e8, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v13, v14, v8 -; CHECK-NEXT: vrgatherei16.vv v8, v14, v10 +; CHECK-NEXT: vrgatherei16.vv v15, v14, v8 +; CHECK-NEXT: vrgatherei16.vv v8, v14, v12 ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vv v8, v13, v8 +; CHECK-NEXT: vand.vv v8, v15, v8 ; CHECK-NEXT: vsetivli zero, 4, e8, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v9, v14, v12 +; CHECK-NEXT: vrgatherei16.vv v9, v14, v10 ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-NEXT: vand.vv v8, v8, v9 ; CHECK-NEXT: vs1r.v v8, (zero) @@ -120,10 +116,10 @@ define internal void @SubRegLivenessUndef() { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1r.v v14, (zero) ; CHECK-NEXT: vsetivli zero, 4, e8, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v13, v14, v8 +; CHECK-NEXT: vrgatherei16.vv v15, v14, v8 ; CHECK-NEXT: vrgatherei16.vv v9, v14, v10 ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vv v9, v13, v9 +; CHECK-NEXT: vand.vv v9, v15, v9 ; CHECK-NEXT: vsetivli zero, 4, e8, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v11, v14, v12 ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll b/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll index 52c2cace185f7..0bd82e654e021 100644 --- a/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll @@ -9,11 +9,11 @@ define @test_urem_vec_even_divisor_eq0( %x) ; RV32-NEXT: addi a0, a0, -1365 ; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: lui a0, 3 +; RV32-NEXT: addi a0, a0, -1366 ; RV32-NEXT: vsll.vi v9, v8, 15 ; RV32-NEXT: vsrl.vi v8, v8, 1 ; RV32-NEXT: vor.vv v8, v8, 
v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, -1366 ; RV32-NEXT: vmsgtu.vx v0, v8, a0 ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vmerge.vim v8, v8, -1, v0 @@ -25,11 +25,11 @@ define @test_urem_vec_even_divisor_eq0( %x) ; RV64-NEXT: addi a0, a0, -1365 ; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: lui a0, 3 +; RV64-NEXT: addi a0, a0, -1366 ; RV64-NEXT: vsll.vi v9, v8, 15 ; RV64-NEXT: vsrl.vi v8, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addi a0, a0, -1366 ; RV64-NEXT: vmsgtu.vx v0, v8, a0 ; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vmerge.vim v8, v8, -1, v0 @@ -81,11 +81,11 @@ define @test_urem_vec_even_divisor_eq1( %x) ; RV32-NEXT: lui a0, 1048571 ; RV32-NEXT: addi a0, a0, -1365 ; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: lui a0, 3 +; RV32-NEXT: addi a0, a0, -1366 ; RV32-NEXT: vsll.vi v9, v8, 15 ; RV32-NEXT: vsrl.vi v8, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, -1366 ; RV32-NEXT: vmsgtu.vx v0, v8, a0 ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vmerge.vim v8, v8, -1, v0 @@ -99,11 +99,11 @@ define @test_urem_vec_even_divisor_eq1( %x) ; RV64-NEXT: lui a0, 1048571 ; RV64-NEXT: addi a0, a0, -1365 ; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: lui a0, 3 +; RV64-NEXT: addi a0, a0, -1366 ; RV64-NEXT: vsll.vi v9, v8, 15 ; RV64-NEXT: vsrl.vi v8, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addi a0, a0, -1366 ; RV64-NEXT: vmsgtu.vx v0, v8, a0 ; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vmerge.vim v8, v8, -1, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll index a77208404911c..ebd550013ec78 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll @@ -1346,9 +1346,9 @@ define @vadd_vi_nxv32i32( %va, @vadd_vi_nxv32i32_evl_nx8( %va, @vdiv_vi_nxv1i64_0( %va) { ; RV32-V-NEXT: addi sp, sp, -16 ; RV32-V-NEXT: .cfi_def_cfa_offset 16 ; RV32-V-NEXT: lui a0, 748983 -; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: lui a1, 898779 +; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: addi a1, a1, 1755 ; RV32-V-NEXT: sw a1, 8(sp) ; RV32-V-NEXT: sw a0, 12(sp) ; RV32-V-NEXT: addi a0, sp, 8 ; RV32-V-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; RV32-V-NEXT: vlse64.v v9, (a0), zero -; RV32-V-NEXT: vmulh.vv v8, v8, v9 ; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vmulh.vv v8, v8, v9 ; RV32-V-NEXT: vsrl.vx v9, v8, a0 ; RV32-V-NEXT: vsra.vi v8, v8, 1 ; RV32-V-NEXT: vadd.vv v8, v8, v9 @@ -878,16 +878,16 @@ define @vdiv_vi_nxv2i64_0( %va) { ; RV32-V-NEXT: addi sp, sp, -16 ; RV32-V-NEXT: .cfi_def_cfa_offset 16 ; RV32-V-NEXT: lui a0, 748983 -; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: lui a1, 898779 +; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: addi a1, a1, 1755 ; RV32-V-NEXT: sw a1, 8(sp) ; RV32-V-NEXT: sw a0, 12(sp) ; RV32-V-NEXT: addi a0, sp, 8 ; RV32-V-NEXT: vsetvli a1, zero, e64, m2, ta, ma ; RV32-V-NEXT: vlse64.v v10, (a0), zero -; RV32-V-NEXT: vmulh.vv v8, v8, v10 ; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vmulh.vv v8, v8, v10 ; RV32-V-NEXT: vsrl.vx v10, v8, a0 ; RV32-V-NEXT: vsra.vi v8, v8, 1 ; RV32-V-NEXT: vadd.vv v8, v8, v10 @@ -959,16 +959,16 @@ define @vdiv_vi_nxv4i64_0( %va) { ; RV32-V-NEXT: addi sp, sp, -16 ; RV32-V-NEXT: .cfi_def_cfa_offset 16 ; RV32-V-NEXT: lui a0, 748983 -; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: lui a1, 898779 +; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: addi a1, a1, 1755 ; RV32-V-NEXT: sw a1, 8(sp) ; RV32-V-NEXT: sw a0, 12(sp) ; RV32-V-NEXT: addi a0, sp, 8 
; RV32-V-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; RV32-V-NEXT: vlse64.v v12, (a0), zero -; RV32-V-NEXT: vmulh.vv v8, v8, v12 ; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vmulh.vv v8, v8, v12 ; RV32-V-NEXT: vsrl.vx v12, v8, a0 ; RV32-V-NEXT: vsra.vi v8, v8, 1 ; RV32-V-NEXT: vadd.vv v8, v8, v12 @@ -1040,16 +1040,16 @@ define @vdiv_vi_nxv8i64_0( %va) { ; RV32-V-NEXT: addi sp, sp, -16 ; RV32-V-NEXT: .cfi_def_cfa_offset 16 ; RV32-V-NEXT: lui a0, 748983 -; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: lui a1, 898779 +; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: addi a1, a1, 1755 ; RV32-V-NEXT: sw a1, 8(sp) ; RV32-V-NEXT: sw a0, 12(sp) ; RV32-V-NEXT: addi a0, sp, 8 ; RV32-V-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-V-NEXT: vlse64.v v16, (a0), zero -; RV32-V-NEXT: vmulh.vv v8, v8, v16 ; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vmulh.vv v8, v8, v16 ; RV32-V-NEXT: vsrl.vx v16, v8, a0 ; RV32-V-NEXT: vsra.vi v8, v8, 1 ; RV32-V-NEXT: vadd.vv v8, v8, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll index 470d61aa96a23..c7b5200979370 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll @@ -11,10 +11,10 @@ define @vdiv_vx_nxv8i7( %a, i7 signext %b, @vdivu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vdivu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vdivu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll index b19995a5aba9a..8e3cedfbeeb03 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll @@ -14,24 +14,24 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) { ; RV32-NEXT: lw a0, 0(a0) ; RV32-NEXT: srli a2, a0, 16 ; RV32-NEXT: slli a3, a0, 16 -; RV32-NEXT: srli a4, a3, 24 +; RV32-NEXT: slli a4, a0, 24 +; RV32-NEXT: slli a5, a0, 8 +; RV32-NEXT: srli a6, a3, 24 ; RV32-NEXT: srai a3, a3, 24 -; RV32-NEXT: slli a5, a0, 24 +; RV32-NEXT: srai a4, a4, 24 ; RV32-NEXT: srai a5, a5, 24 -; RV32-NEXT: slli a6, a0, 8 -; RV32-NEXT: srai a6, a6, 24 -; RV32-NEXT: sgtz a6, a6 ; RV32-NEXT: sgtz a5, a5 +; RV32-NEXT: sgtz a4, a4 ; RV32-NEXT: sgtz a3, a3 ; RV32-NEXT: neg a3, a3 -; RV32-NEXT: and a3, a3, a4 -; RV32-NEXT: slli a3, a3, 8 -; RV32-NEXT: neg a4, a5 +; RV32-NEXT: neg a4, a4 +; RV32-NEXT: neg a5, a5 +; RV32-NEXT: and a3, a3, a6 ; RV32-NEXT: and a0, a4, a0 +; RV32-NEXT: and a2, a5, a2 +; RV32-NEXT: slli a3, a3, 8 ; RV32-NEXT: andi a0, a0, 255 ; RV32-NEXT: or a0, a0, a3 -; RV32-NEXT: neg a3, a6 -; RV32-NEXT: and a2, a3, a2 ; RV32-NEXT: sh a0, 0(a1) ; RV32-NEXT: sb a2, 2(a1) ; RV32-NEXT: ret @@ -41,24 +41,24 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) { ; RV64-NEXT: lw a0, 0(a0) ; RV64-NEXT: srliw a2, a0, 16 ; RV64-NEXT: slli a3, a0, 48 -; RV64-NEXT: srli a4, a3, 56 +; RV64-NEXT: slli a4, a0, 56 +; RV64-NEXT: slli a5, a0, 40 +; RV64-NEXT: srli a6, a3, 56 ; RV64-NEXT: srai a3, a3, 56 -; RV64-NEXT: slli a5, a0, 56 +; RV64-NEXT: srai a4, a4, 56 ; RV64-NEXT: srai a5, a5, 56 -; RV64-NEXT: slli a6, a0, 40 -; RV64-NEXT: srai a6, a6, 56 -; RV64-NEXT: sgtz a6, a6 ; RV64-NEXT: sgtz a5, a5 +; RV64-NEXT: sgtz a4, a4 ; RV64-NEXT: sgtz a3, a3 ; RV64-NEXT: negw a3, a3 
-; RV64-NEXT: and a3, a3, a4 -; RV64-NEXT: slli a3, a3, 8 -; RV64-NEXT: negw a4, a5 +; RV64-NEXT: negw a4, a4 +; RV64-NEXT: negw a5, a5 +; RV64-NEXT: and a3, a3, a6 ; RV64-NEXT: and a0, a4, a0 +; RV64-NEXT: and a2, a5, a2 +; RV64-NEXT: slli a3, a3, 8 ; RV64-NEXT: andi a0, a0, 255 ; RV64-NEXT: or a0, a0, a3 -; RV64-NEXT: negw a3, a6 -; RV64-NEXT: and a2, a3, a2 ; RV64-NEXT: sh a0, 0(a1) ; RV64-NEXT: sb a2, 2(a1) ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 075e463e41a6b..54d2f3f68989b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -11,25 +11,26 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) { ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: vmerge.vim v10, v9, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v12, v9, 1, v0 -; CHECK-NEXT: vnsrl.wi v8, v12, 0 +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: vadd.vv v11, v9, v9 ; CHECK-NEXT: li a0, -256 +; CHECK-NEXT: vmerge.vim v11, v10, 1, v0 +; CHECK-NEXT: vadd.vv v12, v9, v9 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vadd.vi v9, v11, -16 -; CHECK-NEXT: vrgather.vv v8, v10, v9, v0.t -; CHECK-NEXT: vmsne.vi v9, v8, 0 -; CHECK-NEXT: vnsrl.wi v8, v12, 8 -; CHECK-NEXT: vadd.vi v11, v11, -15 -; CHECK-NEXT: vrgather.vv v8, v10, v11, v0.t +; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 +; CHECK-NEXT: vadd.vi v8, v12, -16 +; CHECK-NEXT: vadd.vi v12, v12, -15 +; CHECK-NEXT: vnsrl.wi v10, v14, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrgather.vv v10, v11, v8, v0.t +; CHECK-NEXT: vnsrl.wi v8, v14, 8 +; CHECK-NEXT: vmsne.vi v10, v10, 0 +; CHECK-NEXT: vrgather.vv v8, v11, v12, v0.t ; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: vmv.v.v v0, v9 +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec) ret {<16 x i1>, <16 x i1>} %retval @@ -95,16 +96,17 @@ define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 4 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v0, 12 ; CHECK-NEXT: vadd.vv v14, v12, v12 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vrgatherei16.vv v12, v8, v14 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vadd.vi v15, v14, -4 -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v16, v15, v0.t +; CHECK-NEXT: vadd.vi v10, v14, -4 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vrgatherei16.vv v12, v16, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vadd.vi v15, v14, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma @@ -122,27 +124,28 @@ define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) { define {<8 x i64>, <8 x i64>} 
@vector_deinterleave_v8i64_v16i64(<16 x i64> %vec) { ; CHECK-LABEL: vector_deinterleave_v8i64_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v16, v8 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v7, v8, v8 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v8, v16, v7 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: vmv.v.i v0, -16 -; CHECK-NEXT: vadd.vi v12, v7, -8 ; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v8, v24, v12, v0.t +; CHECK-NEXT: vslidedown.vi v24, v8, 8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vadd.vv v20, v16, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v16, v8, v20 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vadd.vi v12, v20, -8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; CHECK-NEXT: vrgatherei16.vv v16, v24, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vadd.vi v20, v7, 1 +; CHECK-NEXT: vadd.vi v21, v20, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v16, v20 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v21 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vadd.vi v16, v7, -7 +; CHECK-NEXT: vadd.vi v8, v20, -7 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v24, v16, v0.t +; CHECK-NEXT: vrgatherei16.vv v12, v24, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret %retval = call {<8 x i64>, <8 x i64>} @llvm.vector.deinterleave2.v16i64(<16 x i64> %vec) ret {<8 x i64>, <8 x i64>} %retval @@ -239,16 +242,17 @@ define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 4 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v0, 12 ; CHECK-NEXT: vadd.vv v14, v12, v12 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vrgatherei16.vv v12, v8, v14 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vadd.vi v15, v14, -4 -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v16, v15, v0.t +; CHECK-NEXT: vadd.vi v10, v14, -4 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vrgatherei16.vv v12, v16, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vadd.vi v15, v14, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index f4c7f0f13e984..39a1bfcda3d83 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -21,8 +21,8 @@ define {, } @vector_deinterleave_load_nxv16i ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 ; CHECK-NEXT: vnsrl.wi v8, v12, 0 -; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: vnsrl.wi v10, v12, 8 +; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: vmsne.vi v8, v10, 0 ; CHECK-NEXT: ret %vec = load , ptr %p @@ -106,81 +106,96 @@ define {, } 
@vector_deinterleave_load_nxv8i6 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: li a2, 40 +; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vl8re64.v v16, (a0) +; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vadd.vv v24, v8, v8 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vl8re64.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a2, 24 -; CHECK-NEXT: mul a0, a0, a2 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vl8re64.v v0, (a1) +; CHECK-NEXT: vadd.vi v8, v24, 1 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v16, v8, v8 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vrgather.vv v8, v16, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v24, v8, v16 -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v24, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vrgather.vv v8, v0, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v8, v16, 1 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v24, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v0, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v24, v0, v8 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; 
CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv4r.v v12, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v28, v8 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v20, v8 -; CHECK-NEXT: vmv8r.v v8, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv4r.v v28, v16 +; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index 201cfaa931b41..f20a90a422313 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -20,8 +20,8 @@ define {, } @vector_deinterleave_nxv16i1_nxv ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 ; CHECK-NEXT: vnsrl.wi v8, v12, 0 -; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: vnsrl.wi v10, v12, 8 +; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: vmsne.vi v8, v10, 0 ; CHECK-NEXT: ret %retval = call {, } @llvm.vector.deinterleave2.nxv32i1( %vec) @@ -109,40 +109,20 @@ declare {, } @llvm.vector.deinterleave2.nxv4 define {, } @vector_deinterleave_nxv64i1_nxv128i1( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv64i1_nxv128i1: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v12, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vmerge.vim v16, v24, 1, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v16, 0 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v24, v24, 1, v0 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v16, 0 +; CHECK-NEXT: vnsrl.wi v0, v16, 8 ; CHECK-NEXT: vnsrl.wi v12, v24, 0 +; CHECK-NEXT: vnsrl.wi v4, v24, 8 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; CHECK-NEXT: vnsrl.wi v16, v8, 8 -; CHECK-NEXT: vnsrl.wi v20, v24, 8 -; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; CHECK-NEXT: vmsne.vi v8, v16, 0 -; CHECK-NEXT: csrr 
a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: vmsne.vi v8, v0, 0 +; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret %retval = call {, } @llvm.vector.deinterleave2.nxv128i1( %vec) ret {, } %retval @@ -154,8 +134,8 @@ define {, } @vector_deinterleave_nxv64i8_nxv ; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v24, 0 -; CHECK-NEXT: vnsrl.wi v12, v16, 0 ; CHECK-NEXT: vnsrl.wi v0, v24, 8 +; CHECK-NEXT: vnsrl.wi v12, v16, 0 ; CHECK-NEXT: vnsrl.wi v4, v16, 8 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: ret @@ -169,8 +149,8 @@ define {, } @vector_deinterleave_nxv32i16_ ; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v24, 0 -; CHECK-NEXT: vnsrl.wi v12, v16, 0 ; CHECK-NEXT: vnsrl.wi v0, v24, 16 +; CHECK-NEXT: vnsrl.wi v12, v16, 0 ; CHECK-NEXT: vnsrl.wi v4, v16, 16 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: ret @@ -410,8 +390,8 @@ define {, } @vector_deinterleave_nxv ; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v24, 0 -; CHECK-NEXT: vnsrl.wi v12, v16, 0 ; CHECK-NEXT: vnsrl.wi v0, v24, 16 +; CHECK-NEXT: vnsrl.wi v12, v16, 0 ; CHECK-NEXT: vnsrl.wi v4, v16, 16 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: ret @@ -425,8 +405,8 @@ define {, } @vector_deinterleave_nxv32f1 ; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v24, 0 -; CHECK-NEXT: vnsrl.wi v12, v16, 0 ; CHECK-NEXT: vnsrl.wi v0, v24, 16 +; CHECK-NEXT: vnsrl.wi v12, v16, 0 ; CHECK-NEXT: vnsrl.wi v4, v16, 16 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll index e730ae230d5a0..7b0ac01918b9b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -9,9 +9,9 @@ define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) { ; CHECK-LABEL: vector_interleave_v32i1_v16i1: ; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vslideup.vi v0, v8, 2 -; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 @@ -27,9 +27,9 @@ define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) { ; ; ZVBB-LABEL: vector_interleave_v32i1_v16i1: ; ZVBB: # %bb.0: +; ZVBB-NEXT: li a0, 32 ; ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVBB-NEXT: vslideup.vi v0, v8, 2 -; ZVBB-NEXT: li a0, 32 ; ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; ZVBB-NEXT: vmv.v.i v8, 0 ; ZVBB-NEXT: vmerge.vim v8, v8, 1, v0 @@ -92,10 +92,10 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vector_interleave_v4i64_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v9 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 2 ; CHECK-NEXT: lui a0, 12304 ; CHECK-NEXT: addi a0, a0, 512 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 2 ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vsext.vf2 v12, v10 @@ -107,10 +107,10 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) 
{ ; ZVBB-LABEL: vector_interleave_v4i64_v2i64: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vmv1r.v v10, v9 -; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v10, 2 ; ZVBB-NEXT: lui a0, 12304 ; ZVBB-NEXT: addi a0, a0, 512 +; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v10, 2 ; ZVBB-NEXT: vmv.s.x v10, a0 ; ZVBB-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVBB-NEXT: vsext.vf2 v12, v10 @@ -240,10 +240,10 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double> ; CHECK-LABEL: vector_interleave_v4f64_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v9 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 2 ; CHECK-NEXT: lui a0, 12304 ; CHECK-NEXT: addi a0, a0, 512 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 2 ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vsext.vf2 v12, v10 @@ -255,10 +255,10 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double> ; ZVBB-LABEL: vector_interleave_v4f64_v2f64: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vmv1r.v v10, v9 -; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v10, 2 ; ZVBB-NEXT: lui a0, 12304 ; ZVBB-NEXT: addi a0, a0, 512 +; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v10, 2 ; ZVBB-NEXT: vmv.s.x v10, a0 ; ZVBB-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVBB-NEXT: vsext.vf2 v12, v10 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index 16ce25f86462e..bc203e215d878 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -10,22 +10,22 @@ define void @vector_interleave_store_nxv32i1_nxv16i1( %a, %a, %a, %b, ptr %p) { ; CHECK-LABEL: vector_interleave_store_nxv16i64_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vid.v v6 +; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: srli a2, a1, 1 -; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, mu -; CHECK-NEXT: vid.v v24 -; CHECK-NEXT: vand.vi v26, v24, 1 -; CHECK-NEXT: vmsne.vi v28, v26, 0 -; CHECK-NEXT: vsrl.vi v24, v24, 1 -; CHECK-NEXT: vmv1r.v v0, v28 -; CHECK-NEXT: vadd.vx v24, v24, a2, v0.t -; CHECK-NEXT: vmv4r.v v12, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vrgatherei16.vv v0, v8, v24 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vmv4r.v v28, v16 ; CHECK-NEXT: vmv4r.v v16, v12 -; CHECK-NEXT: vrgatherei16.vv v8, v16, v24 +; CHECK-NEXT: vsrl.vi v8, v6, 1 +; CHECK-NEXT: vand.vi v10, v6, 1 ; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: vmsne.vi v0, v10, 0 ; CHECK-NEXT: add a1, a0, a1 -; CHECK-NEXT: vs8r.v v8, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, 
sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vs8r.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vadd.vx v8, v8, a2, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vrgatherei16.vv v0, v24, v8 +; CHECK-NEXT: vrgatherei16.vv v24, v16, v8 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vs8r.v v0, (a0) ; CHECK-NEXT: ret %res = call @llvm.vector.interleave2.nxv16i64( %a, %b) store %res, ptr %p diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll index 4d14d0013236f..26e9afcb1d109 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -12,42 +12,42 @@ define @vector_interleave_nxv32i1_nxv16i1( ; CHECK-LABEL: vector_interleave_nxv32i1_nxv16i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: srli a1, a1, 2 ; CHECK-NEXT: vwaddu.vv v16, v8, v12 -; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vwmaccu.vx v16, a0, v12 ; CHECK-NEXT: vmsne.vi v8, v18, 0 ; CHECK-NEXT: vmsne.vi v0, v16, 0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v0, v8, a0 +; CHECK-NEXT: add a0, a1, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v0, v8, a1 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: vector_interleave_nxv32i1_nxv16i1: ; ZVBB: # %bb.0: ; ZVBB-NEXT: vmv1r.v v9, v0 +; ZVBB-NEXT: vmv1r.v v0, v8 ; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, mu ; ZVBB-NEXT: vmv.v.i v10, 0 -; ZVBB-NEXT: vmv1r.v v0, v8 +; ZVBB-NEXT: li a0, 1 +; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: vmerge.vim v10, v10, 1, v0 +; ZVBB-NEXT: srli a1, a1, 2 ; ZVBB-NEXT: vwsll.vi v12, v10, 8 -; ZVBB-NEXT: li a0, 1 ; ZVBB-NEXT: vmv1r.v v0, v9 ; ZVBB-NEXT: vwaddu.wx v12, v12, a0, v0.t ; ZVBB-NEXT: vmsne.vi v8, v14, 0 ; ZVBB-NEXT: vmsne.vi v0, v12, 0 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: srli a0, a0, 2 -; ZVBB-NEXT: add a1, a0, a0 -; ZVBB-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v8, a0 +; ZVBB-NEXT: add a0, a1, a1 +; ZVBB-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v8, a1 ; ZVBB-NEXT: ret %res = call @llvm.vector.interleave2.nxv32i1( %a, %b) ret %res @@ -121,9 +121,9 @@ define @vector_interleave_nxv4i64_nxv2i64( ; CHECK-LABEL: vector_interleave_nxv4i64_nxv2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu ; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vand.vi v13, v12, 1 ; CHECK-NEXT: vmsne.vi v0, v13, 0 ; CHECK-NEXT: vsrl.vi v16, v12, 1 @@ -136,9 +136,9 @@ define @vector_interleave_nxv4i64_nxv2i64( ; ZVBB-LABEL: vector_interleave_nxv4i64_nxv2i64: ; ZVBB: # %bb.0: ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: srli a0, a0, 2 ; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, mu ; ZVBB-NEXT: vid.v v12 +; ZVBB-NEXT: srli a0, a0, 2 ; ZVBB-NEXT: vand.vi v13, v12, 1 ; ZVBB-NEXT: vmsne.vi v0, v13, 0 ; 
ZVBB-NEXT: vsrl.vi v16, v12, 1 @@ -161,23 +161,22 @@ define @vector_interleave_nxv128i1_nxv64i1( @vector_interleave_nxv128i1_nxv64i1( @llvm.vector.interleave2.nxv128i1( %a, %b) ret %res @@ -209,8 +207,8 @@ define @vector_interleave_nxv128i8_nxv64i8( @vector_interleave_nxv128i8_nxv64i8( @vector_interleave_nxv64i16_nxv32i16( @vector_interleave_nxv64i16_nxv32i16( @vector_interleave_nxv32i32_nxv16i32( @vector_interleave_nxv32i32_nxv16i32( @vector_interleave_nxv32i32_nxv16i32( @vector_interleave_nxv16i64_nxv8i64( %a, %b) { ; CHECK-LABEL: vector_interleave_nxv16i64_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv8r.v v0, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu -; CHECK-NEXT: vid.v v24 -; CHECK-NEXT: vand.vi v26, v24, 1 -; CHECK-NEXT: vmsne.vi v10, v26, 0 -; CHECK-NEXT: vsrl.vi v8, v24, 1 -; CHECK-NEXT: vmv8r.v v24, v0 -; CHECK-NEXT: vmv4r.v v12, v4 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: vid.v v6 +; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: vmv4r.v v28, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vrgatherei16.vv v0, v24, v8 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv4r.v v16, v12 -; CHECK-NEXT: vrgatherei16.vv v24, v16, v8 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vand.vi v8, v6, 1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vsrl.vi v6, v6, 1 +; CHECK-NEXT: vadd.vx v6, v6, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vrgatherei16.vv v8, v24, v6 +; CHECK-NEXT: vrgatherei16.vv v24, v16, v6 ; CHECK-NEXT: vmv.v.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: vector_interleave_nxv16i64_nxv8i64: ; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: .cfi_def_cfa_offset 16 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 3 -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; ZVBB-NEXT: vmv8r.v v0, v8 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: srli a0, a0, 1 ; ZVBB-NEXT: vsetvli a1, zero, e16, m2, ta, mu -; ZVBB-NEXT: vid.v v24 -; ZVBB-NEXT: vand.vi v26, v24, 1 -; ZVBB-NEXT: vmsne.vi v10, v26, 0 -; ZVBB-NEXT: vsrl.vi v8, v24, 1 -; ZVBB-NEXT: vmv8r.v v24, v0 -; ZVBB-NEXT: vmv4r.v v12, v4 -; ZVBB-NEXT: vmv1r.v v0, v10 -; ZVBB-NEXT: vadd.vx v8, v8, a0, v0.t +; ZVBB-NEXT: vid.v v6 +; ZVBB-NEXT: vmv8r.v v24, v8 +; ZVBB-NEXT: srli a0, a0, 1 ; ZVBB-NEXT: vmv4r.v v28, v16 -; ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; ZVBB-NEXT: vrgatherei16.vv v0, v24, v8 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; ZVBB-NEXT: vmv4r.v v16, v12 -; ZVBB-NEXT: vrgatherei16.vv v24, v16, v8 -; ZVBB-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVBB-NEXT: vand.vi v8, v6, 1 +; ZVBB-NEXT: vmsne.vi v0, v8, 0 +; ZVBB-NEXT: vsrl.vi v6, v6, 1 +; ZVBB-NEXT: vadd.vx v6, v6, a0, v0.t 
+; ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; ZVBB-NEXT: vrgatherei16.vv v8, v24, v6 +; ZVBB-NEXT: vrgatherei16.vv v24, v16, v6 ; ZVBB-NEXT: vmv.v.v v16, v24 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 3 -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: .cfi_def_cfa sp, 16 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: .cfi_def_cfa_offset 0 ; ZVBB-NEXT: ret %res = call @llvm.vector.interleave2.nxv16i64( %a, %b) ret %res @@ -376,14 +338,14 @@ define @vector_interleave_nxv4bf16_nxv2bf16( @vector_interleave_nxv4bf16_nxv2bf16( @vector_interleave_nxv4f16_nxv2f16( @vector_interleave_nxv4f16_nxv2f16( @vector_interleave_nxv4f64_nxv2f64( @vector_interleave_nxv4f64_nxv2f64( @vector_interleave_nxv64bf16_nxv32bf16( @vector_interleave_nxv64bf16_nxv32bf16( @vector_interleave_nxv64f16_nxv32f16( @vector_interleave_nxv64f16_nxv32f16( @vector_interleave_nxv32f32_nxv16f32( @vector_interleave_nxv32f32_nxv16f32( @vector_interleave_nxv32f32_nxv16f32( @vector_interleave_nxv16f64_nxv8f64( %a, %b) { ; CHECK-LABEL: vector_interleave_nxv16f64_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv8r.v v0, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu -; CHECK-NEXT: vid.v v24 -; CHECK-NEXT: vand.vi v26, v24, 1 -; CHECK-NEXT: vmsne.vi v10, v26, 0 -; CHECK-NEXT: vsrl.vi v8, v24, 1 -; CHECK-NEXT: vmv8r.v v24, v0 -; CHECK-NEXT: vmv4r.v v12, v4 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: vid.v v6 +; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: vmv4r.v v28, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vrgatherei16.vv v0, v24, v8 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv4r.v v16, v12 -; CHECK-NEXT: vrgatherei16.vv v24, v16, v8 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vand.vi v8, v6, 1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vsrl.vi v6, v6, 1 +; CHECK-NEXT: vadd.vx v6, v6, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vrgatherei16.vv v8, v24, v6 +; CHECK-NEXT: vrgatherei16.vv v24, v16, v6 ; CHECK-NEXT: vmv.v.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: vector_interleave_nxv16f64_nxv8f64: ; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: .cfi_def_cfa_offset 16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 3 -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; ZVBB-NEXT: vmv8r.v v0, v8 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: srli a0, a0, 1 ; ZVBB-NEXT: vsetvli a1, zero, e16, m2, ta, mu -; ZVBB-NEXT: vid.v v24 -; ZVBB-NEXT: vand.vi v26, v24, 1 -; ZVBB-NEXT: vmsne.vi v10, v26, 0 -; ZVBB-NEXT: vsrl.vi v8, v24, 1 -; ZVBB-NEXT: vmv8r.v v24, v0 -; ZVBB-NEXT: vmv4r.v v12, v4 -; ZVBB-NEXT: vmv1r.v v0, v10 -; ZVBB-NEXT: vadd.vx v8, v8, a0, v0.t +; ZVBB-NEXT: vid.v v6 +; ZVBB-NEXT: vmv8r.v v24, v8 +; ZVBB-NEXT: srli a0, a0, 1 ; 
ZVBB-NEXT: vmv4r.v v28, v16 -; ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; ZVBB-NEXT: vrgatherei16.vv v0, v24, v8 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; ZVBB-NEXT: vmv4r.v v16, v12 -; ZVBB-NEXT: vrgatherei16.vv v24, v16, v8 -; ZVBB-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVBB-NEXT: vand.vi v8, v6, 1 +; ZVBB-NEXT: vmsne.vi v0, v8, 0 +; ZVBB-NEXT: vsrl.vi v6, v6, 1 +; ZVBB-NEXT: vadd.vx v6, v6, a0, v0.t +; ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; ZVBB-NEXT: vrgatherei16.vv v8, v24, v6 +; ZVBB-NEXT: vrgatherei16.vv v24, v16, v6 ; ZVBB-NEXT: vmv.v.v v16, v24 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 3 -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: .cfi_def_cfa sp, 16 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: .cfi_def_cfa_offset 0 ; ZVBB-NEXT: ret %res = call @llvm.vector.interleave2.nxv16f64( %a, %b) ret %res diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll index 5460caea196cf..6a72043ca7e8e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll @@ -12,18 +12,18 @@ define @splice_nxv1i1_offset_negone( %a, @llvm.vector.splice.nxv1i1( %a, %b, i32 -1) @@ -34,20 +34,20 @@ define @splice_nxv1i1_offset_max( %a, @llvm.vector.splice.nxv1i1( %a, %b, i32 1) @@ -60,18 +60,18 @@ define @splice_nxv2i1_offset_negone( %a, @llvm.vector.splice.nxv2i1( %a, %b, i32 -1) @@ -82,20 +82,20 @@ define @splice_nxv2i1_offset_max( %a, @llvm.vector.splice.nxv2i1( %a, %b, i32 3) @@ -108,18 +108,18 @@ define @splice_nxv4i1_offset_negone( %a, @llvm.vector.splice.nxv4i1( %a, %b, i32 -1) @@ -130,20 +130,20 @@ define @splice_nxv4i1_offset_max( %a, @llvm.vector.splice.nxv4i1( %a, %b, i32 7) @@ -156,17 +156,17 @@ define @splice_nxv8i1_offset_negone( %a, @llvm.vector.splice.nxv8i1( %a, %b, i32 -1) @@ -177,19 +177,19 @@ define @splice_nxv8i1_offset_max( %a, @llvm.vector.splice.nxv8i1( %a, %b, i32 15) @@ -202,13 +202,13 @@ define @splice_nxv16i1_offset_negone( %a, < ; CHECK-LABEL: splice_nxv16i1_offset_negone: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 -; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vslidedown.vx v8, v8, a0 @@ -224,13 +224,13 @@ define @splice_nxv16i1_offset_max( %a, @splice_nxv32i1_offset_negone( %a, < ; CHECK-LABEL: splice_nxv32i1_offset_negone: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.i v12, 0 -; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 -; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vslidedown.vx v8, v8, a0 @@ -273,11 +273,11 @@ define @splice_nxv32i1_offset_max( %a, @splice_nxv64i1_offset_negone( %a, < ; CHECK-LABEL: splice_nxv64i1_offset_negone: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v24, 0 -; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: vmerge.vim v16, v24, 1, v0 ; 
CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 -; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vslidedown.vx v8, v8, a0 @@ -320,11 +320,11 @@ define @splice_nxv64i1_offset_max( %a, @vfabs_vv_nxv16f64( %va, @vfadd_vf_nxv16bf16( %va, bf define @vfadd_vv_nxv32bf16( %va, %vb) strictfp { ; CHECK-LABEL: vfadd_vv_nxv32bf16: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v24, v0, v24 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v0, v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfadd.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret entry: %vc = call @llvm.experimental.constrained.fadd.nxv32bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") @@ -212,22 +227,39 @@ entry: define @vfadd_vf_nxv32bf16( %va, bfloat %b) strictfp { ; CHECK-LABEL: vfadd_vf_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v24, v24, v0 +; CHECK-NEXT: vfadd.vv v0, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v16, v24 +; CHECK-NEXT: vfadd.vv v16, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; 
CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -485,19 +517,34 @@ define @vfadd_vv_nxv32f16( %va, @llvm.experimental.constrained.fadd.nxv32f16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") @@ -513,22 +560,39 @@ define @vfadd_vf_nxv32f16( %va, half %b ; ; ZVFHMIN-LABEL: vfadd_vf_nxv32f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v24, v24, v0 +; ZVFHMIN-NEXT: vfadd.vv v0, v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24 +; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll index 53a13b511a799..19c5ee4a85ed6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll @@ -204,19 +204,34 @@ define @vfadd_vf_nxv16bf16( %va, bf define @vfadd_vv_nxv32bf16( %va, %vb) { ; CHECK-LABEL: vfadd_vv_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v24, v0, v24 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; 
CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v0, v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfadd.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vc = fadd %va, %vb ret %vc @@ -225,22 +240,39 @@ define @vfadd_vv_nxv32bf16( %va, @vfadd_vf_nxv32bf16( %va, bfloat %b) { ; CHECK-LABEL: vfadd_vf_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v24, v24, v0 +; CHECK-NEXT: vfadd.vv v0, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v24, v0 +; CHECK-NEXT: vfadd.vv v16, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -512,19 +544,34 @@ define @vfadd_vv_nxv32f16( %va, %va, %vb ret %vc @@ -539,22 +586,39 @@ define @vfadd_vf_nxv32f16( %va, half %b ; ; ZVFHMIN-LABEL: vfadd_vf_nxv32f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: 
vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v24, v24, v0 +; ZVFHMIN-NEXT: vfadd.vv v0, v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v20 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v16, v24, v0 +; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll index 31762a7d840ec..1953cfd2a0169 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll @@ -413,20 +413,21 @@ define @vfadd_vv_nxv32bf16( %va, @vfadd_vv_nxv32bf16_unmasked( @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb -; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 +; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a1 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 3 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: slli a3, a1, 3 +; CHECK-NEXT: add a1, a3, a1 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: csrr a4, vlenb ; CHECK-NEXT: slli a4, a4, 3 ; CHECK-NEXT: add a4, sp, a4 @@ -540,18 +543,18 @@ define @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a4, a2, 3 -; CHECK-NEXT: add a2, a4, a2 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a4, a3, 3 +; CHECK-NEXT: add a3, 
a4, a3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vfadd.vv v16, v8, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 @@ -603,31 +606,37 @@ define @vfadd_vf_nxv32bf16_unmasked( @vfadd_vv_nxv32f16( %va, @vfadd_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 @@ -1313,23 +1324,24 @@ define @vfadd_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb -; ZVFHMIN-NEXT: vmv8r.v v24, v8 +; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 3 -; ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: slli a3, a1, 3 +; ZVFHMIN-NEXT: add a1, a3, a1 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 @@ -1337,18 +1349,18 @@ define @vfadd_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; 
ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 3 -; ZVFHMIN-NEXT: add a2, a4, a2 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a4, a3, 3 +; ZVFHMIN-NEXT: add a3, a4, a3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 @@ -1406,31 +1418,37 @@ define @vfadd_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vmv4r.v v16, v8 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, 
v16 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB51_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll index 21c5f757e4558..ec6ab422d6405 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll @@ -57,8 +57,8 @@ define @fcmp_ogt_vf_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -73,8 +73,8 @@ define @fcmp_ogt_fv_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -102,8 +102,8 @@ define @fcmp_oge_vf_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -118,8 +118,8 @@ define @fcmp_oge_fv_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -147,8 +147,8 @@ define @fcmp_olt_vf_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -163,8 +163,8 @@ define @fcmp_olt_fv_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -192,8 +192,8 @@ define @fcmp_ole_vf_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -208,8 +208,8 @@ define @fcmp_ole_fv_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -240,8 +240,8 @@ define @fcmp_one_vf_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v 
v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -259,8 +259,8 @@ define @fcmp_one_fv_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -336,8 +336,8 @@ define @fcmp_ueq_vf_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -355,8 +355,8 @@ define @fcmp_ueq_fv_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -388,8 +388,8 @@ define @fcmp_ugt_vf_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -405,8 +405,8 @@ define @fcmp_ugt_fv_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -436,8 +436,8 @@ define @fcmp_uge_vf_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -453,8 +453,8 @@ define @fcmp_uge_fv_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -484,8 +484,8 @@ define @fcmp_ult_vf_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -501,8 +501,8 @@ define @fcmp_ult_fv_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -532,8 +532,8 @@ define @fcmp_ule_vf_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: 
vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -549,8 +549,8 @@ define @fcmp_ule_fv_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -690,8 +690,8 @@ define @fcmp_ogt_vf_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -706,8 +706,8 @@ define @fcmp_ogt_fv_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -735,8 +735,8 @@ define @fcmp_oge_vf_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -751,8 +751,8 @@ define @fcmp_oge_fv_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -780,8 +780,8 @@ define @fcmp_olt_vf_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -796,8 +796,8 @@ define @fcmp_olt_fv_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -825,8 +825,8 @@ define @fcmp_ole_vf_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -841,8 +841,8 @@ define @fcmp_ole_fv_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -873,8 +873,8 @@ define @fcmp_one_vf_nxv2f16( %va, 
half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -892,8 +892,8 @@ define @fcmp_one_fv_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -969,8 +969,8 @@ define @fcmp_ueq_vf_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -988,8 +988,8 @@ define @fcmp_ueq_fv_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -1021,8 +1021,8 @@ define @fcmp_ugt_vf_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1038,8 +1038,8 @@ define @fcmp_ugt_fv_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1069,8 +1069,8 @@ define @fcmp_uge_vf_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1086,8 +1086,8 @@ define @fcmp_uge_fv_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1117,8 +1117,8 @@ define @fcmp_ult_vf_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1134,8 +1134,8 @@ define @fcmp_ult_fv_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; 
CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1165,8 +1165,8 @@ define @fcmp_ule_vf_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1182,8 +1182,8 @@ define @fcmp_ule_fv_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1323,8 +1323,8 @@ define @fcmp_ogt_vf_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1339,8 +1339,8 @@ define @fcmp_ogt_fv_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1368,8 +1368,8 @@ define @fcmp_oge_vf_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1384,8 +1384,8 @@ define @fcmp_oge_fv_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1413,8 +1413,8 @@ define @fcmp_olt_vf_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1429,8 +1429,8 @@ define @fcmp_olt_fv_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1458,8 +1458,8 @@ define @fcmp_ole_vf_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1474,8 +1474,8 @@ define @fcmp_ole_fv_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, 
ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -1506,8 +1506,8 @@ define @fcmp_one_vf_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -1525,8 +1525,8 @@ define @fcmp_one_fv_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -1602,8 +1602,8 @@ define @fcmp_ueq_vf_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -1621,8 +1621,8 @@ define @fcmp_ueq_fv_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -1654,8 +1654,8 @@ define @fcmp_ugt_vf_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1671,8 +1671,8 @@ define @fcmp_ugt_fv_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1702,8 +1702,8 @@ define @fcmp_uge_vf_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1719,8 +1719,8 @@ define @fcmp_uge_fv_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1750,8 +1750,8 @@ define @fcmp_ult_vf_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: 
vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1767,8 +1767,8 @@ define @fcmp_ult_fv_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1798,8 +1798,8 @@ define @fcmp_ule_vf_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -1815,8 +1815,8 @@ define @fcmp_ule_fv_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -2164,9 +2164,9 @@ define @fcmp_one_vf_nxv8f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -2184,9 +2184,9 @@ define @fcmp_one_fv_nxv8f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t @@ -2263,9 +2263,9 @@ define @fcmp_ueq_vf_nxv8f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -2283,9 +2283,9 @@ define @fcmp_ueq_fv_nxv8f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t @@ -3981,8 +3981,8 @@ define @fcmp_ogt_vf_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -3997,8 +3997,8 @@ define @fcmp_ogt_fv_nxv1f32( 
%va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4026,8 +4026,8 @@ define @fcmp_oge_vf_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4042,8 +4042,8 @@ define @fcmp_oge_fv_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4071,8 +4071,8 @@ define @fcmp_olt_vf_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4087,8 +4087,8 @@ define @fcmp_olt_fv_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4116,8 +4116,8 @@ define @fcmp_ole_vf_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4132,8 +4132,8 @@ define @fcmp_ole_fv_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4164,8 +4164,8 @@ define @fcmp_one_vf_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -4183,8 +4183,8 @@ define @fcmp_one_fv_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -4260,8 +4260,8 @@ define @fcmp_ueq_vf_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; 
CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -4279,8 +4279,8 @@ define @fcmp_ueq_fv_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -4312,8 +4312,8 @@ define @fcmp_ugt_vf_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4329,8 +4329,8 @@ define @fcmp_ugt_fv_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4360,8 +4360,8 @@ define @fcmp_uge_vf_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4377,8 +4377,8 @@ define @fcmp_uge_fv_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4408,8 +4408,8 @@ define @fcmp_ult_vf_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4425,8 +4425,8 @@ define @fcmp_ult_fv_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4456,8 +4456,8 @@ define @fcmp_ule_vf_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4473,8 +4473,8 @@ define @fcmp_ule_fv_nxv1f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4614,8 +4614,8 @@ define @fcmp_ogt_vf_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, 
zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4630,8 +4630,8 @@ define @fcmp_ogt_fv_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4659,8 +4659,8 @@ define @fcmp_oge_vf_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4675,8 +4675,8 @@ define @fcmp_oge_fv_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4704,8 +4704,8 @@ define @fcmp_olt_vf_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4720,8 +4720,8 @@ define @fcmp_olt_fv_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4749,8 +4749,8 @@ define @fcmp_ole_vf_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4765,8 +4765,8 @@ define @fcmp_ole_fv_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -4797,8 +4797,8 @@ define @fcmp_one_vf_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -4816,8 +4816,8 @@ define @fcmp_one_fv_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -4893,8 
+4893,8 @@ define @fcmp_ueq_vf_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -4912,8 +4912,8 @@ define @fcmp_ueq_fv_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -4945,8 +4945,8 @@ define @fcmp_ugt_vf_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4962,8 +4962,8 @@ define @fcmp_ugt_fv_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -4993,8 +4993,8 @@ define @fcmp_uge_vf_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5010,8 +5010,8 @@ define @fcmp_uge_fv_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5041,8 +5041,8 @@ define @fcmp_ult_vf_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5058,8 +5058,8 @@ define @fcmp_ult_fv_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5089,8 +5089,8 @@ define @fcmp_ule_vf_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5106,8 +5106,8 @@ define @fcmp_ule_fv_nxv2f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; 
CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -5455,9 +5455,9 @@ define @fcmp_one_vf_nxv4f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -5475,9 +5475,9 @@ define @fcmp_one_fv_nxv4f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t @@ -5554,9 +5554,9 @@ define @fcmp_ueq_vf_nxv4f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -5574,9 +5574,9 @@ define @fcmp_ueq_fv_nxv4f32( %va, float %b ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t @@ -7272,8 +7272,8 @@ define @fcmp_ogt_vf_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7288,8 +7288,8 @@ define @fcmp_ogt_fv_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7317,8 +7317,8 @@ define @fcmp_oge_vf_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7333,8 +7333,8 @@ define @fcmp_oge_fv_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, 
v0.t ; CHECK-NEXT: ret @@ -7362,8 +7362,8 @@ define @fcmp_olt_vf_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7378,8 +7378,8 @@ define @fcmp_olt_fv_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7407,8 +7407,8 @@ define @fcmp_ole_vf_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7423,8 +7423,8 @@ define @fcmp_ole_fv_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: ret @@ -7455,8 +7455,8 @@ define @fcmp_one_vf_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -7474,8 +7474,8 @@ define @fcmp_one_fv_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -7551,8 +7551,8 @@ define @fcmp_ueq_vf_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t @@ -7570,8 +7570,8 @@ define @fcmp_ueq_fv_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t @@ -7603,8 +7603,8 @@ define @fcmp_ugt_vf_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7620,8 +7620,8 @@ define @fcmp_ugt_fv_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, 
v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7651,8 +7651,8 @@ define @fcmp_uge_vf_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7668,8 +7668,8 @@ define @fcmp_uge_fv_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7699,8 +7699,8 @@ define @fcmp_ult_vf_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7716,8 +7716,8 @@ define @fcmp_ult_fv_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7747,8 +7747,8 @@ define @fcmp_ule_vf_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -7764,8 +7764,8 @@ define @fcmp_ule_fv_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmfeq.vv v10, v8, v8 +; CHECK-NEXT: vmfeq.vf v9, v9, fa0 ; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t ; CHECK-NEXT: vmnot.m v0, v0 @@ -8113,9 +8113,9 @@ define @fcmp_one_vf_nxv2f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -8133,9 +8133,9 @@ define @fcmp_one_fv_nxv2f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t @@ -8212,9 +8212,9 @@ define @fcmp_ueq_vf_nxv2f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; 
CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v12, v13 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t @@ -8232,9 +8232,9 @@ define @fcmp_ueq_fv_nxv2f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfeq.vf v12, v10, fa0 -; CHECK-NEXT: vmfeq.vv v10, v8, v8 -; CHECK-NEXT: vmand.mm v10, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8 +; CHECK-NEXT: vmfeq.vf v13, v10, fa0 +; CHECK-NEXT: vmand.mm v10, v13, v12 ; CHECK-NEXT: vmv1r.v v11, v10 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll index 56284d90a146b..2ca9dd24e915a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll @@ -509,8 +509,8 @@ define @fcmps_uno_vf_nxv1f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -1041,8 +1041,8 @@ define @fcmps_uno_vf_nxv2f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -1573,8 +1573,8 @@ define @fcmps_uno_vf_nxv4f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -2105,10 +2105,10 @@ define @fcmps_uno_vf_nxv8f16( %va, half %b) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfle.vf v12, v10, fa0 -; CHECK-NEXT: vmfle.vv v10, v8, v8 -; CHECK-NEXT: vmnot.m v8, v10 -; CHECK-NEXT: vmorn.mm v0, v8, v12 +; CHECK-NEXT: vmfle.vv v12, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -2637,10 +2637,10 @@ define @fcmps_uno_vf_nxv16f16( %va, half ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfle.vf v16, v12, fa0 -; CHECK-NEXT: vmfle.vv v12, v8, v8 -; CHECK-NEXT: vmnot.m v8, v12 -; CHECK-NEXT: vmorn.mm v0, v8, v16 +; CHECK-NEXT: vmfle.vv v16, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmnot.m v9, v16 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -3169,10 +3169,10 @@ define @fcmps_uno_vf_nxv32f16( %va, half ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vmfle.vf v24, v16, fa0 -; CHECK-NEXT: vmfle.vv v16, v8, v8 -; CHECK-NEXT: vmnot.m v8, v16 -; CHECK-NEXT: vmorn.mm v0, v8, v24 +; 
CHECK-NEXT: vmfle.vv v24, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v16, fa0 +; CHECK-NEXT: vmnot.m v9, v24 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -3701,8 +3701,8 @@ define @fcmps_uno_vf_nxv1f32( %va, float % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -4233,8 +4233,8 @@ define @fcmps_uno_vf_nxv2f32( %va, float % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -4765,10 +4765,10 @@ define @fcmps_uno_vf_nxv4f32( %va, float % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfle.vf v12, v10, fa0 -; CHECK-NEXT: vmfle.vv v10, v8, v8 -; CHECK-NEXT: vmnot.m v8, v10 -; CHECK-NEXT: vmorn.mm v0, v8, v12 +; CHECK-NEXT: vmfle.vv v12, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement poison, float %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -5297,10 +5297,10 @@ define @fcmps_uno_vf_nxv8f32( %va, float % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfle.vf v16, v12, fa0 -; CHECK-NEXT: vmfle.vv v12, v8, v8 -; CHECK-NEXT: vmnot.m v8, v12 -; CHECK-NEXT: vmorn.mm v0, v8, v16 +; CHECK-NEXT: vmfle.vv v16, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmnot.m v9, v16 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement poison, float %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -5829,10 +5829,10 @@ define @fcmps_uno_vf_nxv16f32( %va, floa ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vmfle.vf v24, v16, fa0 -; CHECK-NEXT: vmfle.vv v16, v8, v8 -; CHECK-NEXT: vmnot.m v8, v16 -; CHECK-NEXT: vmorn.mm v0, v8, v24 +; CHECK-NEXT: vmfle.vv v24, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v16, fa0 +; CHECK-NEXT: vmnot.m v9, v24 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement poison, float %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -6361,8 +6361,8 @@ define @fcmps_uno_vf_nxv1f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmorn.mm v0, v8, v9 ; CHECK-NEXT: ret @@ -6893,10 +6893,10 @@ define @fcmps_uno_vf_nxv2f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfle.vf v12, v10, fa0 -; CHECK-NEXT: vmfle.vv v10, v8, v8 -; CHECK-NEXT: vmnot.m v8, v10 -; CHECK-NEXT: vmorn.mm v0, v8, v12 +; CHECK-NEXT: vmfle.vv v12, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement poison, double %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ 
-7425,10 +7425,10 @@ define @fcmps_uno_vf_nxv4f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfle.vf v16, v12, fa0 -; CHECK-NEXT: vmfle.vv v12, v8, v8 -; CHECK-NEXT: vmnot.m v8, v12 -; CHECK-NEXT: vmorn.mm v0, v8, v16 +; CHECK-NEXT: vmfle.vv v16, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmnot.m v9, v16 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement poison, double %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -7957,10 +7957,10 @@ define @fcmps_uno_vf_nxv8f64( %va, double ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vmfle.vf v24, v16, fa0 -; CHECK-NEXT: vmfle.vv v16, v8, v8 -; CHECK-NEXT: vmnot.m v8, v16 -; CHECK-NEXT: vmorn.mm v0, v8, v24 +; CHECK-NEXT: vmfle.vv v24, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v16, fa0 +; CHECK-NEXT: vmnot.m v9, v24 +; CHECK-NEXT: vmorn.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement poison, double %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll index b28981ff196ad..beb56a2645a1c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll @@ -128,12 +128,12 @@ define @vfcopysign_vf_nxv1f16( %vm, half ; ZVFHMIN-LABEL: vfcopysign_vf_nxv1f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 @@ -174,13 +174,13 @@ define @vfcopynsign_vf_nxv1f16( %vm, half ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv1f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: vxor.vx v9, v9, a0 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 @@ -227,9 +227,9 @@ define @vfcopysign_exttrunc_vf_nxv1f16_nxv1f32( @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f32( @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f32( @vfcopysign_exttrunc_vv_nxv1f16_nxv1f64( @vfcopysign_exttrunc_vf_nxv1f16_nxv1f64( @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f64( @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f64( @vfcopysign_vf_nxv2f16( %vm, half ; ZVFHMIN-LABEL: vfcopysign_vf_nxv2f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; 
ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 @@ -502,13 +500,13 @@ define @vfcopynsign_vf_nxv2f16( %vm, half ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv2f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: vxor.vx v9, v9, a0 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 @@ -550,12 +548,12 @@ define @vfcopysign_vf_nxv4f16( %vm, half ; ZVFHMIN-LABEL: vfcopysign_vf_nxv4f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 @@ -596,13 +594,13 @@ define @vfcopynsign_vf_nxv4f16( %vm, half ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv4f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v9, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: vxor.vx v9, v9, a0 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v9, v9, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 @@ -644,12 +642,12 @@ define @vfcopysign_vf_nxv8f16( %vm, half ; ZVFHMIN-LABEL: vfcopysign_vf_nxv8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v10, v10, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v10, v10, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 @@ -690,13 +688,13 @@ define @vfcopynsign_vf_nxv8f16( %vm, half ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: vxor.vx v10, v10, a0 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v10, v10, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v10, 
v10, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 @@ -743,9 +741,9 @@ define @vfcopysign_exttrunc_vf_nxv8f16_nxv8f32( @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f32( @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f32( @vfcopysign_exttrunc_vv_nxv8f16_nxv8f64( @vfcopysign_exttrunc_vf_nxv8f16_nxv8f64( @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f64( @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f64( @vfcopysign_vf_nxv16f16( %vm, ha ; ZVFHMIN-LABEL: vfcopysign_vf_nxv16f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v12, v12, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v12, v12, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 @@ -1018,13 +1014,13 @@ define @vfcopynsign_vf_nxv16f16( %vm, h ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv16f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: vxor.vx v12, v12, a0 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v12, v12, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; ZVFHMIN-NEXT: vxor.vx v12, v12, a1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v12, v12, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 @@ -1066,12 +1062,12 @@ define @vfcopysign_vf_nxv32f16( %vm, ha ; ZVFHMIN-LABEL: vfcopysign_vf_nxv32f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v16, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v16, v16, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v16, v16, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 @@ -1112,13 +1108,13 @@ define @vfcopynsign_vf_nxv32f16( %vm, h ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv32f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v16, a0 -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: vxor.vx v16, v16, a0 -; ZVFHMIN-NEXT: addi a1, a0, -1 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v16, v16, a0 +; ZVFHMIN-NEXT: addi a0, a1, -1 +; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: vand.vx v16, v16, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %s, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll index ab517de846b0f..07750623dd44b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll @@ -209,19 +209,34 @@ define @vfdiv_vf_nxv16bf16( %va, bf define 
@vfdiv_vv_nxv32bf16( %va, %vb) strictfp { ; CHECK-LABEL: vfdiv_vv_nxv32bf16: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v24, v0, v24 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v0, v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfdiv.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret entry: %vc = call @llvm.experimental.constrained.fdiv.nxv32bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") @@ -231,22 +246,39 @@ entry: define @vfdiv_vf_nxv32bf16( %va, bfloat %b) strictfp { ; CHECK-LABEL: vfdiv_vf_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v24, v24, v0 +; CHECK-NEXT: vfdiv.vv v0, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v16, v24 +; CHECK-NEXT: vfdiv.vv v16, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = 
shufflevector %head, poison, zeroinitializer @@ -529,19 +561,34 @@ define @vfdiv_vv_nxv32f16( %va, @llvm.experimental.constrained.fdiv.nxv32f16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") @@ -557,22 +604,39 @@ define @vfdiv_vf_nxv32f16( %va, half %b ; ; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v24, v24, v0 +; ZVFHMIN-NEXT: vfdiv.vv v0, v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24 +; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll index 8d0c3bcf16756..b4a9b1fe3fcf8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll @@ -200,19 +200,34 @@ define @vfdiv_vf_nxv16bf16( %va, bf define @vfdiv_vv_nxv32bf16( %va, %vb) { ; CHECK-LABEL: vfdiv_vv_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v24, v0, v24 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v0, v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma 
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfdiv.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vc = fdiv %va, %vb ret %vc @@ -221,22 +236,39 @@ define @vfdiv_vv_nxv32bf16( %va, @vfdiv_vf_nxv32bf16( %va, bfloat %b) { ; CHECK-LABEL: vfdiv_vf_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v24, v24, v0 +; CHECK-NEXT: vfdiv.vv v0, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v24, v0 +; CHECK-NEXT: vfdiv.vv v16, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -508,19 +540,34 @@ define @vfdiv_vv_nxv32f16( %va, %va, %vb ret %vc @@ -535,22 +582,39 @@ define @vfdiv_vf_nxv32f16( %va, half %b ; ; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, 
(a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v24, v24, v0 +; ZVFHMIN-NEXT: vfdiv.vv v0, v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v20 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v0 +; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll index 28e71e845a409..ccd286b7ee5fd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll @@ -375,20 +375,21 @@ define @vfdiv_vv_nxv32bf16( %va, @vfdiv_vv_nxv32bf16_unmasked( @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb -; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 +; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a1 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 3 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: slli a3, a1, 3 +; CHECK-NEXT: add a1, a3, a1 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: csrr a4, vlenb ; CHECK-NEXT: slli a4, a4, 3 ; CHECK-NEXT: add a4, sp, a4 @@ -502,18 +505,18 @@ define @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a4, a2, 3 -; CHECK-NEXT: add a2, a4, a2 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a4, a3, 3 +; CHECK-NEXT: add a3, a4, a3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, 
ma ; CHECK-NEXT: vfdiv.vv v16, v8, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 @@ -565,31 +568,37 @@ define @vfdiv_vf_nxv32bf16_unmasked( @vfdiv_vv_nxv32f16( %va, @vfdiv_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 @@ -1225,23 +1236,24 @@ define @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb -; ZVFHMIN-NEXT: vmv8r.v v24, v8 +; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 3 -; ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: slli a3, a1, 3 +; ZVFHMIN-NEXT: add a1, a3, a1 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 @@ -1249,18 +1261,18 @@ define @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: 
slli a4, a2, 3 -; ZVFHMIN-NEXT: add a2, a4, a2 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a4, a3, 3 +; ZVFHMIN-NEXT: add a3, a4, a3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 @@ -1318,31 +1330,37 @@ define @vfdiv_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vmv4r.v v16, v8 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfdiv.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll b/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll index 3107d4e044cae..abfb652f2206e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll @@ -6,35 +6,35 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 signext %len, i32 signext %n) { ; CHECK-LABEL: compare_bytes_simple: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addiw a4, a2, 1 -; CHECK-NEXT: bltu a3, a4, .LBB0_7 +; CHECK-NEXT: addiw a5, a2, 1 +; CHECK-NEXT: bltu a3, a5, .LBB0_7 ; CHECK-NEXT: # %bb.1: # %mismatch_mem_check -; CHECK-NEXT: slli a2, a4, 32 +; CHECK-NEXT: slli a2, a5, 32 +; CHECK-NEXT: slli a4, a3, 32 ; CHECK-NEXT: srli a2, a2, 32 -; CHECK-NEXT: slli a5, a3, 32 -; CHECK-NEXT: srli a5, a5, 32 +; CHECK-NEXT: srli a4, a4, 32 ; CHECK-NEXT: add a6, a0, a2 -; CHECK-NEXT: add a7, a0, a5 +; CHECK-NEXT: add a7, a0, a4 ; CHECK-NEXT: srli a6, a6, 12 ; CHECK-NEXT: srli a7, a7, 12 ; CHECK-NEXT: bne a6, a7, .LBB0_7 ; CHECK-NEXT: # %bb.2: # %mismatch_mem_check ; CHECK-NEXT: add a6, a1, a2 -; CHECK-NEXT: add a7, a1, a5 +; CHECK-NEXT: add a7, a1, a4 ; CHECK-NEXT: srli a6, a6, 12 ; CHECK-NEXT: srli a7, a7, 12 ; CHECK-NEXT: bne a6, a7, .LBB0_7 ; CHECK-NEXT: .LBB0_3: # %mismatch_vec_loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: sub a4, a5, a2 -; CHECK-NEXT: vsetvli a4, a4, e8, m2, ta, ma +; CHECK-NEXT: sub a5, a4, a2 ; CHECK-NEXT: add a6, a0, a2 +; CHECK-NEXT: add a7, a1, a2 +; CHECK-NEXT: vsetvli a5, a5, e8, m2, ta, ma ; CHECK-NEXT: vle8.v v8, (a6) -; CHECK-NEXT: add a6, a1, a2 -; CHECK-NEXT: vle8.v v10, (a6) +; CHECK-NEXT: vle8.v v10, (a7) ; CHECK-NEXT: vmsne.vv v12, v8, v10 ; CHECK-NEXT: vfirst.m a7, v12 -; CHECK-NEXT: mv a6, a4 +; CHECK-NEXT: mv a6, a5 ; CHECK-NEXT: bltz a7, .LBB0_5 ; CHECK-NEXT: # %bb.4: # %mismatch_vec_loop ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 @@ -42,30 +42,30 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 signext %len, i32 signext % ; CHECK-NEXT: .LBB0_5: # %mismatch_vec_loop ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: sext.w a7, a6 -; CHECK-NEXT: bne a7, a4, .LBB0_11 +; CHECK-NEXT: bne a7, a5, .LBB0_11 ; CHECK-NEXT: # %bb.6: # %mismatch_vec_loop_inc ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: bne a2, a5, .LBB0_3 +; CHECK-NEXT: add a2, a2, a5 +; CHECK-NEXT: bne a2, a4, .LBB0_3 ; CHECK-NEXT: j .LBB0_9 ; CHECK-NEXT: .LBB0_7: # %mismatch_loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: slli a2, a4, 32 +; CHECK-NEXT: slli a2, a5, 32 ; CHECK-NEXT: srli a2, a2, 32 -; CHECK-NEXT: add a5, a0, a2 -; CHECK-NEXT: lbu a5, 0(a5) +; CHECK-NEXT: add a4, a0, a2 ; CHECK-NEXT: add a2, a1, a2 +; CHECK-NEXT: lbu a4, 0(a4) ; CHECK-NEXT: lbu a2, 0(a2) -; CHECK-NEXT: bne a5, a2, .LBB0_10 +; CHECK-NEXT: bne a4, a2, .LBB0_10 ; CHECK-NEXT: # %bb.8: # %mismatch_loop_inc ; CHECK-NEXT: # in Loop: Header=BB0_7 Depth=1 -; CHECK-NEXT: addiw a4, a4, 1 -; CHECK-NEXT: bne a3, a4, .LBB0_7 +; CHECK-NEXT: addiw a5, a5, 1 +; CHECK-NEXT: bne a3, a5, .LBB0_7 ; CHECK-NEXT: .LBB0_9: # %while.end ; CHECK-NEXT: mv a0, a3 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_10: -; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: mv a0, a5 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_11: # %mismatch_vec_loop_found ; CHECK-NEXT: slli a6, a6, 32 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll index 83f59f973d465..fd518d9be786d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -51,12 +51,12 @@ define @vfma_vf_nxv1bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; 
CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfmadd.vv v12, v9, v11, v0.t +; CHECK-NEXT: vfmadd.vv v12, v11, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret @@ -71,14 +71,14 @@ define @vfma_vf_nxv1bf16_commute( %va ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfmadd.vv v9, v8, v11, v0.t +; CHECK-NEXT: vfmadd.vv v11, v8, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v11 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -91,12 +91,12 @@ define @vfma_vf_nxv1bf16_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfmadd.vv v12, v9, v11 +; CHECK-NEXT: vfmadd.vv v12, v11, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret @@ -111,12 +111,12 @@ define @vfma_vf_nxv1bf16_unmasked_commute( @vfma_vf_nxv2bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfmadd.vv v12, v9, v11, v0.t +; CHECK-NEXT: vfmadd.vv v12, v11, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret @@ -185,14 +185,14 @@ define @vfma_vf_nxv2bf16_commute( %va ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfmadd.vv v9, v8, v11, v0.t +; CHECK-NEXT: vfmadd.vv v11, v8, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v11 ; CHECK-NEXT: ret %elt.head = insertelement poison, 
bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -205,12 +205,12 @@ define @vfma_vf_nxv2bf16_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfmadd.vv v12, v9, v11 +; CHECK-NEXT: vfmadd.vv v12, v11, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret @@ -225,12 +225,12 @@ define @vfma_vf_nxv2bf16_unmasked_commute( @vfma_vf_nxv4bf16( %va, bfloat ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v14, v12, v0.t +; CHECK-NEXT: vfmadd.vv v14, v12, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -299,14 +299,14 @@ define @vfma_vf_nxv4bf16_commute( %va ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v14, v8, v12, v0.t +; CHECK-NEXT: vfmadd.vv v12, v14, v10, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -319,14 +319,14 @@ define @vfma_vf_nxv4bf16_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v14, v12 +; CHECK-NEXT: vfmadd.vv v14, v12, v10 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -339,14 +339,14 @@ define @vfma_vf_nxv4bf16_unmasked_commute( poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -393,14 +393,14 @@ define @vfma_vf_nxv8bf16( %va, bfloat ; CHECK: # %bb.0: ; 
CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v12, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vfmadd.vv v24, v20, v16, v0.t +; CHECK-NEXT: vfmadd.vv v20, v16, v12, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -413,14 +413,14 @@ define @vfma_vf_nxv8bf16_commute( %va ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v12, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vfmadd.vv v20, v8, v16, v0.t +; CHECK-NEXT: vfmadd.vv v16, v20, v12, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -433,14 +433,14 @@ define @vfma_vf_nxv8bf16_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.v.x v12, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vfmadd.vv v24, v20, v16 +; CHECK-NEXT: vfmadd.vv v20, v16, v12 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -453,14 +453,14 @@ define @vfma_vf_nxv8bf16_unmasked_commute( poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -528,12 +528,13 @@ define @vfma_vf_nxv16bf16( %va, bfl ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.x v4, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv.v.x v12, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t @@ -557,14 +558,14 @@ define @vfma_vf_nxv16bf16_commute( ; CHECK: # %bb.0: ; CHECK-NEXT: 
fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vmv.v.x v4, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v8, v24, v0.t +; CHECK-NEXT: vfmadd.vv v24, v8, v16, v0.t ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -575,31 +576,16 @@ define @vfma_vf_nxv16bf16_commute( define @vfma_vf_nxv16bf16_unmasked( %va, bfloat %b, %vc, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vf_nxv16bf16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.x v16, a1 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v0, v24 +; CHECK-NEXT: vfmadd.vv v0, v24, v16 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -610,31 +596,16 @@ define @vfma_vf_nxv16bf16_unmasked( define @vfma_vf_nxv16bf16_unmasked_commute( %va, bfloat %b, %vc, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vf_nxv16bf16_unmasked_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.x v16, a1 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v0, v24 +; CHECK-NEXT: vfmadd.vv v0, v24, v16 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: csrr a0, 
vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement poison, bfloat %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -661,11 +632,8 @@ define @vfma_vv_nxv32bf16( %va, @vfma_vv_nxv32bf16( %va, @vfma_vv_nxv32bf16_unmasked( ; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: vmv8r.v v24, v16 -; CHECK-NEXT: vl8re16.v v16, (a0) +; CHECK-NEXT: vl8re16.v v24, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a2, a0 @@ -815,45 +785,47 @@ define @vfma_vv_nxv32bf16_unmasked( ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v7 ; CHECK-NEXT: slli a0, a2, 1 -; CHECK-NEXT: sub a3, a1, a0 -; CHECK-NEXT: sltu a4, a1, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v7 +; CHECK-NEXT: sub a3, a1, a0 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v7, a2 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: sltu a2, a1, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vmv8r.v v24, v16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: mv a4, a3 +; CHECK-NEXT: slli a3, a3, 1 +; CHECK-NEXT: add a3, a3, a4 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; 
CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 @@ -914,58 +886,55 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: fmv.x.h a2, fa0 +; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 5 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: slli a4, a1, 5 +; CHECK-NEXT: add a1, a4, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 -; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: slli a1, a3, 1 +; CHECK-NEXT: srli a3, a3, 2 +; CHECK-NEXT: sub a4, a0, a1 +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a5, a5, 4 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vs1r.v v0, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a3 +; CHECK-NEXT: sltu a3, a0, a4 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a4 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a4, a4, a2 -; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a4, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v24, a2 +; CHECK-NEXT: vmv4r.v v8, v24 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a4, a2, 4 ; CHECK-NEXT: add a2, a4, a2 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a4, a2, 5 -; CHECK-NEXT: add a2, a4, a2 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v 
v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 @@ -983,11 +952,8 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -998,13 +964,6 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 4 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a2, a2, a1 @@ -1012,12 +971,19 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a2, a1, 5 ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 4 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0 ; CHECK-NEXT: csrr a1, vlenb @@ -1031,11 +997,8 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -1076,58 +1039,55 @@ define @vfma_vf_nxv32bf16_commute( ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: fmv.x.h a2, fa0 +; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 5 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: slli a4, a1, 5 +; CHECK-NEXT: add a1, a4, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, 
-1 -; CHECK-NEXT: and a3, a4, a3 -; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: slli a1, a3, 1 +; CHECK-NEXT: srli a3, a3, 2 +; CHECK-NEXT: sub a4, a0, a1 +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a5, a5, 4 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vs1r.v v0, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a3 +; CHECK-NEXT: sltu a3, a0, a4 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a4 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a4, a4, a2 -; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a4, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v24, a2 +; CHECK-NEXT: vmv4r.v v8, v24 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a4, a2, 4 ; CHECK-NEXT: add a2, a4, a2 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a4, a2, 5 -; CHECK-NEXT: add a2, a4, a2 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 @@ -1145,11 +1105,8 @@ define @vfma_vf_nxv32bf16_commute( ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB33_2: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -1160,25 +1117,25 @@ define @vfma_vf_nxv32bf16_commute( ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 4 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: slli a2, a1, 4 ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v 
v0, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -1193,11 +1150,8 @@ define @vfma_vf_nxv32bf16_commute( ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -1234,50 +1188,46 @@ define @vfma_vf_nxv32bf16_unmasked( ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: fmv.x.h a2, fa0 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v7 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: mv a4, a1 ; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, a1, a4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 -; CHECK-NEXT: srli a2, a2, 2 -; CHECK-NEXT: vmset.m v7 -; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v7, a2 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: slli a1, a3, 1 +; CHECK-NEXT: srli a3, a3, 2 +; CHECK-NEXT: sub a4, a0, a1 +; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a3 +; CHECK-NEXT: sltu a3, a0, a4 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a4 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a4, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v24, a2 +; CHECK-NEXT: vmv4r.v v8, v24 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: mv a4, a2 -; 
CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 ; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload @@ -1290,7 +1240,10 @@ define @vfma_vf_nxv32bf16_unmasked( ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -1298,16 +1251,13 @@ define @vfma_vf_nxv32bf16_unmasked( ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -1341,50 +1291,46 @@ define @vfma_vf_nxv32bf16_unmasked_commute( @vfma_vf_nxv32bf16_unmasked_commute( @vfma_vf_nxv32bf16_unmasked_commute( @vfma_vf_nxv1f16( %va, half %b, @vfma_vf_nxv1f16_commute( %va, hal ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v11, v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1548,12 +1494,12 @@ define @vfma_vf_nxv1f16_unmasked( %va, ha ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -1574,12 +1520,12 @@ define @vfma_vf_nxv1f16_unmasked_commute( ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -1647,12 +1593,12 @@ define @vfma_vf_nxv2f16( %va, half %b, @vfma_vf_nxv2f16_commute( %va, hal ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v11, v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1699,12 +1645,12 @@ define @vfma_vf_nxv2f16_unmasked( %va, ha ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -1725,12 +1671,12 @@ define @vfma_vf_nxv2f16_unmasked_commute( ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -1798,14 +1744,14 @@ define @vfma_vf_nxv4f16( %va, half %b, poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1824,14 +1770,14 @@ define @vfma_vf_nxv4f16_commute( %va, hal ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v14, v8, v12, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v14, v10, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = 
shufflevector %elt.head, poison, zeroinitializer @@ -1850,14 +1796,14 @@ define @vfma_vf_nxv4f16_unmasked( %va, ha ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v14, v12, v10 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1876,14 +1822,14 @@ define @vfma_vf_nxv4f16_unmasked_commute( ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12 +; ZVFHMIN-NEXT: vfmadd.vv v14, v12, v10 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1949,14 +1895,14 @@ define @vfma_vf_nxv8f16( %va, half %b, poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1975,14 +1921,14 @@ define @vfma_vf_nxv8f16_commute( %va, hal ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v20, v8, v16, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v16, v20, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2001,14 +1947,14 @@ define @vfma_vf_nxv8f16_unmasked( %va, ha ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16 +; ZVFHMIN-NEXT: vfmadd.vv v20, v16, v12 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %elt.head = 
insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2027,14 +1973,14 @@ define @vfma_vf_nxv8f16_unmasked_commute( ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v12, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16 +; ZVFHMIN-NEXT: vfmadd.vv v20, v16, v12 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2121,12 +2067,13 @@ define @vfma_vf_nxv16f16( %va, half %b, ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t @@ -2156,14 +2103,14 @@ define @vfma_vf_nxv16f16_commute( %va, ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: vmv.v.x v4, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2180,31 +2127,16 @@ define @vfma_vf_nxv16f16_unmasked( %va, ; ; ZVFHMIN-LABEL: vfma_vf_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 2 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a1 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v0, v24 +; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 2 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2221,31 +2153,16 @@ define @vfma_vf_nxv16f16_unmasked_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2280,11 +2197,8 @@ define @vfma_vv_nxv32f16( %va, @vfma_vv_nxv32f16( %va, @vfma_vv_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: slli a2, a2, 5 ; ZVFHMIN-NEXT: sub sp, sp, a2 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vmv8r.v v24, v16 -; ZVFHMIN-NEXT: vl8re16.v v16, (a0) +; ZVFHMIN-NEXT: vl8re16.v v24, (a0) ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a2, a0 @@ -2441,45 +2357,47 @@ define @vfma_vv_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: add a0, a0, a2 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 ; ZVFHMIN-NEXT: slli a0, a2, 1 -; ZVFHMIN-NEXT: sub a3, a1, a0 -; ZVFHMIN-NEXT: sltu a4, a1, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: sub a3, a1, a0 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: sltu a2, a1, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv4r.v v8, v16 +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add 
a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 @@ -2546,58 +2464,55 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 5 -; ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: slli a4, a1, 5 +; ZVFHMIN-NEXT: add a1, a4, a1 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a0, a1 +; ZVFHMIN-NEXT: csrr a5, vlenb +; ZVFHMIN-NEXT: slli a5, a5, 4 +; ZVFHMIN-NEXT: add a5, sp, a5 +; ZVFHMIN-NEXT: addi a5, a5, 16 +; ZVFHMIN-NEXT: vs1r.v v0, (a5) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a4 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a5, a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a4, a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; 
ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a2 +; ZVFHMIN-NEXT: vmv4r.v v8, v24 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a4, a2, 4 ; ZVFHMIN-NEXT: add a2, a4, a2 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 5 -; ZVFHMIN-NEXT: add a2, a4, a2 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 @@ -2615,11 +2530,8 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB68_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a2, a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a2, a1, 5 +; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -2630,13 +2542,6 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 4 -; ZVFHMIN-NEXT: add a1, a2, a1 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: mv a2, a1 ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a2, a2, a1 @@ -2644,12 +2549,19 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a2, a1, 5 ; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a2, a1, 4 +; ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb @@ -2663,11 +2575,8 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a2, a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a2, a1, 5 +; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -2714,58 +2623,55 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 
0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 5 -; ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: slli a4, a1, 5 +; ZVFHMIN-NEXT: add a1, a4, a1 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a0, a1 +; ZVFHMIN-NEXT: csrr a5, vlenb +; ZVFHMIN-NEXT: slli a5, a5, 4 +; ZVFHMIN-NEXT: add a5, sp, a5 +; ZVFHMIN-NEXT: addi a5, a5, 16 +; ZVFHMIN-NEXT: vs1r.v v0, (a5) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a4 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a5, a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a4, a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a2 +; ZVFHMIN-NEXT: vmv4r.v v8, v24 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a4, a2, 4 ; ZVFHMIN-NEXT: add a2, a4, a2 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 5 -; ZVFHMIN-NEXT: add a2, a4, a2 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 @@ -2783,11 +2689,8 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: mv a0, a1 ; 
ZVFHMIN-NEXT: .LBB69_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a2, a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a2, a1, 5 +; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -2798,25 +2701,25 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 4 -; ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a2, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 5 +; ZVFHMIN-NEXT: slli a2, a1, 4 ; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a2, a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a2, a1, 5 +; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -2831,11 +2734,8 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a2, a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a2, a1, 5 +; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -2878,50 +2778,46 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: mv a4, a1 ; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: add a1, a1, a4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vmset.m v7 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded 
Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a0, a1 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a4 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a2 +; ZVFHMIN-NEXT: vmv4r.v v8, v24 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload @@ -2934,7 +2830,10 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB70_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -2942,16 +2841,13 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -2991,50 +2887,46 @@ define @vfma_vf_nxv32f16_unmasked_commute( @vfma_vf_nxv32f16_unmasked_commute( @vfma_vf_nxv32f16_unmasked_commute( @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64_unmasked( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a3, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 
0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a3, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a5, a2, a3 -; CHECK-NEXT: vl8re64.v v24, (a5) +; CHECK-NEXT: vl8re64.v v8, (a5) +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a5, a5, 3 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: sub a5, a4, a1 ; CHECK-NEXT: add a3, a0, a3 -; CHECK-NEXT: vl8re64.v v16, (a3) -; CHECK-NEXT: sub a3, a4, a1 -; CHECK-NEXT: sltu a5, a4, a3 +; CHECK-NEXT: vl8re64.v v24, (a3) +; CHECK-NEXT: sltu a3, a4, a5 ; CHECK-NEXT: vl8re64.v v8, (a2) ; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vl8re64.v v0, (a0) -; CHECK-NEXT: addi a5, a5, -1 -; CHECK-NEXT: and a3, a5, a3 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a5 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v8, v24 +; CHECK-NEXT: vfmadd.vv v24, v16, v8 ; CHECK-NEXT: bltu a4, a1, .LBB129_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a1 ; CHECK-NEXT: .LBB129_2: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v24, v8 +; CHECK-NEXT: vfmadd.vv v0, v16, v8 ; CHECK-NEXT: vmv.v.v v8, v0 +; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -3960,14 +3878,15 @@ define @vfmsub_vv_nxv1f16( %va, @vfmsub_vv_nxv1f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vv_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 +; ZVFHMIN-NEXT: vxor.vx v8, v10, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; 
ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v10, v11 +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -4015,14 +3935,14 @@ define @vfmsub_vf_nxv1f16( %va, half %b, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v9, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -4046,16 +3966,16 @@ define @vfmsub_vf_nxv1f16_commute( %va, h ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v11, v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4077,14 +3997,14 @@ define @vfmsub_vf_nxv1f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vxor.vx v8, v9, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v9 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -4108,14 +4028,14 @@ define @vfmsub_vf_nxv1f16_unmasked_commute( @vfnmadd_vf_nxv1f16_neg_splat_unmasked( @vfnmadd_vf_nxv1f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv1f16( %va, half %b, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: 
vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4660,16 +4580,16 @@ define @vfnmsub_vf_nxv1f16_commute( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v11, v9, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4691,16 +4611,16 @@ define @vfnmsub_vf_nxv1f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8 +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4722,16 +4642,16 @@ define @vfnmsub_vf_nxv1f16_unmasked_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4753,16 +4673,16 @@ define @vfnmsub_vf_nxv1f16_neg_splat( %va ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v10, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v11, v9, v10, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v10, v9, v11, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4784,14 +4704,14 @@ define @vfnmsub_vf_nxv1f16_neg_splat_commute( @vfnmsub_vf_nxv1f16_neg_splat_unmasked( @vfnmsub_vf_nxv1f16_neg_splat_unmasked_commute( @vfmsub_vv_nxv2f16( %va, @vfmsub_vv_nxv2f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vv_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 +; 
ZVFHMIN-NEXT: vxor.vx v8, v10, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v10, v11 +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v10 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -4932,14 +4854,14 @@ define @vfmsub_vf_nxv2f16( %va, half %b, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v9, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -4963,16 +4885,16 @@ define @vfmsub_vf_nxv2f16_commute( %va, h ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v11, v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -4994,14 +4916,14 @@ define @vfmsub_vf_nxv2f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vxor.vx v8, v9, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 +; ZVFHMIN-NEXT: vfmadd.vv v12, v11, v9 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret @@ -5025,14 +4947,14 @@ define @vfmsub_vf_nxv2f16_unmasked_commute( @vfnmadd_vf_nxv2f16_neg_splat_unmasked( @vfnmadd_vf_nxv2f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv2f16( %va, half %b, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma 
-; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5577,16 +5499,16 @@ define @vfnmsub_vf_nxv2f16_commute( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v11, v9, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5608,16 +5530,16 @@ define @vfnmsub_vf_nxv2f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8 +; ZVFHMIN-NEXT: vfmadd.vv v12, v9, v11 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5639,16 +5561,16 @@ define @vfnmsub_vf_nxv2f16_unmasked_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5670,16 +5592,16 @@ define @vfnmsub_vf_nxv2f16_neg_splat( %va ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v10, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v11, v9, v10, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v10, v9, v11, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v11 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5701,14 +5623,14 @@ define @vfnmsub_vf_nxv2f16_neg_splat_commute( @vfnmsub_vf_nxv2f16_neg_splat_unmasked( @vfnmsub_vf_nxv2f16_neg_splat_unmasked_commute( @vfmsub_vv_nxv4f16( %va, @vfmsub_vv_nxv4f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vv_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: lui a1, 8 
+; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 +; ZVFHMIN-NEXT: vxor.vx v8, v10, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v14, v10, v12 +; ZVFHMIN-NEXT: vfmadd.vv v14, v12, v10 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret @@ -5849,14 +5773,14 @@ define @vfmsub_vf_nxv4f16( %va, half %b, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v16, v12, v14, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret @@ -5880,16 +5804,16 @@ define @vfmsub_vf_nxv4f16_commute( %va, h ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v9, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v14, v8, v12, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v8, v14, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -5911,14 +5835,14 @@ define @vfmsub_vf_nxv4f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vxor.vx v8, v9, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12 +; ZVFHMIN-NEXT: vfmadd.vv v16, v12, v14 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret @@ -5942,14 +5866,14 @@ define @vfmsub_vf_nxv4f16_unmasked_commute( @vfnmadd_vf_nxv4f16_neg_splat_unmasked( @vfnmadd_vf_nxv4f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv4f16( %va, half %b, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v12, v14, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret @@ -6494,16 +6418,16 @@ define @vfnmsub_vf_nxv4f16_commute( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v8, v14, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v14, v8, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6525,14 +6449,14 @@ define @vfnmsub_vf_nxv4f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v12, v14 +; ZVFHMIN-NEXT: vfmadd.vv v16, v14, v12 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret @@ -6556,14 +6480,14 @@ define @vfnmsub_vf_nxv4f16_unmasked_commute( @vfnmsub_vf_nxv4f16_neg_splat( %va ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v9, v10, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v14, v10, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v10, v14, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6618,14 +6542,14 @@ define @vfnmsub_vf_nxv4f16_neg_splat_commute( @vfnmsub_vf_nxv4f16_neg_splat_unmasked( @vfnmsub_vf_nxv4f16_neg_splat_unmasked_commute( @vfmsub_vv_nxv8f16( %va, @vfmsub_vv_nxv8f16_unmasked( %va, ; ZVFHMIN-LABEL: vfmsub_vv_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v12, v12, a1 +; ZVFHMIN-NEXT: vxor.vx v8, v12, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v20, v12, v16 +; ZVFHMIN-NEXT: vfmadd.vv v20, v16, v12 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret @@ -6766,14 +6692,14 @@ define @vfmsub_vf_nxv8f16( %va, half %b, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v10, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v20, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret @@ -6797,16 +6723,16 @@ define @vfmsub_vf_nxv8f16_commute( %va, h ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v10, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v20, v8, v16, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v20, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -6828,14 +6754,14 @@ define @vfmsub_vf_nxv8f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 +; ZVFHMIN-NEXT: vxor.vx v8, v10, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16 +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v20 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret @@ -6859,14 +6785,14 @@ define @vfmsub_vf_nxv8f16_unmasked_commute( @vfnmadd_vf_nxv8f16_neg_splat_unmasked( @vfnmadd_vf_nxv8f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv8f16( %va, half %b, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v20, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: 
vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret @@ -7411,16 +7337,16 @@ define @vfnmsub_vf_nxv8f16_commute( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v20, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v20, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7442,14 +7368,14 @@ define @vfnmsub_vf_nxv8f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v20 +; ZVFHMIN-NEXT: vfmadd.vv v24, v20, v16 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: ret @@ -7473,14 +7399,14 @@ define @vfnmsub_vf_nxv8f16_unmasked_commute( @vfnmsub_vf_nxv8f16_neg_splat( %va ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v12, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vxor.vx v12, v12, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v10, v12, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v20, v12, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v12, v20, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7535,14 +7461,14 @@ define @vfnmsub_vf_nxv8f16_neg_splat_commute( @vfnmsub_vf_nxv8f16_neg_splat_unmasked( @vfnmsub_vf_nxv8f16_neg_splat_unmasked_commute( @vfmsub_vv_nxv16f16( %va, @llvm.vp.fneg.nxv16f16( %c, %m, i32 %evl) %v = call @llvm.vp.fma.nxv16f16( %va, %b, %negc, %m, i32 %evl) @@ -7656,14 +7597,15 @@ define @vfmsub_vv_nxv16f16_unmasked( %v ; ZVFHMIN-LABEL: vfmsub_vv_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 +; ZVFHMIN-NEXT: vxor.vx v8, v16, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 +; ZVFHMIN-NEXT: 
vfmadd.vv v0, v24, v16 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: ret @@ -7681,21 +7623,36 @@ define @vfmsub_vf_nxv16f16( %va, half % ; ; ZVFHMIN-LABEL: vfmsub_vf_nxv16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vmv4r.v v16, v8 +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v12, v12, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v12, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -7717,11 +7674,11 @@ define @vfmsub_vf_nxv16f16_commute( %va ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v12, v12, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v12, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t @@ -7756,16 +7713,16 @@ define @vfmsub_vf_nxv16f16_unmasked( %v ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v12, v12, a1 +; ZVFHMIN-NEXT: vxor.vx v8, v12, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v0, v24 +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb @@ -7803,16 +7760,16 @@ define @vfmsub_vf_nxv16f16_unmasked_commute( @vfnmadd_vf_nxv16f16_neg_splat_unmasked( @vfnmadd_vf_nxv16f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv16f16( %va, half ; ; ZVFHMIN-LABEL: vfnmsub_vf_nxv16f16: ; ZVFHMIN: # %bb.0: 
-; ZVFHMIN-NEXT: vmv4r.v v16, v12 +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -8460,16 +8432,16 @@ define @vfnmsub_vf_nxv16f16_commute( %v ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v4, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -8499,16 +8471,16 @@ define @vfnmsub_vf_nxv16f16_unmasked( % ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0 +; ZVFHMIN-NEXT: vfmadd.vv v16, v0, v24 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb @@ -8546,16 +8518,16 @@ define @vfnmsub_vf_nxv16f16_unmasked_commute( @vfnmsub_vf_nxv16f16_neg_splat( ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, 
a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v12, v16, a1, v0.t ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -8613,22 +8585,36 @@ define @vfnmsub_vf_nxv16f16_neg_splat_commute( poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -8650,14 +8636,14 @@ define @vfnmsub_vf_nxv16f16_neg_splat_unmasked( @vfnmsub_vf_nxv16f16_neg_splat_unmasked_commute( @vfmsub_vv_nxv32f16( %va, @vfmsub_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: sub sp, sp, a2 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vl8re16.v v24, (a0) -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v0, v24, a0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a2, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a2 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a0, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a1, a0 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v20, v7, a3 +; ZVFHMIN-NEXT: sltu a3, a1, a4 +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v0, v24, a2 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a0, a2, 1 -; ZVFHMIN-NEXT: sub a3, a1, a0 -; ZVFHMIN-NEXT: sltu a4, a1, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 @@ -8921,68 +8898,66 @@ define @vfmsub_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: slli a2, a2, 5 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv8r.v v8, v16 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vmv1r.v v0, v20 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a1, a0, .LBB281_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: .LBB281_2: ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a2 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16 +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v0, v24 -; ZVFHMIN-NEXT: vmv8r.v v8, v0 +; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 +; ZVFHMIN-NEXT: vmv8r.v v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -9010,44 +8985,56 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; 
ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: slli a1, a1, 2 ; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1, v0.t +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a3, 8 ; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a2 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v24, v16, a3, v0.t ; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: mv a3, a0 +; ZVFHMIN-NEXT: vmv4r.v v4, v28 +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a0, a2, .LBB282_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 ; ZVFHMIN-NEXT: .LBB282_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vmv4r.v v4, v12 +; ZVFHMIN-NEXT: vmv8r.v v16, v8 +; ZVFHMIN-NEXT: addi a4, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: mv a5, a4 @@ -9055,43 +9042,50 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: vl8r.v v24, (a4) # 
Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: slli a3, a3, 3 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: sub a2, a0, a2 +; ZVFHMIN-NEXT: srli a1, a1, 2 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 5 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: sltu a0, a0, a2 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: addi a0, a0, -1 ; ZVFHMIN-NEXT: and a0, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 @@ -9100,26 +9094,26 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli 
a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -9147,35 +9141,35 @@ define @vfmsub_vf_nxv32f16_commute( %va ; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a3, 8 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a2 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1, v0.t -; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: vxor.vx v16, v16, a3, v0.t ; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: mv a3, a0 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a0, a2, .LBB283_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 @@ -9192,7 +9186,7 @@ define @vfmsub_vf_nxv32f16_commute( %va ; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload @@ -9201,47 +9195,46 @@ define @vfmsub_vf_nxv32f16_commute( %va ; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; 
ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: sub a2, a0, a2 +; ZVFHMIN-NEXT: srli a1, a1, 2 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 3 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sub a2, a0, a2 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: sltu a0, a0, a2 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: addi a0, a0, -1 ; ZVFHMIN-NEXT: and a0, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -9278,49 +9271,52 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: slli a1, a1, 2 ; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 
# sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 +; ZVFHMIN-NEXT: vxor.vx v16, v24, a1 ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a0, a1 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a4 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a2 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vmv4r.v v8, v24 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: mv a4, a2 @@ -9328,9 +9324,14 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 @@ -9338,12 +9339,12 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8 +; ZVFHMIN-NEXT: vfncvt.f.f.w v28, v8 ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli 
a2, a2, 3 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -9352,11 +9353,11 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB284_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 +; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 @@ -9364,20 +9365,23 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v0 +; ZVFHMIN-NEXT: vfmadd.vv v0, v8, v16 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 -; ZVFHMIN-NEXT: vmv8r.v v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v0 +; ZVFHMIN-NEXT: vmv8r.v v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -9402,88 +9406,91 @@ define @vfmsub_vf_nxv32f16_unmasked_commute( @vfmsub_vf_nxv32f16_unmasked_commute( @vfnmadd_vv_nxv32f16( %va, @vfnmadd_vv_nxv32f16( %va, @vfnmadd_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: sub sp, sp, a2 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vl8re16.v v24, (a0) +; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: mv a3, a1 ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v16, a0, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v24, a0, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t ; ZVFHMIN-NEXT: slli a2, a0, 1 -; ZVFHMIN-NEXT: mv a3, a1 -; ZVFHMIN-NEXT: vmv4r.v v4, v12 +; ZVFHMIN-NEXT: vmv4r.v v4, v20 ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, 
sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a4, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a1, a2, .LBB287_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB287_2: +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a1, a2, .LBB287_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: .LBB287_2: ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: mv a5, a4 @@ -9711,18 +9723,21 @@ define @vfnmadd_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 3 ; ZVFHMIN-NEXT: mv a4, a3 @@ -9730,32 +9745,28 @@ define @vfnmadd_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: add a3, a3, a4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: sub a2, a1, a2 +; ZVFHMIN-NEXT: srli a0, a0, 2 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: slli a3, a3, 5 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v 
v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sub a2, a1, a2 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: sltu a1, a1, a2 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a0 ; ZVFHMIN-NEXT: addi a1, a1, -1 ; ZVFHMIN-NEXT: and a1, a1, a2 -; ZVFHMIN-NEXT: srli a0, a0, 2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a2, a0 @@ -9765,17 +9776,20 @@ define @vfnmadd_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -9805,106 +9819,157 @@ define @vfnmadd_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a3, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 2 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb +; ZVFHMIN-NEXT: vl8re16.v v24, (a0) +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a2, a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a2 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: lui a2, 8 +; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a4, a0, 5 +; ZVFHMIN-NEXT: add a0, a4, a0 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: slli a0, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; 
ZVFHMIN-NEXT: sub a4, a1, a0 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v16, v7, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs1r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: sltu a3, a1, a4 +; ZVFHMIN-NEXT: csrr a5, vlenb +; ZVFHMIN-NEXT: mv a6, a5 +; ZVFHMIN-NEXT: slli a5, a5, 3 +; ZVFHMIN-NEXT: add a6, a6, a5 +; ZVFHMIN-NEXT: slli a5, a5, 1 +; ZVFHMIN-NEXT: add a5, a5, a6 +; ZVFHMIN-NEXT: add a5, sp, a5 +; ZVFHMIN-NEXT: addi a5, a5, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v0, v16, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a4, a4, a2 ; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v24, (a0) -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a0 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: slli a4, a2, 4 +; ZVFHMIN-NEXT: add a2, a4, a2 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v24, v24, a0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a0, a2, 1 -; ZVFHMIN-NEXT: sub a3, a1, a0 -; ZVFHMIN-NEXT: sltu a4, a1, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: slli a4, a2, 5 +; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: 
slli a4, a2, 4 +; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v28, v8 +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: slli a3, a2, 5 +; ZVFHMIN-NEXT: add a2, a3, a2 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: slli a3, a2, 5 +; ZVFHMIN-NEXT: add a2, a3, a2 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a1, a0, .LBB288_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: .LBB288_2: ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a2, a2, a0 ; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: add a0, a0, a2 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a2, a0, 5 +; ZVFHMIN-NEXT: add a0, a2, a0 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v8, v16 +; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v0 -; ZVFHMIN-NEXT: vmv8r.v v8, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a1, a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -9929,79 +9994,109 @@ define @vfnmadd_vv_nxv32f16_unmasked_commuted( @vfnmadd_vv_nxv32f16_unmasked_commuted( @vfnmadd_vv_nxv32f16_unmasked_commuted( @vfnmadd_vv_nxv32f16_unmasked_commuted( @vfnmadd_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; 
ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a4, 8 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a2 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: mv a3, a0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1, v0.t -; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: vxor.vx v8, v8, a4, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v16, a4, v0.t ; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: mv a3, a0 ; ZVFHMIN-NEXT: addi a4, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma @@ -10086,12 +10187,12 @@ define @vfnmadd_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vmv4r.v v4, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a0, a2, .LBB290_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 @@ -10141,19 +10242,18 @@ define @vfnmadd_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: sub a2, a0, a2 +; ZVFHMIN-NEXT: srli a1, a1, 2 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 5 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: sltu a0, a0, a2 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: addi a0, a0, -1 ; ZVFHMIN-NEXT: and a0, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 @@ -10210,51 +10310,59 @@ define @vfnmadd_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a4, 8 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a1, 8 
+; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a2 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: mv a3, a0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1, v0.t -; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: vxor.vx v8, v8, a4, v0.t +; ZVFHMIN-NEXT: vxor.vx v24, v16, a4, v0.t ; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vmv4r.v v20, v28 +; ZVFHMIN-NEXT: addi a4, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vmv4r.v v4, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a0, a2, .LBB291_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 ; ZVFHMIN-NEXT: .LBB291_2: ; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: mv a5, a4 ; ZVFHMIN-NEXT: slli a4, a4, 1 ; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 +; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 @@ -10262,56 +10370,55 @@ define @vfnmadd_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 3 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # 
Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 ; ZVFHMIN-NEXT: sub a2, a0, a2 +; ZVFHMIN-NEXT: srli a1, a1, 2 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: sltu a0, a0, a2 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: addi a0, a0, -1 ; ZVFHMIN-NEXT: and a0, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 @@ -10347,73 +10454,71 @@ define @vfnmadd_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: lui a2, 8 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: mv a4, a1 ; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: add a1, a1, a4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 +; ZVFHMIN-NEXT: vxor.vx 
v8, v8, a2 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2 ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a2, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a2 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a2, a3, a2 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v28, v8 ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfncvt.f.f.w v28, v8 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 @@ -10474,80 +10579,77 @@ define @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmadd_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; 
ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v24, a1, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1, v0.t +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a4, 8 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 1 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-NEXT: mv a3, a0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a4, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v16, a4, v0.t +; ZVFHMIN-NEXT: slli a2, a1, 1 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill @@ -10646,61 +10747,66 @@ define @vfnmadd_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t ; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: slli a3, a3, 3 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: slli a3, a3, 4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 ; ZVFHMIN-NEXT: sub a2, a0, a2 +; ZVFHMIN-NEXT: srli a1, a1, 2 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: sltu a0, a0, a2 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: addi a0, a0, -1 ; ZVFHMIN-NEXT: and a0, a0, a2 -; ZVFHMIN-NEXT: srli a1, 
a1, 2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -10728,40 +10834,43 @@ define @vfnmadd_vf_nxv32f16_neg_splat_commute( @vfnmadd_vf_nxv32f16_neg_splat_commute( @vfnmadd_vf_nxv32f16_neg_splat_commute( @vfnmadd_vf_nxv32f16_neg_splat_commute( @vfnmadd_vf_nxv32f16_neg_splat_unmasked( @vfnmadd_vf_nxv32f16_neg_splat_unmasked( @vfnmadd_vf_nxv32f16_neg_splat_unmasked( @vfnmadd_vf_nxv32f16_neg_splat_unmasked( @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: sub sp, sp, a2 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vl8re16.v v24, (a0) +; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: mv a3, a1 ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v16, a0, v0.t -; ZVFHMIN-NEXT: vxor.vx v16, v24, a0, v0.t -; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t ; ZVFHMIN-NEXT: slli a2, a0, 1 -; ZVFHMIN-NEXT: mv a3, a1 -; ZVFHMIN-NEXT: vmv4r.v v4, v12 +; ZVFHMIN-NEXT: vmv4r.v v4, v20 ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a4, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size 
Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 -; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 -; ZVFHMIN-NEXT: add a4, sp, a4 -; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a1, a2, .LBB299_2 ; ZVFHMIN-NEXT: # %bb.1: @@ -11293,8 +11479,8 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: slli a4, a4, 5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: mv a5, a4 @@ -11302,18 +11488,21 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 3 ; ZVFHMIN-NEXT: mv a4, a3 @@ -11321,32 +11510,28 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: add a3, a3, a4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: sub a2, a1, a2 +; ZVFHMIN-NEXT: srli a0, a0, 2 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: slli a3, a3, 5 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sub a2, a1, a2 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: sltu a1, a1, a2 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a0 ; ZVFHMIN-NEXT: addi a1, a1, -1 ; ZVFHMIN-NEXT: and a1, a1, a2 -; ZVFHMIN-NEXT: srli a0, 
a0, 2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a2, a0 @@ -11356,17 +11541,20 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -11396,106 +11584,157 @@ define @vfnmsub_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a3, a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 2 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: sub sp, sp, a2 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb +; ZVFHMIN-NEXT: vl8re16.v v24, (a0) +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a2, a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a2 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: lui a2, 8 +; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v16, v16, a2 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a4, a0, 5 +; ZVFHMIN-NEXT: add a0, a4, a0 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: slli a0, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a1, a0 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v16, v7, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs1r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: sltu a3, a1, a4 +; ZVFHMIN-NEXT: csrr a5, vlenb +; 
ZVFHMIN-NEXT: mv a6, a5 +; ZVFHMIN-NEXT: slli a5, a5, 3 +; ZVFHMIN-NEXT: add a6, a6, a5 +; ZVFHMIN-NEXT: slli a5, a5, 1 +; ZVFHMIN-NEXT: add a5, a5, a6 +; ZVFHMIN-NEXT: add a5, sp, a5 +; ZVFHMIN-NEXT: addi a5, a5, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v0, v16, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a4, a4, a2 ; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a3 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vl8re16.v v24, (a0) -; ZVFHMIN-NEXT: lui a0, 8 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v16, v16, a0 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: slli a4, a2, 4 +; ZVFHMIN-NEXT: add a2, a4, a2 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 -; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v24, v24, a0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a0, a2, 1 -; ZVFHMIN-NEXT: sub a3, a1, a0 -; ZVFHMIN-NEXT: sltu a4, a1, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: slli a4, a2, 5 +; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a4, a2, 4 +; ZVFHMIN-NEXT: add a2, a4, a2 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma 
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v28, v8 +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: slli a3, a2, 5 +; ZVFHMIN-NEXT: add a2, a3, a2 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: slli a3, a2, 5 +; ZVFHMIN-NEXT: add a2, a3, a2 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: bltu a1, a0, .LBB300_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 ; ZVFHMIN-NEXT: .LBB300_2: ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a2, a2, a0 ; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: add a0, a0, a2 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a2, a0, 5 +; ZVFHMIN-NEXT: add a0, a2, a0 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v8, v16 +; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v0 -; ZVFHMIN-NEXT: vmv8r.v v8, v24 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a1, a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -11520,79 +11759,109 @@ define @vfnmsub_vv_nxv32f16_unmasked_commuted( @vfnmsub_vv_nxv32f16_unmasked_commuted( @vfnmsub_vv_nxv32f16_unmasked_commuted( @vfnmsub_vv_nxv32f16_unmasked_commuted( @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a3, 8 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, 
ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a2 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 5 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: vxor.vx v16, v8, a3, v0.t ; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: vmv4r.v v4, v12 -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 4 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB302_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB302_2: +; ZVFHMIN-NEXT: vmv4r.v v12, v20 +; ZVFHMIN-NEXT: addi a4, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 @@ -11683,36 +11951,75 @@ define @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a0, a2, .LBB302_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: .LBB302_2: +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv4r.v v4, v28 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t -; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: sub a2, a0, a2 +; ZVFHMIN-NEXT: srli a1, a1, 2 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, 
v4 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: sub a2, a0, a2 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 5 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: sltu a0, a0, a2 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: addi a0, a0, -1 ; ZVFHMIN-NEXT: and a0, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 @@ -11720,33 +12027,18 @@ define @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 @@ -11783,46 +12075,42 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a3, 8 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a2 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; 
ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t -; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: vxor.vx v8, v8, a3, v0.t ; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: vmv4r.v v4, v12 -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB303_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB303_2: ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: bltu a0, a2, .LBB303_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: .LBB303_2: +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv4r.v v4, v28 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: mv a5, a4 @@ -11841,41 +12129,50 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t ; ZVFHMIN-NEXT: addi a3, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: sub a2, a0, a2 +; ZVFHMIN-NEXT: srli a1, a1, 2 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: sltu a0, a0, a2 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: addi a0, a0, -1 ; ZVFHMIN-NEXT: and a0, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi 
a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 @@ -11909,50 +12206,51 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a1 -; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: slli a1, a1, 2 ; ZVFHMIN-NEXT: add a1, a1, a2 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: vxor.vx v16, v8, a1 ; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v7 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: slli a1, a3, 1 +; ZVFHMIN-NEXT: srli a3, a3, 2 +; ZVFHMIN-NEXT: sub a4, a0, a1 +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 +; ZVFHMIN-NEXT: sltu a3, a0, a4 +; ZVFHMIN-NEXT: addi a3, a3, -1 +; ZVFHMIN-NEXT: and a3, a3, a4 +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v24, a2 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v8, v20 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv4r.v v8, v16 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vmv4r.v v8, v24 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: mv a4, a2 @@ -11960,22 +12258,27 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8 +; ZVFHMIN-NEXT: vfncvt.f.f.w v28, v8 ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -11984,11 +12287,11 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB304_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 +; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 @@ -11996,20 +12299,23 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v0, v8 +; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v8 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 -; ZVFHMIN-NEXT: vmv8r.v v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v24, v0 +; 
ZVFHMIN-NEXT: vmv8r.v v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -12034,50 +12340,46 @@ define @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: slli a1, a1, 2 ; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv8r.v v16, v8 -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a1 -; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: lui a3, 8 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a2 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v24, v16, a3, v0.t ; ZVFHMIN-NEXT: slli a2, a1, 1 -; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: slli a3, a3, 3 -; ZVFHMIN-NEXT: add a3, sp, a3 -; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB306_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB306_2: +; ZVFHMIN-NEXT: vmv4r.v v20, v28 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: mv a5, a4 @@ -12202,37 +12507,77 @@ define @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv8r.v v0, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0 +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: bltu a0, a2, .LBB306_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: .LBB306_2: ; 
ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: addi a4, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t -; ZVFHMIN-NEXT: addi a3, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv4r.v v4, v12 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a3, a3, 3 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 ; ZVFHMIN-NEXT: sub a2, a0, a2 +; ZVFHMIN-NEXT: srli a1, a1, 2 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 5 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: sltu a0, a0, a2 +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: addi a0, a0, -1 ; ZVFHMIN-NEXT: and a0, a0, a2 -; ZVFHMIN-NEXT: srli a1, a1, 2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 @@ -12241,28 +12586,23 @@ define @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 -; ZVFHMIN-NEXT: csrr a1, 
vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 -; ZVFHMIN-NEXT: add a1, sp, a1 -; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t -; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -12287,125 +12627,118 @@ define @vfnmsub_vf_nxv32f16_neg_splat_commute( @vfnmsub_vf_nxv32f16_neg_splat_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfmadd_vf_nxv1bf16( %va, @vfmadd_vf_nxv2bf16( %va, @vfmadd_vf_nxv4bf16( %va, poison, bfloat %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -149,14 +149,14 @@ define @vfmadd_vf_nxv8bf16( %va, poison, bfloat %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -200,31 +200,16 @@ define @vfmadd_vv_nxv16bf16( %va, < define @vfmadd_vf_nxv16bf16( %va, %vb, bfloat %c) strictfp { ; CHECK-LABEL: vfmadd_vf_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v0, v24 +; CHECK-NEXT: vfmadd.vv v0, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -242,31 +227,30 @@ define @vfmadd_vv_nxv32bf16( %va, < ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: vl8re16.v v0, (a0) -; CHECK-NEXT: vmv8r.v v24, v16 +; CHECK-NEXT: vmv8r.v v0, v16 +; 
CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: vl8re16.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv8r.v v16, v8 -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv8r.v v8, v0 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 @@ -278,41 +262,31 @@ define @vfmadd_vv_nxv32bf16( %va, < ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v16, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmadd.vv v0, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v16, v8, v24 @@ -347,49 +321,50 @@ define @vfmadd_vf_nxv32bf16( %va, < ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # 
Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v24, a0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v24 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v16, v24 +; CHECK-NEXT: vfmadd.vv v24, v16, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v8, v24 +; CHECK-NEXT: vfmadd.vv v16, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 @@ -440,12 +415,12 @@ define @vfmadd_vf_nxv1f16( %va, @vfmadd_vf_nxv2f16( %va, @vfmadd_vf_nxv4f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -590,14 +565,14 @@ define @vfmadd_vf_nxv8f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -654,31 +629,16 @@ define @vfmadd_vf_nxv16f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -704,31 +664,30 @@ define @vfmadd_vv_nxv32f16( %va, @vfmadd_vv_nxv32f16( %va, @vfmadd_vf_nxv32f16( %va, @vfmadd_vf_nxv1bf16( %va, @vfmadd_vf_nxv2bf16( %va, @vfmadd_vf_nxv4bf16( %va, poison, bfloat %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -162,14 +162,14 @@ define @vfmadd_vf_nxv8bf16( %va, poison, bfloat %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -198,31 +198,16 @@ define @vfmadd_vv_nxv16bf16( %va, < define @vfmadd_vf_nxv16bf16( 
%va, %vb, bfloat %c) { ; CHECK-LABEL: vfmadd_vf_nxv16bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v0, v24 +; CHECK-NEXT: vfmadd.vv v0, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -241,18 +226,18 @@ define @vfmadd_vv_nxv32bf16( %va, < ; ZVFH-NEXT: slli a1, a1, 5 ; ZVFH-NEXT: sub sp, sp, a1 ; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFH-NEXT: vl8re16.v v0, (a0) -; ZVFH-NEXT: vmv8r.v v24, v16 +; ZVFH-NEXT: vmv8r.v v0, v16 +; ZVFH-NEXT: addi a1, sp, 16 +; ZVFH-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFH-NEXT: vmv8r.v v16, v8 +; ZVFH-NEXT: vl8re16.v v8, (a0) ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 4 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; ZVFH-NEXT: vmv8r.v v16, v8 -; ZVFH-NEXT: addi a0, sp, 16 ; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: mv a1, a0 @@ -260,14 +245,13 @@ define @vfmadd_vv_nxv32bf16( %va, < ; ZVFH-NEXT: add a0, a0, a1 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v24 +; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v0 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; ZVFH-NEXT: vmv8r.v v8, v0 +; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v8 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 @@ -281,33 +265,28 @@ define @vfmadd_vv_nxv32bf16( %va, < ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFH-NEXT: vfmadd.vv v0, v16, v24 -; ZVFH-NEXT: addi a0, sp, 16 -; ZVFH-NEXT: vl8r.v 
v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vfmadd.vv v0, v8, v24 ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 ; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: addi a0, sp, 16 ; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v20 ; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: mv a1, a0 -; ZVFH-NEXT: slli a0, a0, 1 -; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: slli a0, a0, 4 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12 +; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v28 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: mv a1, a0 @@ -315,11 +294,6 @@ define @vfmadd_vv_nxv32bf16( %va, < ; ZVFH-NEXT: add a0, a0, a1 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 ; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFH-NEXT: vfmadd.vv v16, v8, v24 @@ -342,31 +316,30 @@ define @vfmadd_vv_nxv32bf16( %va, < ; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; ZVFHMIN-NEXT: vl8re16.v v0, (a0) -; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: vmv8r.v v0, v16 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv8r.v v16, v8 +; ZVFHMIN-NEXT: vl8re16.v v8, (a0) ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv8r.v v16, v8 -; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: li a1, 24 ; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v24 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vmv8r.v v8, v0 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v8 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: li a1, 24 @@ -378,41 +351,31 @@ define @vfmadd_vv_nxv32bf16( %va, < ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v 
v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmadd.vv v0, v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v20 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 24 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v28 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: li a1, 24 ; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24 @@ -436,68 +399,91 @@ define @vfmadd_vf_nxv32bf16( %va, < ; ZVFH-NEXT: addi sp, sp, -16 ; ZVFH-NEXT: .cfi_def_cfa_offset 16 ; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 5 +; ZVFH-NEXT: sub sp, sp, a0 +; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFH-NEXT: vmv8r.v v0, v16 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vmv8r.v v16, v8 +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16 +; ZVFH-NEXT: csrr a1, vlenb +; ZVFH-NEXT: slli a1, a1, 4 +; ZVFH-NEXT: add a1, sp, a1 +; ZVFH-NEXT: addi a1, a1, 16 +; ZVFH-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v0 +; ZVFH-NEXT: csrr a1, vlenb +; ZVFH-NEXT: slli a1, a1, 3 +; ZVFH-NEXT: add a1, sp, a1 +; ZVFH-NEXT: addi a1, a1, 16 +; ZVFH-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFH-NEXT: vmv.v.x v24, a0 +; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: mv a1, a0 ; ZVFH-NEXT: slli a0, a0, 1 ; ZVFH-NEXT: add a0, a0, a1 -; ZVFH-NEXT: sub sp, sp, a0 -; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; ZVFH-NEXT: vmv8r.v v24, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; ZVFH-NEXT: fmv.x.h a0, fa0 -; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFH-NEXT: 
vmv.v.x v16, a0 +; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8 -; ZVFH-NEXT: addi a0, sp, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v0 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 4 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v24 -; ZVFH-NEXT: addi a0, sp, 16 -; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFH-NEXT: vfmadd.vv v0, v16, v24 -; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12 -; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfmadd.vv v8, v24, v0 +; ZVFH-NEXT: vmv.v.v v24, v8 +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v20 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 4 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20 +; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; ZVFH-NEXT: addi a0, sp, 16 ; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFH-NEXT: vfmadd.vv v24, v8, v16 -; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v0 -; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v24 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v20 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: mv a1, a0 ; ZVFH-NEXT: slli a0, a0, 1 ; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v4 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfmadd.vv v16, v8, v0 +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24 +; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 5 ; ZVFH-NEXT: add sp, sp, a0 ; ZVFH-NEXT: .cfi_def_cfa sp, 16 ; ZVFH-NEXT: addi sp, sp, 16 @@ -509,64 +495,85 @@ define @vfmadd_vf_nxv32bf16( %va, < ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 24 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp 
+ 16 + 24 * vlenb -; ZVFHMIN-NEXT: vmv8r.v v24, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: vmv8r.v v0, v16 +; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v0 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v24, a0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v24 -; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v0 +; ZVFHMIN-NEXT: vmv.v.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v20 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v20 ; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v4 +; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi 
a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16 +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v0 -; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v24 +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 24 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -635,12 +642,12 @@ define @vfmadd_vf_nxv1f16( %va, @vfmadd_vf_nxv2f16( %va, @vfmadd_vf_nxv4f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -785,14 +792,14 @@ define @vfmadd_vf_nxv8f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -833,31 +840,16 @@ define @vfmadd_vf_nxv16f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -883,31 +875,30 @@ define @vfmadd_vv_nxv32f16( %va, @vfmadd_vv_nxv32f16( %va, @vfmadd_vf_nxv32f16( %va, @llvm.maxnum.nxv32bf16(, @vfmax_nxv32bf16_vv( %a, %b) { ; CHECK-LABEL: vfmax_nxv32bf16_vv: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmax.vv v24, v0, v24 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v0, v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmax.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.maxnum.nxv32bf16( %a, %b) ret %v @@ -216,22 +231,39 @@ define @vfmax_nxv32bf16_vv( %a, @vfmax_nxv32bf16_vf( %a, bfloat %b) { ; CHECK-LABEL: vfmax_nxv32bf16_vf: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; 
CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmax.vv v24, v24, v0 +; CHECK-NEXT: vfmax.vv v0, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmax.vv v16, v24, v0 +; CHECK-NEXT: vfmax.vv v16, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -490,19 +522,34 @@ define @vfmax_nxv32f16_vv( %a, @llvm.maxnum.nxv32f16( %a, %b) ret %v @@ -517,22 +564,39 @@ define @vfmax_nxv32f16_vf( %a, half %b) ; ; ZVFHMIN-LABEL: vfmax_nxv32f16_vf: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v24, v24, v0 +; ZVFHMIN-NEXT: vfmax.vv v0, v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v20 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v16, v24, v0 +; ZVFHMIN-NEXT: vfmax.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll index 
b8d62b04e5c2c..dafcf8a1410d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll @@ -185,20 +185,21 @@ define @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16_unmasked( @vfmax_vv_nxv32f16( %va, @vfmax_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll index de49aed6e52b2..3ee82c33485f6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll @@ -195,19 +195,34 @@ declare @llvm.minnum.nxv32bf16(, @vfmin_nxv32bf16_vv( %a, %b) { ; CHECK-LABEL: vfmin_nxv32bf16_vv: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmin.vv v24, v0, v24 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v0, v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmin.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; 
CHECK-NEXT: ret %v = call @llvm.minnum.nxv32bf16( %a, %b) ret %v @@ -216,22 +231,39 @@ define @vfmin_nxv32bf16_vv( %a, @vfmin_nxv32bf16_vf( %a, bfloat %b) { ; CHECK-LABEL: vfmin_nxv32bf16_vf: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmin.vv v24, v24, v0 +; CHECK-NEXT: vfmin.vv v0, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmin.vv v16, v24, v0 +; CHECK-NEXT: vfmin.vv v16, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -490,19 +522,34 @@ define @vfmin_nxv32f16_vv( %a, @llvm.minnum.nxv32f16( %a, %b) ret %v @@ -517,22 +564,39 @@ define @vfmin_nxv32f16_vf( %a, half %b) ; ; ZVFHMIN-LABEL: vfmin_nxv32f16_vf: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v24, v24, v0 +; ZVFHMIN-NEXT: vfmin.vv v0, v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v20 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; 
ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v16, v24, v0 +; ZVFHMIN-NEXT: vfmin.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll index d6848943110c5..b3df6572f7936 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll @@ -185,20 +185,21 @@ define @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16_unmasked( @vfmin_vv_nxv32f16( %va, @vfmin_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll index d082f85106f16..978347fa4fc10 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll @@ -24,12 +24,12 @@ define @vfmsub_vv_nxv1f16( %va, @vfmsub_vf_nxv1f16( %va, @vfmsub_vv_nxv2f16( %va, %vb %vd = call @llvm.experimental.constrained.fma.nxv2f16( %va, %vc, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -107,14 +107,14 @@ define @vfmsub_vf_nxv2f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -136,12 +136,12 @@ define @vfmsub_vv_nxv4f16( %va, @vfmsub_vf_nxv4f16( %va, @vfmsub_vv_nxv8f16( %va, %va %vd = call @llvm.experimental.constrained.fma.nxv8f16( %vb, %vc, %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -219,12 +219,12 @@ define @vfmsub_vf_nxv8f16( %va, @vfmsub_vv_nxv16f16( %va, @vfmsub_vf_nxv16f16( %va, @vfmsub_vv_nxv32f16( %va, @vfmsub_vf_nxv32f16( %va, @vfmsub_vf_nxv32f16( %va, @vfmsub_vf_nxv32f16( %va, @vfmsub_vf_nxv32f16( 
%va, @vfmul_vf_nxv16bf16( %va, bf define @vfmul_vv_nxv32bf16( %va, %vb) strictfp { ; CHECK-LABEL: vfmul_vv_nxv32bf16: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmul.vv v24, v0, v24 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v0, v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmul.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret entry: %vc = call @llvm.experimental.constrained.fmul.nxv32bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") @@ -212,22 +227,39 @@ entry: define @vfmul_vf_nxv32bf16( %va, bfloat %b) strictfp { ; CHECK-LABEL: vfmul_vf_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmul.vv v24, v24, v0 +; CHECK-NEXT: vfmul.vv v0, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmul.vv v16, v16, v24 +; CHECK-NEXT: vfmul.vv v16, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %head = insertelement 
poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -485,19 +517,34 @@ define @vfmul_vv_nxv32f16( %va, @llvm.experimental.constrained.fmul.nxv32f16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") @@ -513,22 +560,39 @@ define @vfmul_vf_nxv32f16( %va, half %b ; ; ZVFHMIN-LABEL: vfmul_vf_nxv32f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v24, v24, v0 +; ZVFHMIN-NEXT: vfmul.vv v0, v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24 +; ZVFHMIN-NEXT: vfmul.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll index bbacbaa8e5e49..fb8ed3f943fff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll @@ -204,19 +204,34 @@ define @vfmul_vf_nxv16bf16( %va, bf define @vfmul_vv_nxv32bf16( %va, %vb) { ; CHECK-LABEL: vfmul_vv_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmul.vv v24, v0, v24 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v0, v0, v8 +; CHECK-NEXT: 
vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmul.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vc = fmul %va, %vb ret %vc @@ -225,22 +240,39 @@ define @vfmul_vv_nxv32bf16( %va, @vfmul_vf_nxv32bf16( %va, bfloat %b) { ; CHECK-LABEL: vfmul_vf_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmul.vv v24, v24, v0 +; CHECK-NEXT: vfmul.vv v0, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmul.vv v16, v24, v0 +; CHECK-NEXT: vfmul.vv v16, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -512,19 +544,34 @@ define @vfmul_vv_nxv32f16( %va, %va, %vb ret %vc @@ -539,22 +586,39 @@ define @vfmul_vf_nxv32f16( %va, half %b ; ; ZVFHMIN-LABEL: vfmul_vf_nxv32f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a0, 
sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v24, v24, v0 +; ZVFHMIN-NEXT: vfmul.vv v0, v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v20 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v16, v24, v0 +; ZVFHMIN-NEXT: vfmul.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll index eacc8676f3483..f4a236df4c9e4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll @@ -497,20 +497,21 @@ define @vfmul_vv_nxv32f16( %va, @vfmul_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 @@ -612,23 +614,24 @@ define @vfmul_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb -; ZVFHMIN-NEXT: vmv8r.v v24, v8 +; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 3 -; 
ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: slli a3, a1, 3 +; ZVFHMIN-NEXT: add a1, a3, a1 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 @@ -636,18 +639,18 @@ define @vfmul_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 3 -; ZVFHMIN-NEXT: add a2, a4, a2 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a4, a3, 3 +; ZVFHMIN-NEXT: add a3, a4, a3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 @@ -705,31 +708,37 @@ define @vfmul_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vmv4r.v v16, v8 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; 
ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmul.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll index 6f2c5b6e46f86..d1702268f829f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll @@ -1108,10 +1108,10 @@ define @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64_unmasked( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a5, a2, a3 -; CHECK-NEXT: vl8re64.v v24, (a5) +; CHECK-NEXT: vl8re64.v v8, (a5) +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a5, a5, 3 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: sub a5, a4, a1 ; CHECK-NEXT: add a3, a0, a3 -; CHECK-NEXT: vl8re64.v v16, (a3) -; CHECK-NEXT: sub a3, a4, a1 -; CHECK-NEXT: sltu a5, a4, a3 +; CHECK-NEXT: vl8re64.v v24, (a3) +; CHECK-NEXT: sltu a3, a4, a5 ; CHECK-NEXT: vl8re64.v v8, (a2) ; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vl8re64.v v0, (a0) -; CHECK-NEXT: addi a5, a5, -1 -; CHECK-NEXT: and a3, a5, a3 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a5 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v16, v8, v24 +; CHECK-NEXT: vfmadd.vv v24, v16, v8 ; CHECK-NEXT: bltu a4, a1, .LBB93_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a1 ; CHECK-NEXT: .LBB93_2: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; 
CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v24, v8 +; CHECK-NEXT: vfmadd.vv v0, v16, v8 ; CHECK-NEXT: vmv.v.v v8, v0 +; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll index 6fa6c26890c3e..343098e87649e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll @@ -453,12 +453,12 @@ define @vfneg_vv_nxv16f64( %va, @vfnmsub_vf_nxv4f16( %va, @vfnmsub_vf_nxv8f16( %va, @vfnmsub_vf_nxv16f16( %va, @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vf_nxv32f16( %va, @vfnmsub_vv_nxv1f16( %va, %va %vd = call @llvm.experimental.constrained.fma.nxv1f16( %neg, %vb, %vc, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -51,14 +51,14 @@ define @vfnmsub_vf_nxv1f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -80,14 +80,14 @@ define @vfnmsub_vv_nxv2f16( %va, %va %vd = call @llvm.experimental.constrained.fma.nxv2f16( %neg, %vc, %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -107,14 +107,14 @@ define @vfnmsub_vf_nxv2f16( %va, poison, half %c, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -136,14 +136,14 @@ define @vfnmsub_vv_nxv4f16( %va, %vb %vd = call @llvm.experimental.constrained.fma.nxv4f16( %neg, %va, %vc, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -163,12 +163,12 @@ define @vfnmsub_vf_nxv4f16( %va, @vfnmsub_vv_nxv8f16( %va, @vfnmsub_vf_nxv8f16( %va, @vfnmsub_vv_nxv16f16( %va, @vfnmsub_vf_nxv16f16( %va, @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vf_nxv32f16( %va, @vfpext_nxv32f16_nxv32f32( %a, ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll index 37e14783d1873..d707b4254d3e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll @@ -628,17 +628,17 @@ define @vfptosi_nxv32bf16_nxv32i1( %va) ; CHECK-LABEL: vfptosi_nxv32bf16_nxv32i1: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v16 -; CHECK-NEXT: vand.vi v12, v12, 1 -; CHECK-NEXT: vmsne.vi v16, v12, 0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v24 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v16 +; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v24 +; CHECK-NEXT: add a1, a0, a0 ; CHECK-NEXT: vand.vi v8, v8, 1 -; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vand.vi v12, v12, 1 +; CHECK-NEXT: vmsne.vi v16, v8, 0 
+; CHECK-NEXT: vmsne.vi v0, v12, 0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v0, v16, a0 ; CHECK-NEXT: ret @@ -650,17 +650,17 @@ define @vfptoui_nxv32bf16_nxv32i1( %va) ; CHECK-LABEL: vfptoui_nxv32bf16_nxv32i1: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v16 -; CHECK-NEXT: vand.vi v12, v12, 1 -; CHECK-NEXT: vmsne.vi v16, v12, 0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v24 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v16 +; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v24 +; CHECK-NEXT: add a1, a0, a0 ; CHECK-NEXT: vand.vi v8, v8, 1 -; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vand.vi v12, v12, 1 +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v12, 0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v0, v16, a0 ; CHECK-NEXT: ret @@ -673,12 +673,12 @@ define @vfptosi_nxv32bf16_nxv32i8( %va) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfncvt.rtz.x.f.w v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v16 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v24, 0 +; CHECK-NEXT: vnsrl.wi v8, v12, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v16 +; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v24 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-NEXT: vnsrl.wi v10, v12, 0 ; CHECK-NEXT: ret @@ -691,12 +691,12 @@ define @vfptoui_nxv32bf16_nxv32i8( %va) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfncvt.rtz.xu.f.w v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v16 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v24, 0 +; CHECK-NEXT: vnsrl.wi v8, v12, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v16 +; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v24 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-NEXT: vnsrl.wi v10, v12, 0 ; CHECK-NEXT: ret @@ -1648,17 +1648,17 @@ define @vfptosi_nxv32f16_nxv32i1( %va) { ; ZVFHMIN-LABEL: vfptosi_nxv32f16_nxv32i1: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: srli a0, a0, 2 -; ZVFHMIN-NEXT: add a1, a0, a0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v12, v16 -; ZVFHMIN-NEXT: vand.vi v12, v12, 1 -; ZVFHMIN-NEXT: vmsne.vi v16, v12, 0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v8, v24 +; ZVFHMIN-NEXT: srli a0, a0, 2 +; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v12, v24 +; ZVFHMIN-NEXT: add a1, a0, a0 ; ZVFHMIN-NEXT: vand.vi v8, v8, 1 -; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0 +; ZVFHMIN-NEXT: vand.vi v12, v12, 1 +; ZVFHMIN-NEXT: vmsne.vi v16, v8, 0 +; ZVFHMIN-NEXT: vmsne.vi v0, v12, 0 ; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslideup.vx v0, v16, a0 ; ZVFHMIN-NEXT: ret @@ -1678,17 +1678,17 @@ define @vfptoui_nxv32f16_nxv32i1( %va) { ; ZVFHMIN-LABEL: 
vfptoui_nxv32f16_nxv32i1: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: srli a0, a0, 2 -; ZVFHMIN-NEXT: add a1, a0, a0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v12, v16 -; ZVFHMIN-NEXT: vand.vi v12, v12, 1 -; ZVFHMIN-NEXT: vmsne.vi v16, v12, 0 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v8, v24 +; ZVFHMIN-NEXT: srli a0, a0, 2 +; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v12, v24 +; ZVFHMIN-NEXT: add a1, a0, a0 ; ZVFHMIN-NEXT: vand.vi v8, v8, 1 -; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0 +; ZVFHMIN-NEXT: vand.vi v12, v12, 1 +; ZVFHMIN-NEXT: vmsne.vi v16, v8, 0 +; ZVFHMIN-NEXT: vmsne.vi v0, v12, 0 ; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslideup.vx v0, v16, a0 ; ZVFHMIN-NEXT: ret @@ -1708,12 +1708,12 @@ define @vfptosi_nxv32f16_nxv32i8( %va) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v24, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v12, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; ZVFHMIN-NEXT: vnsrl.wi v8, v24, 0 +; ZVFHMIN-NEXT: vnsrl.wi v8, v12, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v12, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; ZVFHMIN-NEXT: vnsrl.wi v10, v12, 0 ; ZVFHMIN-NEXT: ret @@ -1733,12 +1733,12 @@ define @vfptoui_nxv32f16_nxv32i8( %va) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v24, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v12, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; ZVFHMIN-NEXT: vnsrl.wi v8, v24, 0 +; ZVFHMIN-NEXT: vnsrl.wi v8, v12, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v12, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; ZVFHMIN-NEXT: vnsrl.wi v10, v12, 0 ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll index 23d47cc3cb1fd..cf195c7c0935e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll @@ -508,42 +508,26 @@ declare @llvm.vp.fptosi.nxv32i16.nxv32f32( @vfptosi_nxv32i16_nxv32f32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv32i16_nxv32f32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: addi 
a3, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vfncvt.rtz.x.f.w v20, v24, v0.t +; CHECK-NEXT: vfncvt.rtz.x.f.w v28, v16, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB34_2: -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfncvt.rtz.x.f.w v16, v8, v0.t -; CHECK-NEXT: vmv8r.v v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vfncvt.rtz.x.f.w v24, v8, v0.t +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %v = call @llvm.vp.fptosi.nxv32i16.nxv32f32( %va, %m, i32 %evl) ret %v @@ -557,9 +541,9 @@ define @vfptosi_nxv32i32_nxv32f32( %va, ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll index acd360f06046f..952d28604b86c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll @@ -508,42 +508,26 @@ declare @llvm.vp.fptoui.nxv32i16.nxv32f32( @vfptoui_nxv32i16_nxv32f32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_nxv32i16_nxv32f32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vfncvt.rtz.xu.f.w v20, v24, v0.t +; CHECK-NEXT: vfncvt.rtz.xu.f.w v28, v16, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB34_2: -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfncvt.rtz.xu.f.w v16, v8, v0.t -; CHECK-NEXT: vmv8r.v v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vfncvt.rtz.xu.f.w v24, v8, v0.t +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %v = call @llvm.vp.fptoui.nxv32i16.nxv32f32( %va, %m, i32 %evl) ret %v @@ -557,9 +541,9 @@ define @vfptoui_nxv32i32_nxv32f32( %va, ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, 
ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll index b2bfb10d39df3..874813f057595 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll @@ -107,12 +107,12 @@ define @vfptrunc_nxv16f32_nxv16f64( ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 -; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 ; CHECK-NEXT: addi a3, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma @@ -157,35 +157,35 @@ define @vfptrunc_nxv32f32_nxv32f64( ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a3, a1, 3 -; CHECK-NEXT: srli a4, a1, 2 -; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v0, a4 -; CHECK-NEXT: slli a4, a1, 3 -; CHECK-NEXT: add a4, a0, a4 -; CHECK-NEXT: vl8re64.v v8, (a4) +; CHECK-NEXT: srli a5, a1, 2 +; CHECK-NEXT: slli a6, a1, 3 ; CHECK-NEXT: slli a4, a1, 1 +; CHECK-NEXT: vsetvli a7, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v0, a5 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: sub a5, a2, a4 +; CHECK-NEXT: vl8re64.v v24, (a6) ; CHECK-NEXT: sltu a6, a2, a5 ; CHECK-NEXT: addi a6, a6, -1 ; CHECK-NEXT: and a5, a6, a5 ; CHECK-NEXT: sub a6, a5, a1 ; CHECK-NEXT: sltu a7, a5, a6 ; CHECK-NEXT: addi a7, a7, -1 -; CHECK-NEXT: vl8re64.v v24, (a0) +; CHECK-NEXT: vl8re64.v v8, (a0) ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v16, a3 ; CHECK-NEXT: and a0, a7, a6 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vfncvt.f.f.w v20, v8, v0.t +; CHECK-NEXT: vfncvt.f.f.w v20, v24, v0.t ; CHECK-NEXT: bltu a5, a1, .LBB8_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a5, a1 ; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v7, a3 -; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma -; CHECK-NEXT: vfncvt.f.f.w v16, v24, v0.t +; CHECK-NEXT: vfncvt.f.f.w v16, v8, v0.t ; CHECK-NEXT: bltu a2, a4, .LBB8_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a2, a4 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll index d92db0b5a3a7b..d5e65e2c8fd3f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll @@ -92,13 +92,13 @@ define @vfsqrt_nxv32bf16( %v) stric ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v16, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsqrt.v v16, v16 +; CHECK-NEXT: vfsqrt.v v16, v24 ; 
CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: ret @@ -229,13 +229,13 @@ define @vfsqrt_nxv32f16( %v) strictfp { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v16, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsqrt.v v16, v16 +; ZVFHMIN-NEXT: vfsqrt.v v16, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll index a51b0e4efecf2..4d761981aac97 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll @@ -87,13 +87,13 @@ define @vfsqrt_nxv32bf16( %v) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v16, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsqrt.v v16, v16 +; CHECK-NEXT: vfsqrt.v v16, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: ret @@ -224,13 +224,13 @@ define @vfsqrt_nxv32f16( %v) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v16, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsqrt.v v16, v16 +; ZVFHMIN-NEXT: vfsqrt.v v16, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll index 00542284ebaee..8edcf23988c7f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll @@ -170,16 +170,16 @@ define @vfsqrt_vv_nxv32bf16( %va, < ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v24, v24, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 @@ -202,19 +202,19 @@ define @vfsqrt_vv_nxv32bf16_unmasked( @vfsqrt_vv_nxv32f16( %va, @vfsqrt_vv_nxv32f16_unmasked( %v ; ZVFHMIN-LABEL: vfsqrt_vv_nxv32f16_unmasked: 
; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v16 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v16, a2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 @@ -752,12 +752,12 @@ define @vfsqrt_vv_nxv16f64( %va, @vfsub_vf_nxv16bf16( %va, bf define @vfsub_vv_nxv32bf16( %va, %vb) strictfp { ; CHECK-LABEL: vfsub_vv_nxv32bf16: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsub.vv v24, v0, v24 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v0, v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsub.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret entry: %vc = call @llvm.experimental.constrained.fsub.nxv32bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") @@ -231,22 +246,39 @@ entry: define @vfsub_vf_nxv32bf16( %va, bfloat %b) strictfp { ; CHECK-LABEL: vfsub_vf_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, 
ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsub.vv v24, v24, v0 +; CHECK-NEXT: vfsub.vv v0, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsub.vv v16, v16, v24 +; CHECK-NEXT: vfsub.vv v16, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -529,19 +561,34 @@ define @vfsub_vv_nxv32f16( %va, @llvm.experimental.constrained.fsub.nxv32f16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") @@ -557,22 +604,39 @@ define @vfsub_vf_nxv32f16( %va, half %b ; ; ZVFHMIN-LABEL: vfsub_vf_nxv32f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v24, v24, v0 +; ZVFHMIN-NEXT: vfsub.vv v0, v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24 +; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll index a2137eaa7a958..f80644777c72a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll @@ -204,19 +204,34 @@ define 
@vfsub_vf_nxv16bf16( %va, bf define @vfsub_vv_nxv32bf16( %va, %vb) { ; CHECK-LABEL: vfsub_vv_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsub.vv v24, v0, v24 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v0, v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsub.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vc = fsub %va, %vb ret %vc @@ -225,22 +240,39 @@ define @vfsub_vv_nxv32bf16( %va, @vfsub_vf_nxv32bf16( %va, bfloat %b) { ; CHECK-LABEL: vfsub_vf_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsub.vv v24, v24, v0 +; CHECK-NEXT: vfsub.vv v0, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsub.vv v16, v24, v0 +; CHECK-NEXT: vfsub.vv v16, v24, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -512,19 +544,34 @@ define 
@vfsub_vv_nxv32f16( %va, %va, %vb ret %vc @@ -539,22 +586,39 @@ define @vfsub_vf_nxv32f16( %va, half %b ; ; ZVFHMIN-LABEL: vfsub_vf_nxv32f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v24, v24, v0 +; ZVFHMIN-NEXT: vfsub.vv v0, v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v20 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v16, v24, v0 +; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll index 2d34bfff95c4c..25a80e66c4a52 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll @@ -375,20 +375,21 @@ define @vfsub_vv_nxv32bf16( %va, @vfsub_vv_nxv32bf16_unmasked( @vfsub_vf_nxv32bf16( %va, bf ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb -; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 +; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a1 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 3 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: slli a3, a1, 3 +; CHECK-NEXT: add a1, a3, a1 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a1, a2, 1 -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: sltu a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: csrr a4, vlenb ; CHECK-NEXT: slli a4, a4, 3 ; 
CHECK-NEXT: add a4, sp, a4 @@ -502,18 +505,18 @@ define @vfsub_vf_nxv32bf16( %va, bf ; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a4, a2, 3 -; CHECK-NEXT: add a2, a4, a2 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: sltu a2, a0, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a4, a3, 3 +; CHECK-NEXT: add a3, a4, a3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vfsub.vv v16, v8, v16, v0.t ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 @@ -565,31 +568,37 @@ define @vfsub_vf_nxv32bf16_unmasked( @vfsub_vv_nxv32f16( %va, @vfsub_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 @@ -1225,23 +1236,24 @@ define @vfsub_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb -; ZVFHMIN-NEXT: vmv8r.v v24, v8 +; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma ; 
ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a2, a1, 3 -; ZVFHMIN-NEXT: add a1, a2, a1 +; ZVFHMIN-NEXT: slli a3, a1, 3 +; ZVFHMIN-NEXT: add a1, a3, a1 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 @@ -1249,18 +1261,18 @@ define @vfsub_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a4, a2, 3 -; ZVFHMIN-NEXT: add a2, a4, a2 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a4, a3, 3 +; ZVFHMIN-NEXT: add a3, a4, a3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 @@ -1318,31 +1330,37 @@ define @vfsub_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v7 +; ZVFHMIN-NEXT: addi a3, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a1 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 ; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vmv4r.v v16, v8 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; 
ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a2 +; ZVFHMIN-NEXT: sltu a2, a0, a3 +; ZVFHMIN-NEXT: addi a2, a2, -1 +; ZVFHMIN-NEXT: and a2, a2, a3 +; ZVFHMIN-NEXT: csrr a3, vlenb +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: add a3, sp, a3 +; ZVFHMIN-NEXT: addi a3, a3, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 +; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfsub.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll index 0f76968485fb4..4265663c1feee 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll @@ -147,10 +147,10 @@ define @vsitofp_nxv32i1_nxv32bf16( %va) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.i v12, 0 -; CHECK-NEXT: vmerge.vim v8, v12, -1, v0 -; CHECK-NEXT: vfwcvt.f.x.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vmerge.vim v8, v12, -1, v0 ; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vfwcvt.f.x.v v16, v8 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma @@ -168,10 +168,10 @@ define @vuitofp_nxv32i1_nxv32bf16( %va) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.i v12, 0 -; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 -; CHECK-NEXT: vfwcvt.f.xu.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 ; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vfwcvt.f.xu.v v16, v8 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma @@ -339,12 +339,11 @@ define @vsitofp_nxv32i8_nxv32bf16( %va) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vsext.vf2 v12, v8 +; CHECK-NEXT: vsext.vf2 v16, v10 ; CHECK-NEXT: vfwcvt.f.x.v v24, v12 -; CHECK-NEXT: vfncvtbf16.f.f.w v16, v24 -; CHECK-NEXT: vsext.vf2 v12, v10 -; CHECK-NEXT: vfwcvt.f.x.v v24, v12 -; CHECK-NEXT: vfncvtbf16.f.f.w v20, v24 -; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvt.f.x.v v24, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %evec = sitofp %va to ret %evec @@ -355,12 +354,11 @@ define @vuitofp_nxv32i8_nxv32bf16( %va) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vzext.vf2 v12, v8 +; CHECK-NEXT: vzext.vf2 v16, v10 ; CHECK-NEXT: vfwcvt.f.xu.v v24, v12 -; CHECK-NEXT: vfncvtbf16.f.f.w v16, v24 -; CHECK-NEXT: vzext.vf2 v12, v10 -; CHECK-NEXT: vfwcvt.f.xu.v v24, v12 -; CHECK-NEXT: vfncvtbf16.f.f.w v20, v24 -; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvt.f.xu.v v24, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %evec = uitofp %va to ret %evec @@ -1157,10 +1155,10 @@ define @vsitofp_nxv32i1_nxv32f16( %va) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v12, 0 -; ZVFHMIN-NEXT: vmerge.vim v8, v12, -1, v0 -; ZVFHMIN-NEXT: vfwcvt.f.x.v v16, v8 ; ZVFHMIN-NEXT: csrr a0, vlenb +; 
ZVFHMIN-NEXT: vmerge.vim v8, v12, -1, v0 ; ZVFHMIN-NEXT: srli a0, a0, 2 +; ZVFHMIN-NEXT: vfwcvt.f.x.v v16, v8 ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma @@ -1186,10 +1184,10 @@ define @vuitofp_nxv32i1_nxv32f16( %va) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v12, 0 -; ZVFHMIN-NEXT: vmerge.vim v8, v12, 1, v0 -; ZVFHMIN-NEXT: vfwcvt.f.xu.v v16, v8 ; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: vmerge.vim v8, v12, 1, v0 ; ZVFHMIN-NEXT: srli a0, a0, 2 +; ZVFHMIN-NEXT: vfwcvt.f.xu.v v16, v8 ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma @@ -1648,12 +1646,11 @@ define @vsitofp_nxv32i8_nxv32f16( %va) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vsext.vf2 v12, v8 +; ZVFHMIN-NEXT: vsext.vf2 v16, v10 ; ZVFHMIN-NEXT: vfwcvt.f.x.v v24, v12 -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 -; ZVFHMIN-NEXT: vsext.vf2 v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.x.v v24, v12 -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v24 -; ZVFHMIN-NEXT: vmv8r.v v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfwcvt.f.x.v v24, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: ret %evec = sitofp %va to ret %evec @@ -1671,12 +1668,11 @@ define @vuitofp_nxv32i8_nxv32f16( %va) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vzext.vf2 v12, v8 +; ZVFHMIN-NEXT: vzext.vf2 v16, v10 ; ZVFHMIN-NEXT: vfwcvt.f.xu.v v24, v12 -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v24 -; ZVFHMIN-NEXT: vzext.vf2 v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.xu.v v24, v12 -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v24 -; ZVFHMIN-NEXT: vmv8r.v v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 +; ZVFHMIN-NEXT: vfwcvt.f.xu.v v24, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: ret %evec = uitofp %va to ret %evec diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll index 1a01a9bf77cff..318a5bc92a779 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll @@ -14,10 +14,10 @@ define <2 x i32> @vdot_lane_s32(<2 x i32> noundef %var_1, <8 x i8> noundef %var_ ; CHECK-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v11, 0 ; CHECK-NEXT: vnsrl.wi v9, v11, 16 +; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vwadd.vv v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v10, 0 -; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vnsrl.wx v9, v10, a0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll index 1f778dc1d5c88..0b553d3cd6fdf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll @@ -11,10 +11,10 @@ define @vmax_vx_nxv8i7( %a, i7 signext %b, @vmax_vx_nxv32i32( %va, i32 %b, @vmax_vx_nxv32i32_evl_nx8( %va, i3 ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: srli a3, a1, 2 ; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a3 ; CHECK-NEXT: sub a3, a1, a2 ; CHECK-NEXT: sltu a4, a1, a3 ; CHECK-NEXT: addi a4, a4, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll index a2b160b5a0ebb..f6be882f74206 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll @@ -10,11 +10,10 @@ define @vmaxu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vmaxu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vmaxu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret @@ -976,9 +975,9 @@ define @vmaxu_vx_nxv32i32( %va, i32 %b, < ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: slli a2, a2, 1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a3 -; CHECK-NEXT: slli a2, a2, 1 ; CHECK-NEXT: sub a3, a1, a2 ; CHECK-NEXT: sltu a4, a1, a3 ; CHECK-NEXT: addi a4, a4, -1 @@ -1035,10 +1034,10 @@ define @vmaxu_vx_nxv32i32_evl_nx8( %va, i ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: srli a3, a1, 2 ; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a3 ; CHECK-NEXT: sub a3, a1, a2 ; CHECK-NEXT: sltu a4, a1, a3 ; CHECK-NEXT: addi a4, a4, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll index b074888ffffa8..8690014cc2c9d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll @@ -11,10 +11,10 @@ define @vmin_vx_nxv8i7( %a, i7 signext %b, @vmin_vx_nxv32i32( %va, i32 %b, @vmin_vx_nxv32i32_evl_nx8( %va, i3 ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: srli a3, a1, 2 ; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a3 ; CHECK-NEXT: sub a3, a1, a2 ; CHECK-NEXT: sltu a4, a1, a3 ; CHECK-NEXT: addi a4, a4, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll index c7afd549d8e92..414807829d563 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll @@ -10,11 +10,10 @@ define @vminu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vminu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vminu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret @@ -976,9 +975,9 @@ define @vminu_vx_nxv32i32( %va, i32 %b, < ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: slli a2, a2, 1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a3 -; CHECK-NEXT: slli a2, a2, 1 ; CHECK-NEXT: sub a3, a1, a2 ; CHECK-NEXT: sltu a4, a1, a3 ; CHECK-NEXT: addi a4, a4, -1 @@ -1035,10 +1034,10 @@ define @vminu_vx_nxv32i32_evl_nx8( %va, i ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, 
v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: srli a3, a1, 2 ; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a3 ; CHECK-NEXT: sub a3, a1, a2 ; CHECK-NEXT: sltu a4, a1, a3 ; CHECK-NEXT: addi a4, a4, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll index ae3195c479ea9..3eb767f90f3bd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll @@ -897,8 +897,8 @@ define @vmul_xx_nxv8i64(i64 %a, i64 %b) nounwind { ; RV32M-NEXT: mul a4, a0, a2 ; RV32M-NEXT: mul a3, a0, a3 ; RV32M-NEXT: mulhu a0, a0, a2 -; RV32M-NEXT: add a0, a0, a3 ; RV32M-NEXT: mul a1, a1, a2 +; RV32M-NEXT: add a0, a0, a3 ; RV32M-NEXT: add a0, a0, a1 ; RV32M-NEXT: sw a4, 8(sp) ; RV32M-NEXT: sw a0, 12(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll index 253cfb040308b..9ab293faf87d1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll @@ -8,9 +8,9 @@ define @srem_eq_fold_nxv4i8( %va) { ; CHECK-LABEL: srem_eq_fold_nxv4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 42 -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: li a1, -85 +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vmacc.vx v9, a1, v8 ; CHECK-NEXT: vsll.vi v8, v9, 7 ; CHECK-NEXT: vsrl.vi v9, v9, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll index 07661c5764045..4629db26ca034 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll @@ -349,11 +349,11 @@ define @intrinsic_vmv.s.x_x_nxv1i64_bug( %0 ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: lw a1, 0(a0) ; RV32-NEXT: lw a0, 4(a0) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vid.v v9 ; RV32-NEXT: vmseq.vi v0, v9, 0 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t ; RV32-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll index e840036c6a3da..507f5154cf1ac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll @@ -446,9 +446,10 @@ define @test_vp_reverse_nxv64i8_masked( %sr ; CHECK-LABEL: test_vp_reverse_nxv64i8_masked: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: addi a2, a1, -1 -; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: vrsub.vx v24, v16, a2 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v23, v8, v24 @@ -459,7 +460,6 @@ define @test_vp_reverse_nxv64i8_masked( %sr ; CHECK-NEXT: vrgatherei16.vv v18, v13, v24 ; CHECK-NEXT: vrgatherei16.vv v17, v14, v24 ; CHECK-NEXT: vrgatherei16.vv v16, v15, v24 -; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub a1, a1, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v16, a1, v0.t @@ -472,9 +472,10 @@ define @test_vp_reverse_nxv64i8( %src, i32 ; CHECK-LABEL: test_vp_reverse_nxv64i8: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb 
-; CHECK-NEXT: addi a2, a1, -1 -; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: vrsub.vx v24, v16, a2 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v23, v8, v24 @@ -485,7 +486,6 @@ define @test_vp_reverse_nxv64i8( %src, i32 ; CHECK-NEXT: vrgatherei16.vv v18, v13, v24 ; CHECK-NEXT: vrgatherei16.vv v17, v14, v24 ; CHECK-NEXT: vrgatherei16.vv v16, v15, v24 -; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub a1, a1, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v16, a1 @@ -498,12 +498,12 @@ define @test_vp_reverse_nxv64i8( %src, i32 define @test_vp_reverse_nxv128i8( %src, i32 zeroext %evl) { ; CHECK-LABEL: test_vp_reverse_nxv128i8: ; CHECK: # %bb.0: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: bltu a0, a1, .LBB32_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB32_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: addi sp, sp, -80 ; CHECK-NEXT: .cfi_def_cfa_offset 80 @@ -518,21 +518,21 @@ define @test_vp_reverse_nxv128i8( %src, i ; CHECK-NEXT: sub sp, sp, a3 ; CHECK-NEXT: andi sp, sp, -64 ; CHECK-NEXT: addi a3, sp, 64 -; CHECK-NEXT: add a4, a0, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: li a5, -1 -; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vsse8.v v8, (a4), a5 -; CHECK-NEXT: sub a4, a4, a2 -; CHECK-NEXT: sub a6, a0, a1 -; CHECK-NEXT: sltu a0, a0, a6 +; CHECK-NEXT: li a4, -1 +; CHECK-NEXT: sub a5, a0, a2 +; CHECK-NEXT: add a6, a0, a3 +; CHECK-NEXT: sltu a0, a0, a5 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: addi a6, a6, -1 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a6 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a6), a4 +; CHECK-NEXT: sub a6, a6, a1 +; CHECK-NEXT: and a0, a0, a5 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vsse8.v v16, (a4), a5 -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vle8.v v16, (a1) -; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsse8.v v16, (a6), a4 +; CHECK-NEXT: vle8.v v16, (a2) +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v8, (a3) ; CHECK-NEXT: addi sp, s0, -80 ; CHECK-NEXT: .cfi_def_cfa sp, 80 diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll index 7f81b99eb0338..09d92c3c039f9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll @@ -7,11 +7,11 @@ define <2 x i1> @test_vp_reverse_v2i1_masked(<2 x i1> %src, <2 x i1> %mask, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vid.v v10, v0.t -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v10, v10, a0, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-NEXT: vrgatherei16.vv v11, v9, v10, v0.t @@ -27,9 +27,11 @@ define <2 x i1> @test_vp_reverse_v2i1(<2 x i1> %src, i32 zeroext %evl) { ; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx 
v8, v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vrgatherei16.vv v10, v9, v8 ; CHECK-NEXT: vmsne.vi v0, v10, 0 @@ -44,11 +46,11 @@ define <4 x i1> @test_vp_reverse_v4i1_masked(<4 x i1> %src, <4 x i1> %mask, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vid.v v10, v0.t -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v10, v10, a0, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vrgatherei16.vv v11, v9, v10, v0.t @@ -64,9 +66,11 @@ define <4 x i1> @test_vp_reverse_v4i1(<4 x i1> %src, i32 zeroext %evl) { ; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vrgatherei16.vv v10, v9, v8 ; CHECK-NEXT: vmsne.vi v0, v10, 0 @@ -81,11 +85,11 @@ define <8 x i1> @test_vp_reverse_v8i1_masked(<8 x i1> %src, <8 x i1> %mask, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v10, v0.t -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v10, v10, a0, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vrgatherei16.vv v11, v9, v10, v0.t @@ -101,9 +105,11 @@ define <8 x i1> @test_vp_reverse_v8i1(<8 x i1> %src, i32 zeroext %evl) { ; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vrgatherei16.vv v10, v9, v8 ; CHECK-NEXT: vmsne.vi v0, v10, 0 @@ -118,11 +124,11 @@ define <16 x i1> @test_vp_reverse_v16i1_masked(<16 x i1> %src, <16 x i1> %mask, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v10, v0.t -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v10, v10, a0, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v12, v9, v10, v0.t @@ -138,9 +144,11 @@ define <16 x i1> @test_vp_reverse_v16i1(<16 x i1> %src, i32 zeroext %evl) { ; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; 
CHECK-NEXT: vmerge.vim v10, v10, 1, v0 ; CHECK-NEXT: vrgatherei16.vv v11, v10, v8 ; CHECK-NEXT: vmsne.vi v0, v11, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll index 8c1be2c1e9791..8e44d76e7010f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll @@ -6,11 +6,11 @@ define @test_vp_reverse_nxv1i1_masked( %src, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vid.v v10, v0.t -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v10, v10, a0, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-NEXT: vrgatherei16.vv v11, v9, v10, v0.t @@ -26,9 +26,11 @@ define @test_vp_reverse_nxv1i1( %src, i32 zer ; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vrgatherei16.vv v10, v9, v8 ; CHECK-NEXT: vmsne.vi v0, v10, 0 @@ -43,11 +45,11 @@ define @test_vp_reverse_nxv2i1_masked( %src, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vid.v v10, v0.t -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v10, v10, a0, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vrgatherei16.vv v11, v9, v10, v0.t @@ -63,9 +65,11 @@ define @test_vp_reverse_nxv2i1( %src, i32 zer ; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vrgatherei16.vv v10, v9, v8 ; CHECK-NEXT: vmsne.vi v0, v10, 0 @@ -80,11 +84,11 @@ define @test_vp_reverse_nxv4i1_masked( %src, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v10, v0.t -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v10, v10, a0, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vrgatherei16.vv v11, v9, v10, v0.t @@ -100,9 +104,11 @@ define @test_vp_reverse_nxv4i1( %src, i32 zer ; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vrgatherei16.vv v10, v9, v8 ; CHECK-NEXT: vmsne.vi v0, v10, 0 @@ -117,11 +123,11 @@ define 
@test_vp_reverse_nxv8i1_masked( %src, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v10, v0.t -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v10, v10, a0, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v12, v9, v10, v0.t @@ -137,9 +143,11 @@ define @test_vp_reverse_nxv8i1( %src, i32 zer ; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 ; CHECK-NEXT: vrgatherei16.vv v11, v10, v8 ; CHECK-NEXT: vmsne.vi v0, v11, 0 @@ -154,11 +162,11 @@ define @test_vp_reverse_nxv16i1_masked( %sr ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vid.v v12, v0.t -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v12, v12, a0, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-NEXT: vrgatherei16.vv v16, v10, v12, v0.t @@ -175,9 +183,11 @@ define @test_vp_reverse_nxv16i1( %src, i32 ; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 ; CHECK-NEXT: vrgatherei16.vv v14, v12, v8 ; CHECK-NEXT: vmsne.vi v0, v14, 0 @@ -192,11 +202,11 @@ define @test_vp_reverse_nxv32i1_masked( %sr ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vid.v v16, v0.t -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v16, v16, a0, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma ; CHECK-NEXT: vrgatherei16.vv v24, v12, v16, v0.t @@ -213,9 +223,11 @@ define @test_vp_reverse_nxv32i1( %src, i32 ; CHECK-NEXT: addi a1, a0, -1 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v8, v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma ; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 ; CHECK-NEXT: vrgatherei16.vv v20, v16, v8 ; CHECK-NEXT: vmsne.vi v0, v20, 0 @@ -230,26 +242,28 @@ define @test_vp_reverse_nxv64i1_masked( %sr ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v16, 0 -; CHECK-NEXT: vmerge.vim v24, v16, 1, v0 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 ; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: slli a1, 
a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v10, v10, a2 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v23, v24, v10 -; CHECK-NEXT: vrgatherei16.vv v22, v25, v10 -; CHECK-NEXT: vrgatherei16.vv v21, v26, v10 -; CHECK-NEXT: vrgatherei16.vv v20, v27, v10 -; CHECK-NEXT: vrgatherei16.vv v19, v28, v10 -; CHECK-NEXT: vrgatherei16.vv v18, v29, v10 -; CHECK-NEXT: vrgatherei16.vv v17, v30, v10 -; CHECK-NEXT: vrgatherei16.vv v16, v31, v10 -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: vrgatherei16.vv v31, v16, v10 +; CHECK-NEXT: vrgatherei16.vv v30, v17, v10 +; CHECK-NEXT: vrgatherei16.vv v29, v18, v10 +; CHECK-NEXT: vrgatherei16.vv v28, v19, v10 +; CHECK-NEXT: vrgatherei16.vv v27, v20, v10 +; CHECK-NEXT: vrgatherei16.vv v26, v21, v10 +; CHECK-NEXT: vrgatherei16.vv v25, v22, v10 +; CHECK-NEXT: vrgatherei16.vv v24, v23, v10 ; CHECK-NEXT: sub a1, a1, a0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v16, a1, v0.t +; CHECK-NEXT: vslidedown.vx v16, v24, a1, v0.t ; CHECK-NEXT: vmsne.vi v8, v16, 0, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret @@ -262,25 +276,27 @@ define @test_vp_reverse_nxv64i1( %src, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v24, v8, a2 +; CHECK-NEXT: vrsub.vx v24, v16, a2 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v15, v16, v24 -; CHECK-NEXT: vrgatherei16.vv v14, v17, v24 -; CHECK-NEXT: vrgatherei16.vv v13, v18, v24 -; CHECK-NEXT: vrgatherei16.vv v12, v19, v24 -; CHECK-NEXT: vrgatherei16.vv v11, v20, v24 -; CHECK-NEXT: vrgatherei16.vv v10, v21, v24 -; CHECK-NEXT: vrgatherei16.vv v9, v22, v24 -; CHECK-NEXT: vrgatherei16.vv v8, v23, v24 -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: vrgatherei16.vv v23, v8, v24 +; CHECK-NEXT: vrgatherei16.vv v22, v9, v24 +; CHECK-NEXT: vrgatherei16.vv v21, v10, v24 +; CHECK-NEXT: vrgatherei16.vv v20, v11, v24 +; CHECK-NEXT: vrgatherei16.vv v19, v12, v24 +; CHECK-NEXT: vrgatherei16.vv v18, v13, v24 +; CHECK-NEXT: vrgatherei16.vv v17, v14, v24 +; CHECK-NEXT: vrgatherei16.vv v16, v15, v24 ; CHECK-NEXT: sub a1, a1, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v8, v16, a1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll index 9496cd82947d4..2a137099bcb0f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll @@ -11,10 +11,10 @@ define <2 x i1> @test_vp_splice_v2i1(<2 x i1> %va, <2 x i1> %vb, i32 zeroext %ev ; CHECK-LABEL: test_vp_splice_v2i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 
1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v9 @@ -35,10 +35,10 @@ define <2 x i1> @test_vp_splice_v2i1_negative_offset(<2 x i1> %va, <2 x i1> %vb, ; CHECK-LABEL: test_vp_splice_v2i1_negative_offset: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v9 @@ -59,10 +59,10 @@ define <2 x i1> @test_vp_splice_v2i1_masked(<2 x i1> %va, <2 x i1> %vb, <2 x i1> ; CHECK-LABEL: test_vp_splice_v2i1_masked: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vmv1r.v v0, v10 @@ -84,10 +84,10 @@ define <4 x i1> @test_vp_splice_v4i1(<4 x i1> %va, <4 x i1> %vb, i32 zeroext %ev ; CHECK-LABEL: test_vp_splice_v4i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v9 @@ -108,10 +108,10 @@ define <4 x i1> @test_vp_splice_v4i1_negative_offset(<4 x i1> %va, <4 x i1> %vb, ; CHECK-LABEL: test_vp_splice_v4i1_negative_offset: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v9 @@ -132,10 +132,10 @@ define <4 x i1> @test_vp_splice_v4i1_masked(<4 x i1> %va, <4 x i1> %vb, <4 x i1> ; CHECK-LABEL: test_vp_splice_v4i1_masked: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vmv1r.v v0, v10 @@ -157,10 +157,10 @@ define <8 x i1> @test_vp_splice_v8i1(<8 x i1> %va, <8 x i1> %vb, i32 zeroext %ev ; CHECK-LABEL: test_vp_splice_v8i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v9 @@ -181,10 +181,10 @@ define <8 x i1> 
@test_vp_splice_v8i1_negative_offset(<8 x i1> %va, <8 x i1> %vb, ; CHECK-LABEL: test_vp_splice_v8i1_negative_offset: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v9 @@ -205,10 +205,10 @@ define <8 x i1> @test_vp_splice_v8i1_masked(<8 x i1> %va, <8 x i1> %vb, <8 x i1> ; CHECK-LABEL: test_vp_splice_v8i1_masked: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vmv1r.v v0, v10 @@ -230,10 +230,10 @@ define <16 x i1> @test_vp_splice_v16i1(<16 x i1> %va, <16 x i1> %vb, i32 zeroext ; CHECK-LABEL: test_vp_splice_v16i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v9 @@ -254,10 +254,10 @@ define <16 x i1> @test_vp_splice_v16i1_negative_offset(<16 x i1> %va, <16 x i1> ; CHECK-LABEL: test_vp_splice_v16i1_negative_offset: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v9 @@ -278,10 +278,10 @@ define <16 x i1> @test_vp_splice_v16i1_masked(<16 x i1> %va, <16 x i1> %vb, <16 ; CHECK-LABEL: test_vp_splice_v16i1_masked: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vmv1r.v v0, v10 diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll index 9027630825227..fc446d0a3a88a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll @@ -14,10 +14,10 @@ define @test_vp_splice_nxv1i1( %va, @test_vp_splice_nxv1i1_negative_offset( @test_vp_splice_nxv1i1_masked( %va, @test_vp_splice_nxv2i1( %va, @test_vp_splice_nxv2i1_negative_offset( @test_vp_splice_nxv2i1_masked( %va, @test_vp_splice_nxv4i1( %va, @test_vp_splice_nxv4i1_negative_offset( @test_vp_splice_nxv4i1_masked( %va, @test_vp_splice_nxv8i1( %va, @test_vp_splice_nxv8i1_negative_offset( @test_vp_splice_nxv8i1_masked( %va, 
@test_vp_splice_nxv16i1( %va, @test_vp_splice_nxv16i1_negative_offset( @test_vp_splice_nxv16i1_masked( %va, ; CHECK-LABEL: test_vp_splice_nxv16i1_masked: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v12, 0 -; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v14, 0 @@ -380,9 +380,9 @@ define @test_vp_splice_nxv32i1( %va, @test_vp_splice_nxv32i1_negative_offset( @test_vp_splice_nxv32i1_masked( %va, ; CHECK-LABEL: test_vp_splice_nxv32i1_masked: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.i v12, 0 -; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.i v16, 0 @@ -454,9 +454,9 @@ define @test_vp_splice_nxv64i1( %va, @test_vp_splice_nxv64i1_negative_offset( @test_vp_splice_nxv64i1_masked( %va, ; CHECK-LABEL: test_vp_splice_nxv64i1_masked: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v16, 0 -; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v24, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll index 34f8f35ee98c0..3e423c8ec9903 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll @@ -261,16 +261,16 @@ define @vpgather_baseidx_nxv32i8(ptr %base, @vpgather_baseidx_nxv32i8(ptr %base, @vpgather_baseidx_nxv32i8(ptr %base, @vpgather_nxv16f64( %ptrs, @vpgather_nxv16f64( %ptrs, @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base ; RV32-NEXT: vmv1r.v v12, v0 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vsext.vf2 v16, v8 -; RV32-NEXT: vsll.vi v24, v16, 3 ; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vsll.vi v24, v16, 3 ; RV32-NEXT: sub a3, a1, a2 ; RV32-NEXT: srli a4, a2, 3 ; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, ma @@ -2592,8 +2592,8 @@ define @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base ; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf4 v16, v10 -; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: sub a3, a1, a2 ; RV64-NEXT: srli a4, a2, 3 ; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, ma @@ -2626,8 +2626,8 @@ define @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base ; RV32-NEXT: vmv1r.v v12, v0 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vzext.vf2 v16, v8 -; RV32-NEXT: vsll.vi v24, v16, 3 ; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vsll.vi v24, v16, 3 ; RV32-NEXT: sub a3, a1, a2 ; RV32-NEXT: srli a4, a2, 3 ; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, ma @@ -2651,8 +2651,8 @@ define @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base ; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV64-NEXT: vzext.vf2 v16, v8 -; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: sub a3, a1, a2 ; RV64-NEXT: srli a4, a2, 3 ; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll 
b/llvm/test/CodeGen/RISCV/rvv/vpload.ll index 5683a7b758854..8dfab72d008c2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll @@ -513,13 +513,13 @@ define @vpload_nxv16f64(ptr %ptr, %m, ; CHECK-NEXT: vmv1r.v v8, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: sub a3, a1, a2 -; CHECK-NEXT: sltu a4, a1, a3 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: slli a4, a2, 3 ; CHECK-NEXT: srli a5, a2, 3 ; CHECK-NEXT: vsetvli a6, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a5 +; CHECK-NEXT: sltu a5, a1, a3 +; CHECK-NEXT: addi a5, a5, -1 +; CHECK-NEXT: and a3, a5, a3 ; CHECK-NEXT: add a4, a0, a4 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a4), v0.t @@ -549,26 +549,26 @@ declare @llvm.vector.extract.nxv16f64( @vpload_nxv17f64(ptr %ptr, ptr %out, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpload_nxv17f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: slli a5, a3, 1 -; CHECK-NEXT: vmv1r.v v8, v0 ; CHECK-NEXT: mv a4, a2 ; CHECK-NEXT: bltu a2, a5, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a5 ; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: sub a6, a4, a3 -; CHECK-NEXT: sltu a7, a4, a6 -; CHECK-NEXT: addi a7, a7, -1 -; CHECK-NEXT: and a6, a7, a6 ; CHECK-NEXT: slli a7, a3, 3 ; CHECK-NEXT: srli t0, a3, 3 +; CHECK-NEXT: sub a5, a2, a5 ; CHECK-NEXT: vsetvli t1, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v8, t0 +; CHECK-NEXT: sltu t0, a4, a6 ; CHECK-NEXT: add a7, a0, a7 +; CHECK-NEXT: addi t0, t0, -1 +; CHECK-NEXT: and a6, t0, a6 ; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a7), v0.t -; CHECK-NEXT: sub a5, a2, a5 ; CHECK-NEXT: sltu a2, a2, a5 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a5 diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll index 3e3276788cd53..f029d0b1b01bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll @@ -366,16 +366,16 @@ define @vpmerge_vv_nxv128i8( %va, %val, ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t ; RV32-NEXT: sub a2, a1, a0 +; RV32-NEXT: srli a0, a0, 3 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: srli a0, a0, 3 ; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vx v0, v0, a0 ; RV32-NEXT: and a1, a1, a2 @@ -2323,12 +2323,12 @@ define void @vpscatter_nxv16f64( %val, ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a3, a1, 3 ; RV64-NEXT: add a3, a0, a3 -; RV64-NEXT: vl8re64.v v24, (a3) -; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vl8re64.v v16, (a3) ; RV64-NEXT: vl8re64.v v24, (a0) ; RV64-NEXT: mv a0, a2 ; RV64-NEXT: bltu a2, a1, .LBB108_2 @@ -2338,16 +2338,16 @@ define void @vpscatter_nxv16f64( %val, ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t ; RV64-NEXT: sub a0, a2, a1 +; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: sltu a2, a2, a0 ; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: and a0, 
a2, a0 ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v16, (zero), v8, v0.t +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add sp, sp, a0 @@ -2362,27 +2362,41 @@ define void @vpscatter_nxv16f64( %val, define void @vpscatter_baseidx_nxv16i16_nxv16f64( %val, ptr %base, %idxs, %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64: ; RV32: # %bb.0: -; RV32-NEXT: vl4re16.v v4, (a1) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: sub sp, sp, a3 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl4re16.v v24, (a1) ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vsext.vf2 v24, v4 +; RV32-NEXT: vsext.vf2 v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsll.vi v24, v0, 3 ; RV32-NEXT: mv a3, a2 ; RV32-NEXT: bltu a2, a1, .LBB109_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a3, a1 ; RV32-NEXT: .LBB109_2: +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: sub a3, a2, a1 +; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: sltu a2, a2, a3 ; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vx v0, v0, a1 ; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64: @@ -2393,19 +2407,19 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64( %val, pt ; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: sub sp, sp, a3 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vl4re16.v v24, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf4 v16, v26 ; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vsext.vf4 v16, v24 -; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: mv a3, a2 ; RV64-NEXT: bltu a2, a1, .LBB109_2 @@ -2415,9 +2429,9 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64( %val, pt ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t ; RV64-NEXT: sub a3, a2, a1 +; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: sltu a2, a2, a3 ; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: vsetvli a4, 
zero, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: and a2, a2, a3 @@ -2445,27 +2459,41 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64( %val, pt define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64( %val, ptr %base, %idxs, %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_sext_nxv16i16_nxv16f64: ; RV32: # %bb.0: -; RV32-NEXT: vl4re16.v v4, (a1) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: sub sp, sp, a3 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl4re16.v v24, (a1) ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vsext.vf2 v24, v4 +; RV32-NEXT: vsext.vf2 v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsll.vi v24, v0, 3 ; RV32-NEXT: mv a3, a2 ; RV32-NEXT: bltu a2, a1, .LBB110_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a3, a1 ; RV32-NEXT: .LBB110_2: +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: sub a3, a2, a1 +; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: sltu a2, a2, a3 ; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vx v0, v0, a1 ; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_sext_nxv16i16_nxv16f64: @@ -2473,48 +2501,48 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64( %va ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a4, a3, 3 -; RV64-NEXT: add a3, a4, a3 +; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vl4re16.v v24, (a1) -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vsext.vf4 v0, v24 +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf4 v16, v26 ; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: vsll.vi v24, v0, 3 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vsext.vf4 v16, v24 +; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: mv a3, a2 ; RV64-NEXT: bltu a2, a1, .LBB110_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a3, a1 ; RV64-NEXT: .LBB110_2: -; RV64-NEXT: addi a4, sp, 16 -; RV64-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; 
RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t ; RV64-NEXT: sub a3, a2, a1 +; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: sltu a2, a2, a3 ; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: and a2, a2, a3 ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a1, a0, 3 -; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 @@ -2529,52 +2557,80 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64( %va define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64( %val, ptr %base, %idxs, %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64: ; RV32: # %bb.0: -; RV32-NEXT: vl4re16.v v4, (a1) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: sub sp, sp, a3 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl4re16.v v24, (a1) ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vzext.vf2 v24, v4 +; RV32-NEXT: vzext.vf2 v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsll.vi v24, v0, 3 ; RV32-NEXT: mv a3, a2 ; RV32-NEXT: bltu a2, a1, .LBB111_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a3, a1 ; RV32-NEXT: .LBB111_2: +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: sub a3, a2, a1 +; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: sltu a2, a2, a3 ; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vx v0, v0, a1 ; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64: ; RV64: # %bb.0: -; RV64-NEXT: vl4re16.v v4, (a1) +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: sub sp, sp, a3 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vl4re16.v v24, (a1) ; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV64-NEXT: vzext.vf2 v24, v4 +; RV64-NEXT: vzext.vf2 v0, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsll.vi v24, v0, 3 ; RV64-NEXT: mv a3, a2 ; RV64-NEXT: bltu a2, a1, .LBB111_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a3, a1 ; RV64-NEXT: .LBB111_2: +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV64-NEXT: 
vsoxei32.v v8, (a0), v24, v0.t ; RV64-NEXT: sub a3, a2, a1 +; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: sltu a2, a2, a3 ; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: and a2, a2, a3 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV64-NEXT: vsoxei32.v v16, (a0), v28, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %eidxs = zext %idxs to %ptrs = getelementptr inbounds double, ptr %base, %eidxs diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll index 6d8574c11d1dd..549f57a01f38f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll @@ -427,15 +427,15 @@ define void @vpstore_nxv16f64( %val, ptr %ptr, , ptr, %val, ptr %ptr, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpstore_nxv17f64: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: slli a4, a3, 1 +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: sub sp, sp, a3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a4, a3, 1 ; CHECK-NEXT: mv a5, a2 ; CHECK-NEXT: bltu a2, a4, .LBB35_2 ; CHECK-NEXT: # %bb.1: @@ -463,33 +471,27 @@ define void @vpstore_nxv17f64( %val, ptr %ptr, %val, ptr %ptr, %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv10f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: li a1, 10 +; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v12, fa0 @@ -963,11 +963,11 @@ define half @vreduce_fadd_nxv3f16( %v, half %s) { ; CHECK-LABEL: vreduce_fadd_nxv3f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: slli a1, a0, 1 ; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: lui a1, 1048568 ; CHECK-NEXT: vmv.s.x v10, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma @@ -982,11 +982,11 @@ define half @vreduce_fadd_nxv6f16( %v, half %s) { ; CHECK-LABEL: vreduce_fadd_nxv6f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: srli a1, a0, 3 ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a0, a0, a1 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: lui a1, 1048568 ; CHECK-NEXT: vmv.s.x v11, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll index 13d1ac5088479..a20f88c697716 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll @@ -95,9 +95,9 @@ define half @vpreduce_fadd_nxv64f16(half %s, %v, %v, %val, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; 
CHECK-NEXT: vfredmin.vs v10, v8, v10, v0.t ; CHECK-NEXT: vmfne.vv v11, v8, v8, v0.t ; CHECK-NEXT: vcpop.m a0, v11, v0.t -; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: xori a1, a1, 1 ; CHECK-NEXT: or a0, a0, a1 ; CHECK-NEXT: beqz a0, .LBB22_2 @@ -373,11 +373,11 @@ define float @vreduce_fmaximum_nxv4f32(float %start, %val, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vfredmax.vs v10, v8, v10, v0.t ; CHECK-NEXT: vmfne.vv v11, v8, v8, v0.t ; CHECK-NEXT: vcpop.m a0, v11, v0.t -; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: xori a1, a1, 1 ; CHECK-NEXT: or a0, a0, a1 ; CHECK-NEXT: beqz a0, .LBB23_2 @@ -423,11 +423,11 @@ define float @vreduce_fminimum_v4f32(float %start, <4 x float> %val, <4 x i1> %m ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfredmin.vs v9, v8, v9, v0.t ; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t ; CHECK-NEXT: vcpop.m a0, v8, v0.t -; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: xori a1, a1, 1 ; CHECK-NEXT: or a0, a0, a1 ; CHECK-NEXT: beqz a0, .LBB26_2 @@ -447,11 +447,11 @@ define float @vreduce_fmaximum_v4f32(float %start, <4 x float> %val, <4 x i1> %m ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfredmax.vs v9, v8, v9, v0.t ; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t ; CHECK-NEXT: vcpop.m a0, v8, v0.t -; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: xori a1, a1, 1 ; CHECK-NEXT: or a0, a0, a1 ; CHECK-NEXT: beqz a0, .LBB27_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll index 6222d7f4ae77f..eacfce098bddb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll @@ -1100,9 +1100,9 @@ define signext i32 @vpreduce_umax_nxv32i32(i32 signext %s, % ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: srli a2, a3, 2 +; CHECK-NEXT: slli a3, a3, 1 ; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v24, v0, a2 -; CHECK-NEXT: slli a3, a3, 1 ; CHECK-NEXT: sub a2, a1, a3 ; CHECK-NEXT: sltu a4, a1, a2 ; CHECK-NEXT: addi a4, a4, -1 @@ -1226,10 +1226,10 @@ define signext i64 @vpreduce_add_nxv1i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredsum.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1259,11 +1259,11 @@ define signext i64 @vpwreduce_add_nxv1i32(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e32, mf2, ta, ma ; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1294,11 +1294,11 @@ define signext i64 @vpwreduce_uadd_nxv1i32(i64 signext %s, %v ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: 
vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e32, mf2, ta, ma ; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1331,10 +1331,10 @@ define signext i64 @vpreduce_umax_nxv1i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1366,10 +1366,10 @@ define signext i64 @vpreduce_smax_nxv1i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredmax.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1401,10 +1401,10 @@ define signext i64 @vpreduce_umin_nxv1i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1436,10 +1436,10 @@ define signext i64 @vpreduce_smin_nxv1i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredmin.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1471,10 +1471,10 @@ define signext i64 @vpreduce_and_nxv1i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredand.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1506,10 +1506,10 @@ define signext i64 @vpreduce_or_nxv1i64(i64 signext %s, %v, < ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredor.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1541,10 +1541,10 @@ define signext i64 @vpreduce_xor_nxv1i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma ; RV32-NEXT: vredxor.vs v9, v8, v9, v0.t ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; 
RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1576,10 +1576,10 @@ define signext i64 @vpreduce_add_nxv2i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredsum.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1609,11 +1609,11 @@ define signext i64 @vwpreduce_add_nxv2i32(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1644,11 +1644,11 @@ define signext i64 @vwpreduce_uadd_nxv2i32(i64 signext %s, %v ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1681,10 +1681,10 @@ define signext i64 @vpreduce_umax_nxv2i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredmaxu.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1716,10 +1716,10 @@ define signext i64 @vpreduce_smax_nxv2i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredmax.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1751,10 +1751,10 @@ define signext i64 @vpreduce_umin_nxv2i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredminu.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1786,10 +1786,10 @@ define signext i64 @vpreduce_smin_nxv2i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredmin.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1821,10 +1821,10 @@ define signext i64 @vpreduce_and_nxv2i64(i64 signext %s, 
%v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredand.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1856,10 +1856,10 @@ define signext i64 @vpreduce_or_nxv2i64(i64 signext %s, %v, < ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredor.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1891,10 +1891,10 @@ define signext i64 @vpreduce_xor_nxv2i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma ; RV32-NEXT: vredxor.vs v10, v8, v10, v0.t ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1926,10 +1926,10 @@ define signext i64 @vpreduce_add_nxv4i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma ; RV32-NEXT: vredsum.vs v12, v8, v12, v0.t ; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v12, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1959,11 +1959,11 @@ define signext i64 @vpwreduce_add_nxv4i32(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma ; RV32-NEXT: vwredsum.vs v10, v8, v10, v0.t ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1994,11 +1994,11 @@ define signext i64 @vpwreduce_uadd_nxv4i32(i64 signext %s, %v ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma ; RV32-NEXT: vwredsumu.vs v10, v8, v10, v0.t ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -2031,10 +2031,10 @@ define signext i64 @vpreduce_umax_nxv4i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma ; RV32-NEXT: vredmaxu.vs v12, v8, v12, v0.t ; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v12, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2066,10 +2066,10 @@ define signext i64 @vpreduce_smax_nxv4i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, 
m4, ta, ma ; RV32-NEXT: vredmax.vs v12, v8, v12, v0.t ; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v12, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2101,10 +2101,10 @@ define signext i64 @vpreduce_umin_nxv4i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma ; RV32-NEXT: vredminu.vs v12, v8, v12, v0.t ; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v12, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2136,10 +2136,10 @@ define signext i64 @vpreduce_smin_nxv4i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma ; RV32-NEXT: vredmin.vs v12, v8, v12, v0.t ; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v12, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2171,10 +2171,10 @@ define signext i64 @vpreduce_and_nxv4i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma ; RV32-NEXT: vredand.vs v12, v8, v12, v0.t ; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v12, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2206,10 +2206,10 @@ define signext i64 @vpreduce_or_nxv4i64(i64 signext %s, %v, < ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma ; RV32-NEXT: vredor.vs v12, v8, v12, v0.t ; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v12, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -2241,10 +2241,10 @@ define signext i64 @vpreduce_xor_nxv4i64(i64 signext %s, %v, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma ; RV32-NEXT: vredxor.vs v12, v8, v12, v0.t ; RV32-NEXT: vmv.x.s a0, v12 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v12, a1 ; RV32-NEXT: vmv.x.s a1, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll index bcab7d05e698e..fac5e31ecf94e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll @@ -1145,9 +1145,9 @@ define i64 @vreduce_add_nxv1i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredsum.vs v8, v8, v9 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1169,11 +1169,11 @@ define i64 @vwreduce_add_nxv1i32( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; RV32-NEXT: vwredsum.vs v8, v8, v9 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 
-; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1197,11 +1197,11 @@ define i64 @vwreduce_uadd_nxv1i32( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; RV32-NEXT: vwredsumu.vs v8, v8, v9 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1371,9 +1371,9 @@ define i64 @vreduce_xor_nxv1i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredxor.vs v8, v8, v9 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1397,9 +1397,9 @@ define i64 @vreduce_add_nxv2i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredsum.vs v8, v8, v10 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1421,11 +1421,11 @@ define i64 @vwreduce_add_nxv2i32( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; RV32-NEXT: vwredsum.vs v8, v8, v9 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1449,11 +1449,11 @@ define i64 @vwreduce_uadd_nxv2i32( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; RV32-NEXT: vwredsumu.vs v8, v8, v9 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1479,8 +1479,8 @@ define i64 @vreduce_umax_nxv2i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32-NEXT: vredmaxu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1503,8 +1503,8 @@ define i64 @vreduce_smax_nxv2i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32-NEXT: vredmax.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1527,8 +1527,8 @@ define i64 @vreduce_umin_nxv2i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32-NEXT: vredminu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1551,8 +1551,8 @@ define i64 @vreduce_smin_nxv2i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32-NEXT: vredmin.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; 
RV32-NEXT: vmv.x.s a1, v8 @@ -1575,8 +1575,8 @@ define i64 @vreduce_and_nxv2i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32-NEXT: vredand.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1599,8 +1599,8 @@ define i64 @vreduce_or_nxv2i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32-NEXT: vredor.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1623,9 +1623,9 @@ define i64 @vreduce_xor_nxv2i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredxor.vs v8, v8, v10 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1649,9 +1649,9 @@ define i64 @vreduce_add_nxv4i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vmv.s.x v12, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredsum.vs v8, v8, v12 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1673,11 +1673,11 @@ define i64 @vwreduce_add_nxv4i32( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; RV32-NEXT: vwredsum.vs v8, v8, v10 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1701,11 +1701,11 @@ define i64 @vwreduce_uadd_nxv4i32( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; RV32-NEXT: vwredsumu.vs v8, v8, v10 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret @@ -1731,8 +1731,8 @@ define i64 @vreduce_umax_nxv4i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vredmaxu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1755,8 +1755,8 @@ define i64 @vreduce_smax_nxv4i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vredmax.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1779,8 +1779,8 @@ define i64 @vreduce_umin_nxv4i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vredminu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1803,8 +1803,8 @@ define i64 @vreduce_smin_nxv4i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: 
vredmin.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1827,8 +1827,8 @@ define i64 @vreduce_and_nxv4i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vredand.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1851,8 +1851,8 @@ define i64 @vreduce_or_nxv4i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vredor.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 @@ -1875,9 +1875,9 @@ define i64 @vreduce_xor_nxv4i64( %v) { ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vmv.s.x v12, zero +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vredxor.vs v8, v8, v12 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll index 509bad44bb321..9c21a626478e3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll @@ -966,16 +966,16 @@ define @vrem_vi_nxv1i64_0( %va) { ; RV32-V-NEXT: addi sp, sp, -16 ; RV32-V-NEXT: .cfi_def_cfa_offset 16 ; RV32-V-NEXT: lui a0, 748983 -; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: lui a1, 898779 +; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: addi a1, a1, 1755 ; RV32-V-NEXT: sw a1, 8(sp) ; RV32-V-NEXT: sw a0, 12(sp) ; RV32-V-NEXT: addi a0, sp, 8 ; RV32-V-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; RV32-V-NEXT: vlse64.v v9, (a0), zero -; RV32-V-NEXT: vmulh.vv v9, v8, v9 ; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vmulh.vv v9, v8, v9 ; RV32-V-NEXT: vsrl.vx v10, v9, a0 ; RV32-V-NEXT: vsra.vi v9, v9, 1 ; RV32-V-NEXT: vadd.vv v9, v9, v10 @@ -1051,16 +1051,16 @@ define @vrem_vi_nxv2i64_0( %va) { ; RV32-V-NEXT: addi sp, sp, -16 ; RV32-V-NEXT: .cfi_def_cfa_offset 16 ; RV32-V-NEXT: lui a0, 748983 -; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: lui a1, 898779 +; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: addi a1, a1, 1755 ; RV32-V-NEXT: sw a1, 8(sp) ; RV32-V-NEXT: sw a0, 12(sp) ; RV32-V-NEXT: addi a0, sp, 8 ; RV32-V-NEXT: vsetvli a1, zero, e64, m2, ta, ma ; RV32-V-NEXT: vlse64.v v10, (a0), zero -; RV32-V-NEXT: vmulh.vv v10, v8, v10 ; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vmulh.vv v10, v8, v10 ; RV32-V-NEXT: vsrl.vx v12, v10, a0 ; RV32-V-NEXT: vsra.vi v10, v10, 1 ; RV32-V-NEXT: vadd.vv v10, v10, v12 @@ -1136,16 +1136,16 @@ define @vrem_vi_nxv4i64_0( %va) { ; RV32-V-NEXT: addi sp, sp, -16 ; RV32-V-NEXT: .cfi_def_cfa_offset 16 ; RV32-V-NEXT: lui a0, 748983 -; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: lui a1, 898779 +; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: addi a1, a1, 1755 ; RV32-V-NEXT: sw a1, 8(sp) ; RV32-V-NEXT: sw a0, 12(sp) ; RV32-V-NEXT: addi a0, sp, 8 ; RV32-V-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; RV32-V-NEXT: vlse64.v v12, (a0), zero -; RV32-V-NEXT: vmulh.vv v12, v8, v12 ; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vmulh.vv v12, v8, v12 ; RV32-V-NEXT: vsrl.vx v16, v12, a0 ; RV32-V-NEXT: vsra.vi v12, v12, 1 ; RV32-V-NEXT: vadd.vv v12, v12, v16 @@ -1221,16 +1221,16 @@ 
define @vrem_vi_nxv8i64_0( %va) { ; RV32-V-NEXT: addi sp, sp, -16 ; RV32-V-NEXT: .cfi_def_cfa_offset 16 ; RV32-V-NEXT: lui a0, 748983 -; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: lui a1, 898779 +; RV32-V-NEXT: addi a0, a0, -586 ; RV32-V-NEXT: addi a1, a1, 1755 ; RV32-V-NEXT: sw a1, 8(sp) ; RV32-V-NEXT: sw a0, 12(sp) ; RV32-V-NEXT: addi a0, sp, 8 ; RV32-V-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-V-NEXT: vlse64.v v16, (a0), zero -; RV32-V-NEXT: vmulh.vv v16, v8, v16 ; RV32-V-NEXT: li a0, 63 +; RV32-V-NEXT: vmulh.vv v16, v8, v16 ; RV32-V-NEXT: vsrl.vx v24, v16, a0 ; RV32-V-NEXT: vsra.vi v16, v16, 1 ; RV32-V-NEXT: vadd.vv v16, v16, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll index 95d8533bc53b2..ba6d95c5a43b7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll @@ -11,10 +11,10 @@ define @vrem_vx_nxv8i7( %a, i7 signext %b, @vremu_vi_nxv1i64_0( %va) { ; RV32-V-NEXT: addi a0, sp, 8 ; RV32-V-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; RV32-V-NEXT: vlse64.v v9, (a0), zero -; RV32-V-NEXT: vmulhu.vv v9, v8, v9 ; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vmulhu.vv v9, v8, v9 ; RV32-V-NEXT: vsrl.vx v9, v9, a0 ; RV32-V-NEXT: li a0, -7 ; RV32-V-NEXT: vnmsac.vx v8, a0, v9 @@ -818,8 +818,8 @@ define @vremu_vi_nxv2i64_0( %va) { ; RV32-V-NEXT: addi a0, sp, 8 ; RV32-V-NEXT: vsetvli a1, zero, e64, m2, ta, ma ; RV32-V-NEXT: vlse64.v v10, (a0), zero -; RV32-V-NEXT: vmulhu.vv v10, v8, v10 ; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vmulhu.vv v10, v8, v10 ; RV32-V-NEXT: vsrl.vx v10, v10, a0 ; RV32-V-NEXT: li a0, -7 ; RV32-V-NEXT: vnmsac.vx v8, a0, v10 @@ -925,8 +925,8 @@ define @vremu_vi_nxv4i64_0( %va) { ; RV32-V-NEXT: addi a0, sp, 8 ; RV32-V-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; RV32-V-NEXT: vlse64.v v12, (a0), zero -; RV32-V-NEXT: vmulhu.vv v12, v8, v12 ; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vmulhu.vv v12, v8, v12 ; RV32-V-NEXT: vsrl.vx v12, v12, a0 ; RV32-V-NEXT: li a0, -7 ; RV32-V-NEXT: vnmsac.vx v8, a0, v12 @@ -1032,8 +1032,8 @@ define @vremu_vi_nxv8i64_0( %va) { ; RV32-V-NEXT: addi a0, sp, 8 ; RV32-V-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-V-NEXT: vlse64.v v16, (a0), zero -; RV32-V-NEXT: vmulhu.vv v16, v8, v16 ; RV32-V-NEXT: li a0, 61 +; RV32-V-NEXT: vmulhu.vv v16, v8, v16 ; RV32-V-NEXT: vsrl.vx v16, v16, a0 ; RV32-V-NEXT: li a0, -7 ; RV32-V-NEXT: vnmsac.vx v8, a0, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll index 25ef62386699c..6b32f101f6d3c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll @@ -10,11 +10,10 @@ define @vremu_vx_nxv8i7( %a, i7 signext %b, < ; CHECK-LABEL: vremu_vx_nxv8i7: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2, v0.t ; CHECK-NEXT: vand.vx v9, v9, a2, v0.t ; CHECK-NEXT: vremu.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll index 4a86b717f9f3c..cb925bfb0f237 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll @@ -11,8 +11,8 @@ define @vrol_vv_nxv1i8( %a, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; 
CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -33,8 +33,8 @@ define @vrol_vx_nxv1i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -58,8 +58,8 @@ define @vrol_vv_nxv2i8( %a, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -80,8 +80,8 @@ define @vrol_vx_nxv2i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -105,8 +105,8 @@ define @vrol_vv_nxv4i8( %a, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -127,8 +127,8 @@ define @vrol_vx_nxv4i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -152,8 +152,8 @@ define @vrol_vv_nxv8i8( %a, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -174,8 +174,8 @@ define @vrol_vx_nxv8i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -199,8 +199,8 @@ define @vrol_vv_nxv16i8( %a, @vrol_vx_nxv16i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vand.vi v12, v10, 7 -; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -246,8 +246,8 @@ define @vrol_vv_nxv32i8( %a, @vrol_vx_nxv32i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: vand.vi v16, v12, 7 -; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -293,8 +293,8 @@ define @vrol_vv_nxv64i8( %a, @vrol_vx_nxv64i8( 
%a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a0 ; CHECK-NEXT: vand.vi v24, v16, 7 -; CHECK-NEXT: vsll.vv v24, v8, v24 ; CHECK-NEXT: vrsub.vi v16, v16, 0 +; CHECK-NEXT: vsll.vv v24, v8, v24 ; CHECK-NEXT: vand.vi v16, v16, 7 ; CHECK-NEXT: vsrl.vv v8, v8, v16 ; CHECK-NEXT: vor.vv v8, v24, v8 @@ -340,8 +340,8 @@ define @vrol_vv_nxv1i16( %a, @vrol_vx_nxv1i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -387,8 +387,8 @@ define @vrol_vv_nxv2i16( %a, @vrol_vx_nxv2i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -434,8 +434,8 @@ define @vrol_vv_nxv4i16( %a, @vrol_vx_nxv4i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsll.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -481,8 +481,8 @@ define @vrol_vv_nxv8i16( %a, @vrol_vx_nxv8i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vand.vi v12, v10, 15 -; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsll.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -528,8 +528,8 @@ define @vrol_vv_nxv16i16( %a, @vrol_vx_nxv16i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: vand.vi v16, v12, 15 -; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsll.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -575,8 +575,8 @@ define @vrol_vv_nxv32i16( %a, @vrol_vx_nxv32i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a0 ; CHECK-NEXT: vand.vi v24, v16, 15 -; CHECK-NEXT: vsll.vv v24, v8, v24 ; CHECK-NEXT: vrsub.vi v16, v16, 0 +; CHECK-NEXT: vsll.vv v24, v8, v24 ; CHECK-NEXT: vand.vi v16, v16, 15 ; CHECK-NEXT: vsrl.vv v8, v8, v16 ; CHECK-NEXT: vor.vv v8, v24, v8 @@ -622,12 +622,12 @@ define @vrol_vv_nxv1i32( %a, @vrol_vx_nxv1i32( %a, i32 %b) { ; CHECK-RV32-LABEL: vrol_vx_nxv1i32: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: andi a1, a0, 31 +; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma ; CHECK-RV32-NEXT: vsll.vx v9, v8, a1 -; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: andi a0, a0, 31 ; CHECK-RV32-NEXT: vsrl.vx v8, v8, a0 ; CHECK-RV32-NEXT: vor.vv v8, v9, v8 @@ -657,8 +657,8 @@ define @vrol_vx_nxv1i32( %a, i32 %b) { ; CHECK-RV64-NEXT: vmv.v.x v9, a0 ; CHECK-RV64-NEXT: li a0, 31 ; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsll.vv v10, v8, v10 ; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 +; CHECK-RV64-NEXT: vsll.vv v10, v8, v10 ; CHECK-RV64-NEXT: vand.vx v9, v9, a0 ; CHECK-RV64-NEXT: vsrl.vv v8, v8, v9 ; CHECK-RV64-NEXT: 
vor.vv v8, v10, v8 @@ -682,12 +682,12 @@ define @vrol_vv_nxv2i32( %a, @vrol_vx_nxv2i32( %a, i32 %b) { ; CHECK-RV32-LABEL: vrol_vx_nxv2i32: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: andi a1, a0, 31 +; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; CHECK-RV32-NEXT: vsll.vx v9, v8, a1 -; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: andi a0, a0, 31 ; CHECK-RV32-NEXT: vsrl.vx v8, v8, a0 ; CHECK-RV32-NEXT: vor.vv v8, v9, v8 @@ -717,8 +717,8 @@ define @vrol_vx_nxv2i32( %a, i32 %b) { ; CHECK-RV64-NEXT: vmv.v.x v9, a0 ; CHECK-RV64-NEXT: li a0, 31 ; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsll.vv v10, v8, v10 ; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 +; CHECK-RV64-NEXT: vsll.vv v10, v8, v10 ; CHECK-RV64-NEXT: vand.vx v9, v9, a0 ; CHECK-RV64-NEXT: vsrl.vv v8, v8, v9 ; CHECK-RV64-NEXT: vor.vv v8, v10, v8 @@ -742,12 +742,12 @@ define @vrol_vv_nxv4i32( %a, @vrol_vx_nxv4i32( %a, i32 %b) { ; CHECK-RV32-LABEL: vrol_vx_nxv4i32: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: andi a1, a0, 31 +; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; CHECK-RV32-NEXT: vsll.vx v10, v8, a1 -; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: andi a0, a0, 31 ; CHECK-RV32-NEXT: vsrl.vx v8, v8, a0 ; CHECK-RV32-NEXT: vor.vv v8, v10, v8 @@ -777,8 +777,8 @@ define @vrol_vx_nxv4i32( %a, i32 %b) { ; CHECK-RV64-NEXT: vmv.v.x v10, a0 ; CHECK-RV64-NEXT: li a0, 31 ; CHECK-RV64-NEXT: vand.vx v12, v10, a0 -; CHECK-RV64-NEXT: vsll.vv v12, v8, v12 ; CHECK-RV64-NEXT: vrsub.vi v10, v10, 0 +; CHECK-RV64-NEXT: vsll.vv v12, v8, v12 ; CHECK-RV64-NEXT: vand.vx v10, v10, a0 ; CHECK-RV64-NEXT: vsrl.vv v8, v8, v10 ; CHECK-RV64-NEXT: vor.vv v8, v12, v8 @@ -802,12 +802,12 @@ define @vrol_vv_nxv8i32( %a, @vrol_vx_nxv8i32( %a, i32 %b) { ; CHECK-RV32-LABEL: vrol_vx_nxv8i32: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: andi a1, a0, 31 +; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; CHECK-RV32-NEXT: vsll.vx v12, v8, a1 -; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: andi a0, a0, 31 ; CHECK-RV32-NEXT: vsrl.vx v8, v8, a0 ; CHECK-RV32-NEXT: vor.vv v8, v12, v8 @@ -837,8 +837,8 @@ define @vrol_vx_nxv8i32( %a, i32 %b) { ; CHECK-RV64-NEXT: vmv.v.x v12, a0 ; CHECK-RV64-NEXT: li a0, 31 ; CHECK-RV64-NEXT: vand.vx v16, v12, a0 -; CHECK-RV64-NEXT: vsll.vv v16, v8, v16 ; CHECK-RV64-NEXT: vrsub.vi v12, v12, 0 +; CHECK-RV64-NEXT: vsll.vv v16, v8, v16 ; CHECK-RV64-NEXT: vand.vx v12, v12, a0 ; CHECK-RV64-NEXT: vsrl.vv v8, v8, v12 ; CHECK-RV64-NEXT: vor.vv v8, v16, v8 @@ -862,12 +862,12 @@ define @vrol_vv_nxv16i32( %a, @vrol_vx_nxv16i32( %a, i32 %b) { ; CHECK-RV32-LABEL: vrol_vx_nxv16i32: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: andi a1, a0, 31 +; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; CHECK-RV32-NEXT: vsll.vx v16, v8, a1 -; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: andi a0, a0, 31 ; CHECK-RV32-NEXT: vsrl.vx v8, v8, a0 ; CHECK-RV32-NEXT: vor.vv v8, v16, v8 @@ -897,8 +897,8 @@ define @vrol_vx_nxv16i32( %a, i32 %b) { ; CHECK-RV64-NEXT: vmv.v.x v16, a0 ; CHECK-RV64-NEXT: li a0, 31 ; CHECK-RV64-NEXT: vand.vx v24, v16, a0 -; CHECK-RV64-NEXT: vsll.vv v24, v8, v24 ; CHECK-RV64-NEXT: vrsub.vi v16, v16, 0 +; CHECK-RV64-NEXT: vsll.vv v24, v8, v24 ; CHECK-RV64-NEXT: vand.vx v16, v16, a0 ; CHECK-RV64-NEXT: vsrl.vv v8, v8, v16 ; CHECK-RV64-NEXT: vor.vv v8, v24, v8 @@ -922,12 +922,12 @@ define @vrol_vv_nxv1i64( %a, @vrol_vx_nxv1i64( %a, i64 %b) { ; CHECK-RV32-NEXT: vwsub.vx v11, v10, a0 ; CHECK-RV32-NEXT: li a0, 63 ; 
CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vand.vx v9, v9, a0 ; CHECK-RV32-NEXT: vand.vx v10, v11, a0 ; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 ; CHECK-RV32-NEXT: vsll.vv v8, v8, v9 ; CHECK-RV32-NEXT: vor.vv v8, v8, v10 ; CHECK-RV32-NEXT: ret @@ -959,9 +959,9 @@ define @vrol_vx_nxv1i64( %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv1i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 +; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-RV64-NEXT: vsll.vx v9, v8, a1 -; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: andi a0, a0, 63 ; CHECK-RV64-NEXT: vsrl.vx v8, v8, a0 ; CHECK-RV64-NEXT: vor.vv v8, v9, v8 @@ -985,12 +985,12 @@ define @vrol_vv_nxv2i64( %a, @vrol_vx_nxv2i64( %a, i64 %b) { ; CHECK-RV32-NEXT: vwsub.vx v14, v12, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-RV32-NEXT: vand.vx v10, v10, a0 ; CHECK-RV32-NEXT: vand.vx v12, v14, a0 ; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 ; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 ; CHECK-RV32-NEXT: vor.vv v8, v8, v12 ; CHECK-RV32-NEXT: ret @@ -1022,9 +1022,9 @@ define @vrol_vx_nxv2i64( %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv2i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 +; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; CHECK-RV64-NEXT: vsll.vx v10, v8, a1 -; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: andi a0, a0, 63 ; CHECK-RV64-NEXT: vsrl.vx v8, v8, a0 ; CHECK-RV64-NEXT: vor.vv v8, v10, v8 @@ -1048,12 +1048,12 @@ define @vrol_vv_nxv4i64( %a, @vrol_vx_nxv4i64( %a, i64 %b) { ; CHECK-RV32-NEXT: vwsub.vx v20, v16, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-RV32-NEXT: vand.vx v12, v12, a0 ; CHECK-RV32-NEXT: vand.vx v16, v20, a0 ; CHECK-RV32-NEXT: vsrl.vv v16, v8, v16 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 ; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 ; CHECK-RV32-NEXT: vor.vv v8, v8, v16 ; CHECK-RV32-NEXT: ret @@ -1085,9 +1085,9 @@ define @vrol_vx_nxv4i64( %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv4i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 +; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; CHECK-RV64-NEXT: vsll.vx v12, v8, a1 -; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: andi a0, a0, 63 ; CHECK-RV64-NEXT: vsrl.vx v8, v8, a0 ; CHECK-RV64-NEXT: vor.vv v8, v12, v8 @@ -1111,12 +1111,12 @@ define @vrol_vv_nxv8i64( %a, @vrol_vx_nxv8i64( %a, i64 %b) { ; CHECK-RV32-NEXT: vwsub.vx v0, v24, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-RV32-NEXT: vand.vx v16, v16, a0 ; CHECK-RV32-NEXT: vand.vx v24, v0, a0 ; CHECK-RV32-NEXT: vsrl.vv v24, v8, v24 -; CHECK-RV32-NEXT: vand.vx v16, v16, a0 ; CHECK-RV32-NEXT: vsll.vv v8, v8, v16 ; CHECK-RV32-NEXT: vor.vv v8, v8, v24 ; CHECK-RV32-NEXT: ret @@ -1148,9 +1148,9 @@ define @vrol_vx_nxv8i64( %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv8i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 +; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; CHECK-RV64-NEXT: vsll.vx v16, v8, a1 -; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: andi a0, a0, 63 ; CHECK-RV64-NEXT: vsrl.vx v8, v8, a0 ; CHECK-RV64-NEXT: vor.vv v8, v16, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll index cf2f0d8873165..74743f9068a64 100644 
--- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll @@ -12,8 +12,8 @@ define @vror_vv_nxv1i8( %a, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -34,8 +34,8 @@ define @vror_vx_nxv1i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -96,8 +96,8 @@ define @vror_vv_nxv2i8( %a, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -118,8 +118,8 @@ define @vror_vx_nxv2i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -180,8 +180,8 @@ define @vror_vv_nxv4i8( %a, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -202,8 +202,8 @@ define @vror_vx_nxv4i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -264,8 +264,8 @@ define @vror_vv_nxv8i8( %a, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -286,8 +286,8 @@ define @vror_vx_nxv8i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 7 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 7 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -348,8 +348,8 @@ define @vror_vv_nxv16i8( %a, @vror_vx_nxv16i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vand.vi v12, v10, 7 -; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 7 ; CHECK-NEXT: vsll.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -432,8 +432,8 @@ define @vror_vv_nxv32i8( %a, @vror_vx_nxv32i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: vand.vi v16, 
v12, 7 -; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 7 ; CHECK-NEXT: vsll.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -516,8 +516,8 @@ define @vror_vv_nxv64i8( %a, @vror_vx_nxv64i8( %a, i8 %b) { ; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a0 ; CHECK-NEXT: vand.vi v24, v16, 7 -; CHECK-NEXT: vsrl.vv v24, v8, v24 ; CHECK-NEXT: vrsub.vi v16, v16, 0 +; CHECK-NEXT: vsrl.vv v24, v8, v24 ; CHECK-NEXT: vand.vi v16, v16, 7 ; CHECK-NEXT: vsll.vv v8, v8, v16 ; CHECK-NEXT: vor.vv v8, v24, v8 @@ -600,8 +600,8 @@ define @vror_vv_nxv1i16( %a, @vror_vx_nxv1i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -684,8 +684,8 @@ define @vror_vv_nxv2i16( %a, @vror_vx_nxv2i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -768,8 +768,8 @@ define @vror_vv_nxv4i16( %a, @vror_vx_nxv4i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vand.vi v10, v9, 15 -; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 ; CHECK-NEXT: vand.vi v9, v9, 15 ; CHECK-NEXT: vsll.vv v8, v8, v9 ; CHECK-NEXT: vor.vv v8, v10, v8 @@ -852,8 +852,8 @@ define @vror_vv_nxv8i16( %a, @vror_vx_nxv8i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vand.vi v12, v10, 15 -; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vsrl.vv v12, v8, v12 ; CHECK-NEXT: vand.vi v10, v10, 15 ; CHECK-NEXT: vsll.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v12, v8 @@ -936,8 +936,8 @@ define @vror_vv_nxv16i16( %a, @vror_vx_nxv16i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: vand.vi v16, v12, 15 -; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vsrl.vv v16, v8, v16 ; CHECK-NEXT: vand.vi v12, v12, 15 ; CHECK-NEXT: vsll.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v16, v8 @@ -1020,8 +1020,8 @@ define @vror_vv_nxv32i16( %a, @vror_vx_nxv32i16( %a, i16 %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a0 ; CHECK-NEXT: vand.vi v24, v16, 15 -; CHECK-NEXT: vsrl.vv v24, v8, v24 ; CHECK-NEXT: vrsub.vi v16, v16, 0 +; CHECK-NEXT: vsrl.vv v24, v8, v24 ; CHECK-NEXT: vand.vi v16, v16, 15 ; CHECK-NEXT: vsll.vv v8, v8, v16 ; CHECK-NEXT: vor.vv v8, v24, v8 @@ -1104,12 +1104,12 @@ define @vror_vv_nxv1i32( %a, @vror_vx_nxv1i32( %a, i32 %b) { ; CHECK-RV32-LABEL: vror_vx_nxv1i32: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: andi a1, a0, 31 +; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v9, v8, a1 -; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: andi a0, a0, 31 ; CHECK-RV32-NEXT: vsll.vx v8, v8, a0 ; CHECK-RV32-NEXT: vor.vv v8, v9, v8 @@ -1139,8 +1139,8 @@ define @vror_vx_nxv1i32( %a, i32 %b) { ; CHECK-RV64-NEXT: vmv.v.x v9, 
a0 ; CHECK-RV64-NEXT: li a0, 31 ; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsrl.vv v10, v8, v10 ; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 +; CHECK-RV64-NEXT: vsrl.vv v10, v8, v10 ; CHECK-RV64-NEXT: vand.vx v9, v9, a0 ; CHECK-RV64-NEXT: vsll.vv v8, v8, v9 ; CHECK-RV64-NEXT: vor.vv v8, v10, v8 @@ -1201,12 +1201,12 @@ define @vror_vv_nxv2i32( %a, @vror_vx_nxv2i32( %a, i32 %b) { ; CHECK-RV32-LABEL: vror_vx_nxv2i32: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: andi a1, a0, 31 +; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v9, v8, a1 -; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: andi a0, a0, 31 ; CHECK-RV32-NEXT: vsll.vx v8, v8, a0 ; CHECK-RV32-NEXT: vor.vv v8, v9, v8 @@ -1236,8 +1236,8 @@ define @vror_vx_nxv2i32( %a, i32 %b) { ; CHECK-RV64-NEXT: vmv.v.x v9, a0 ; CHECK-RV64-NEXT: li a0, 31 ; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsrl.vv v10, v8, v10 ; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 +; CHECK-RV64-NEXT: vsrl.vv v10, v8, v10 ; CHECK-RV64-NEXT: vand.vx v9, v9, a0 ; CHECK-RV64-NEXT: vsll.vv v8, v8, v9 ; CHECK-RV64-NEXT: vor.vv v8, v10, v8 @@ -1298,12 +1298,12 @@ define @vror_vv_nxv4i32( %a, @vror_vx_nxv4i32( %a, i32 %b) { ; CHECK-RV32-LABEL: vror_vx_nxv4i32: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: andi a1, a0, 31 +; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v10, v8, a1 -; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: andi a0, a0, 31 ; CHECK-RV32-NEXT: vsll.vx v8, v8, a0 ; CHECK-RV32-NEXT: vor.vv v8, v10, v8 @@ -1333,8 +1333,8 @@ define @vror_vx_nxv4i32( %a, i32 %b) { ; CHECK-RV64-NEXT: vmv.v.x v10, a0 ; CHECK-RV64-NEXT: li a0, 31 ; CHECK-RV64-NEXT: vand.vx v12, v10, a0 -; CHECK-RV64-NEXT: vsrl.vv v12, v8, v12 ; CHECK-RV64-NEXT: vrsub.vi v10, v10, 0 +; CHECK-RV64-NEXT: vsrl.vv v12, v8, v12 ; CHECK-RV64-NEXT: vand.vx v10, v10, a0 ; CHECK-RV64-NEXT: vsll.vv v8, v8, v10 ; CHECK-RV64-NEXT: vor.vv v8, v12, v8 @@ -1395,12 +1395,12 @@ define @vror_vv_nxv8i32( %a, @vror_vx_nxv8i32( %a, i32 %b) { ; CHECK-RV32-LABEL: vror_vx_nxv8i32: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: andi a1, a0, 31 +; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v12, v8, a1 -; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: andi a0, a0, 31 ; CHECK-RV32-NEXT: vsll.vx v8, v8, a0 ; CHECK-RV32-NEXT: vor.vv v8, v12, v8 @@ -1430,8 +1430,8 @@ define @vror_vx_nxv8i32( %a, i32 %b) { ; CHECK-RV64-NEXT: vmv.v.x v12, a0 ; CHECK-RV64-NEXT: li a0, 31 ; CHECK-RV64-NEXT: vand.vx v16, v12, a0 -; CHECK-RV64-NEXT: vsrl.vv v16, v8, v16 ; CHECK-RV64-NEXT: vrsub.vi v12, v12, 0 +; CHECK-RV64-NEXT: vsrl.vv v16, v8, v16 ; CHECK-RV64-NEXT: vand.vx v12, v12, a0 ; CHECK-RV64-NEXT: vsll.vv v8, v8, v12 ; CHECK-RV64-NEXT: vor.vv v8, v16, v8 @@ -1492,12 +1492,12 @@ define @vror_vv_nxv16i32( %a, @vror_vx_nxv16i32( %a, i32 %b) { ; CHECK-RV32-LABEL: vror_vx_nxv16i32: ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: andi a1, a0, 31 +; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v16, v8, a1 -; CHECK-RV32-NEXT: neg a0, a0 ; CHECK-RV32-NEXT: andi a0, a0, 31 ; CHECK-RV32-NEXT: vsll.vx v8, v8, a0 ; CHECK-RV32-NEXT: vor.vv v8, v16, v8 @@ -1527,8 +1527,8 @@ define @vror_vx_nxv16i32( %a, i32 %b) { ; CHECK-RV64-NEXT: vmv.v.x v16, a0 ; CHECK-RV64-NEXT: li a0, 31 ; CHECK-RV64-NEXT: vand.vx v24, v16, a0 -; CHECK-RV64-NEXT: vsrl.vv v24, v8, v24 ; CHECK-RV64-NEXT: vrsub.vi v16, v16, 0 +; 
CHECK-RV64-NEXT: vsrl.vv v24, v8, v24 ; CHECK-RV64-NEXT: vand.vx v16, v16, a0 ; CHECK-RV64-NEXT: vsll.vv v8, v8, v16 ; CHECK-RV64-NEXT: vor.vv v8, v24, v8 @@ -1589,12 +1589,12 @@ define @vror_vv_nxv1i64( %a, @vror_vx_nxv1i64( %a, i64 %b) { ; CHECK-RV32-NEXT: vwsub.vx v11, v10, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vand.vx v9, v9, a0 ; CHECK-RV32-NEXT: vand.vx v10, v11, a0 ; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 ; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 ; CHECK-RV32-NEXT: vor.vv v8, v8, v10 ; CHECK-RV32-NEXT: ret @@ -1626,9 +1626,9 @@ define @vror_vx_nxv1i64( %a, i64 %b) { ; CHECK-RV64-LABEL: vror_vx_nxv1i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 +; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-RV64-NEXT: vsrl.vx v9, v8, a1 -; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: andi a0, a0, 63 ; CHECK-RV64-NEXT: vsll.vx v8, v8, a0 ; CHECK-RV64-NEXT: vor.vv v8, v9, v8 @@ -1691,12 +1691,12 @@ define @vror_vv_nxv2i64( %a, @vror_vx_nxv2i64( %a, i64 %b) { ; CHECK-RV32-NEXT: vwsub.vx v14, v12, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-RV32-NEXT: vand.vx v10, v10, a0 ; CHECK-RV32-NEXT: vand.vx v12, v14, a0 ; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 ; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 ; CHECK-RV32-NEXT: vor.vv v8, v8, v12 ; CHECK-RV32-NEXT: ret @@ -1728,9 +1728,9 @@ define @vror_vx_nxv2i64( %a, i64 %b) { ; CHECK-RV64-LABEL: vror_vx_nxv2i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 +; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; CHECK-RV64-NEXT: vsrl.vx v10, v8, a1 -; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: andi a0, a0, 63 ; CHECK-RV64-NEXT: vsll.vx v8, v8, a0 ; CHECK-RV64-NEXT: vor.vv v8, v10, v8 @@ -1793,12 +1793,12 @@ define @vror_vv_nxv4i64( %a, @vror_vx_nxv4i64( %a, i64 %b) { ; CHECK-RV32-NEXT: vwsub.vx v20, v16, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-RV32-NEXT: vand.vx v12, v12, a0 ; CHECK-RV32-NEXT: vand.vx v16, v20, a0 ; CHECK-RV32-NEXT: vsll.vv v16, v8, v16 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 ; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 ; CHECK-RV32-NEXT: vor.vv v8, v8, v16 ; CHECK-RV32-NEXT: ret @@ -1830,9 +1830,9 @@ define @vror_vx_nxv4i64( %a, i64 %b) { ; CHECK-RV64-LABEL: vror_vx_nxv4i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 +; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; CHECK-RV64-NEXT: vsrl.vx v12, v8, a1 -; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: andi a0, a0, 63 ; CHECK-RV64-NEXT: vsll.vx v8, v8, a0 ; CHECK-RV64-NEXT: vor.vv v8, v12, v8 @@ -1895,12 +1895,12 @@ define @vror_vv_nxv8i64( %a, @vror_vx_nxv8i64( %a, i64 %b) { ; CHECK-RV32-NEXT: vwsub.vx v0, v24, a0 ; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-RV32-NEXT: vand.vx v16, v16, a0 ; CHECK-RV32-NEXT: vand.vx v24, v0, a0 ; CHECK-RV32-NEXT: vsll.vv v24, v8, v24 -; CHECK-RV32-NEXT: vand.vx v16, v16, a0 ; CHECK-RV32-NEXT: vsrl.vv v8, v8, v16 ; CHECK-RV32-NEXT: vor.vv v8, v8, v24 ; CHECK-RV32-NEXT: ret @@ -1932,9 +1932,9 @@ define @vror_vx_nxv8i64( %a, i64 %b) { ; CHECK-RV64-LABEL: vror_vx_nxv8i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 +; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; 
CHECK-RV64-NEXT: vsrl.vx v16, v8, a1 -; CHECK-RV64-NEXT: negw a0, a0 ; CHECK-RV64-NEXT: andi a0, a0, 63 ; CHECK-RV64-NEXT: vsll.vx v8, v8, a0 ; CHECK-RV64-NEXT: vor.vv v8, v16, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll index d71fa33cf1f23..12c439346e356 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll @@ -1353,9 +1353,9 @@ define @vsadd_vi_nxv32i32( %va, @vsaddu_vx_nxv8i7( %a, i7 signext %b, ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t @@ -1352,9 +1352,9 @@ define @vsaddu_vi_nxv32i32( %va, @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ; NO_FOLDING-NEXT: vlm.v v9, (a1) ; NO_FOLDING-NEXT: vlm.v v10, (a2) ; NO_FOLDING-NEXT: vmv.v.i v11, 0 +; NO_FOLDING-NEXT: li a0, 1 ; NO_FOLDING-NEXT: vmv.v.v v0, v8 ; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 ; NO_FOLDING-NEXT: vmv.v.v v0, v9 @@ -163,7 +164,6 @@ define @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ; NO_FOLDING-NEXT: vmv.v.v v0, v10 ; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 ; NO_FOLDING-NEXT: vmul.vv v9, v12, v9 -; NO_FOLDING-NEXT: li a0, 1 ; NO_FOLDING-NEXT: vsub.vv v11, v12, v10 ; NO_FOLDING-NEXT: vmv.v.v v0, v8 ; NO_FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t @@ -178,6 +178,7 @@ define @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ; FOLDING-NEXT: vlm.v v9, (a1) ; FOLDING-NEXT: vlm.v v10, (a2) ; FOLDING-NEXT: vmv.v.i v11, 0 +; FOLDING-NEXT: li a0, 1 ; FOLDING-NEXT: vmv.v.v v0, v8 ; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 ; FOLDING-NEXT: vmv.v.v v0, v9 @@ -185,7 +186,6 @@ define @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ; FOLDING-NEXT: vmv.v.v v0, v10 ; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 ; FOLDING-NEXT: vmul.vv v9, v12, v9 -; FOLDING-NEXT: li a0, 1 ; FOLDING-NEXT: vsub.vv v11, v12, v10 ; FOLDING-NEXT: vmv.v.v v0, v8 ; FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t @@ -214,6 +214,7 @@ define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p ; NO_FOLDING-NEXT: vlm.v v9, (a1) ; NO_FOLDING-NEXT: vlm.v v10, (a2) ; NO_FOLDING-NEXT: vmv.v.i v11, 0 +; NO_FOLDING-NEXT: li a0, 1 ; NO_FOLDING-NEXT: vmv1r.v v0, v8 ; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 ; NO_FOLDING-NEXT: vmv1r.v v0, v9 @@ -221,7 +222,6 @@ define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p ; NO_FOLDING-NEXT: vmv1r.v v0, v10 ; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 ; NO_FOLDING-NEXT: vmul.vv v9, v12, v9 -; NO_FOLDING-NEXT: li a0, 1 ; NO_FOLDING-NEXT: vsub.vv v11, v12, v10 ; NO_FOLDING-NEXT: vmv1r.v v0, v8 ; NO_FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t @@ -236,6 +236,7 @@ define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p ; FOLDING-NEXT: vlm.v v9, (a1) ; FOLDING-NEXT: vlm.v v10, (a2) ; FOLDING-NEXT: vmv.v.i v11, 0 +; FOLDING-NEXT: li a0, 1 ; FOLDING-NEXT: vmv1r.v v0, v8 ; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 ; FOLDING-NEXT: vmv1r.v v0, v9 @@ -243,7 +244,6 @@ define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p ; FOLDING-NEXT: vmv1r.v v0, v10 ; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 ; FOLDING-NEXT: vmul.vv v9, v12, v9 -; FOLDING-NEXT: li a0, 1 ; FOLDING-NEXT: vsub.vv v11, v12, v10 ; FOLDING-NEXT: vmv1r.v v0, v8 ; FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll 
b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll index 1f1a62f57664f..1fc33dc73a27d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll @@ -493,17 +493,16 @@ define @vfmerge_nzv_nxv8f64( %va, @vselect_combine_regression( %va, %vb) { ; CHECK-LABEL: vselect_combine_regression: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; CHECK-NEXT: vmseq.vi v24, v16, 0 ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v16, 0 -; CHECK-NEXT: vmseq.vi v7, v24, 0 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: vle64.v v8, (a0), v0.t -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vle64.v v16, (a1), v0.t ; CHECK-NEXT: ret %cond = icmp eq %va, zeroinitializer @@ -516,11 +515,13 @@ define void @vselect_legalize_regression( %a, %a, %ma, %mb diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll index 6483a99fe3632..bb51f0592dc17 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll @@ -354,51 +354,30 @@ define @select_nxv32i32( %a, @select_nxv32i32( %a, @select_evl_nxv32i32( %a, @select_evl_nxv32i32( %a, @select_nxv16f64( %a, This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle64.v v8, (a2) ; CHECK-NEXT: vle64.v v9, (a3) +; CHECK-NEXT: add a4, a4, a6 +; CHECK-NEXT: add a3, a3, a5 ; CHECK-NEXT: vfadd.vv v8, v8, v9 ; CHECK-NEXT: vse64.v v8, (a1) -; CHECK-NEXT: add a4, a4, a6 ; CHECK-NEXT: add a1, a1, a5 -; CHECK-NEXT: add a3, a3, a5 ; CHECK-NEXT: add a2, a2, a5 ; CHECK-NEXT: blt a4, a0, .LBB12_2 ; CHECK-NEXT: .LBB12_3: # %for.end diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll index 7f01fd4d945c6..b0cb6bc6125dd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll @@ -111,12 +111,12 @@ define void @test6(ptr nocapture readonly %A, ptr nocapture %B, i64 %n) { ; CHECK-NEXT: slli a4, a3, 2 ; CHECK-NEXT: add a5, a0, a4 ; CHECK-NEXT: vle32.v v8, (a5) +; CHECK-NEXT: add a3, a3, a2 ; CHECK-NEXT: vmsle.vi v9, v8, -3 ; CHECK-NEXT: vmsgt.vi v10, v8, 2 ; CHECK-NEXT: vmor.mm v0, v9, v10 ; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vse32.v v8, (a4), v0.t -; CHECK-NEXT: add a3, a3, a2 ; CHECK-NEXT: vsetvli a2, a2, e32, m1, ta, ma ; CHECK-NEXT: bnez a2, .LBB5_2 ; CHECK-NEXT: .LBB5_3: # %for.cond.cleanup diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll index 9b5a1a54ad5df..d3b905ef897b1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll @@ -154,9 +154,9 @@ define @vsext_nxv32i8_nxv32i32( %a, @llvm.vp.sitofp.nxv32f16.nxv32i32( @vsitofp_nxv32f16_nxv32i32( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vsitofp_nxv32f16_nxv32i32: ; ZVFH: # %bb.0: -; ZVFH-NEXT: addi sp, sp, -16 -; ZVFH-NEXT: .cfi_def_cfa_offset 16 -; ZVFH-NEXT: csrr a1, vlenb -; ZVFH-NEXT: slli a1, a1, 3 -; ZVFH-NEXT: sub sp, sp, a1 -; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; ZVFH-NEXT: vmv1r.v v7, v0 -; ZVFH-NEXT: addi a1, sp, 16 -; ZVFH-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFH-NEXT: vmv1r.v v24, v0 ; ZVFH-NEXT: csrr a1, vlenb ; ZVFH-NEXT: srli a2, a1, 2 +; ZVFH-NEXT: slli a1, a1, 1 ; ZVFH-NEXT: 
vsetvli a3, zero, e8, mf2, ta, ma ; ZVFH-NEXT: vslidedown.vx v0, v0, a2 -; ZVFH-NEXT: slli a1, a1, 1 ; ZVFH-NEXT: sub a2, a0, a1 ; ZVFH-NEXT: sltu a3, a0, a2 ; ZVFH-NEXT: addi a3, a3, -1 ; ZVFH-NEXT: and a2, a3, a2 -; ZVFH-NEXT: addi a3, sp, 16 -; ZVFH-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFH-NEXT: vfncvt.f.x.w v20, v24, v0.t +; ZVFH-NEXT: vfncvt.f.x.w v28, v16, v0.t ; ZVFH-NEXT: bltu a0, a1, .LBB34_2 ; ZVFH-NEXT: # %bb.1: ; ZVFH-NEXT: mv a0, a1 ; ZVFH-NEXT: .LBB34_2: -; ZVFH-NEXT: vmv1r.v v0, v7 +; ZVFH-NEXT: vmv1r.v v0, v24 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFH-NEXT: vfncvt.f.x.w v16, v8, v0.t -; ZVFH-NEXT: vmv8r.v v8, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: add sp, sp, a0 -; ZVFH-NEXT: .cfi_def_cfa sp, 16 -; ZVFH-NEXT: addi sp, sp, 16 -; ZVFH-NEXT: .cfi_def_cfa_offset 0 +; ZVFH-NEXT: vfncvt.f.x.w v24, v8, v0.t +; ZVFH-NEXT: vmv8r.v v8, v24 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vsitofp_nxv32f16_nxv32i32: @@ -551,9 +535,9 @@ define @vsitofp_nxv32f16_nxv32i32( %va, ; ZVFHMIN-NEXT: vmv1r.v v7, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: srli a2, a1, 2 +; ZVFHMIN-NEXT: slli a1, a1, 1 ; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 -; ZVFHMIN-NEXT: slli a1, a1, 1 ; ZVFHMIN-NEXT: sub a2, a0, a1 ; ZVFHMIN-NEXT: sltu a3, a0, a2 ; ZVFHMIN-NEXT: addi a3, a3, -1 @@ -585,9 +569,9 @@ define @vsitofp_nxv32f32_nxv32i32( %va, ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll index cff8cc710d21f..208063bfd2342 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll @@ -11,11 +11,11 @@ define @vsra_vx_nxv8i7( %a, i7 signext %b, @vsrl_vx_nxv8i7( %a, i7 signext %b, @vssub_vi_nxv32i32( %va, @vssubu_vx_nxv8i7( %a, i7 signext %b, ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vand.vx v8, v8, a2 ; CHECK-NEXT: vand.vx v9, v9, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma ; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t @@ -1393,9 +1393,9 @@ define @vssubu_vi_nxv32i32( %va, @vtrunc_nxv15i16_nxv15i64( %a, @vtrunc_nxv32i7_nxv32i32( %a, @vtrunc_nxv32i8_nxv32i32( %a, @vtrunc_nxv32i64_nxv32i32( %a, @llvm.vp.uitofp.nxv32f16.nxv32i32( @vuitofp_nxv32f16_nxv32i32( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vuitofp_nxv32f16_nxv32i32: ; ZVFH: # %bb.0: -; ZVFH-NEXT: addi sp, sp, -16 -; ZVFH-NEXT: .cfi_def_cfa_offset 16 -; ZVFH-NEXT: csrr a1, vlenb -; ZVFH-NEXT: slli a1, a1, 3 -; ZVFH-NEXT: sub sp, sp, a1 -; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; ZVFH-NEXT: vmv1r.v v7, v0 -; ZVFH-NEXT: addi a1, sp, 16 -; ZVFH-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFH-NEXT: vmv1r.v v24, v0 ; ZVFH-NEXT: csrr a1, vlenb ; ZVFH-NEXT: srli a2, a1, 2 +; ZVFH-NEXT: slli a1, a1, 1 ; ZVFH-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; ZVFH-NEXT: vslidedown.vx v0, v0, a2 -; ZVFH-NEXT: slli a1, a1, 1 ; ZVFH-NEXT: sub a2, a0, a1 ; ZVFH-NEXT: sltu a3, a0, a2 ; ZVFH-NEXT: 
addi a3, a3, -1 ; ZVFH-NEXT: and a2, a3, a2 -; ZVFH-NEXT: addi a3, sp, 16 -; ZVFH-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; ZVFH-NEXT: vfncvt.f.xu.w v20, v24, v0.t +; ZVFH-NEXT: vfncvt.f.xu.w v28, v16, v0.t ; ZVFH-NEXT: bltu a0, a1, .LBB34_2 ; ZVFH-NEXT: # %bb.1: ; ZVFH-NEXT: mv a0, a1 ; ZVFH-NEXT: .LBB34_2: -; ZVFH-NEXT: vmv1r.v v0, v7 +; ZVFH-NEXT: vmv1r.v v0, v24 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFH-NEXT: vfncvt.f.xu.w v16, v8, v0.t -; ZVFH-NEXT: vmv8r.v v8, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: add sp, sp, a0 -; ZVFH-NEXT: .cfi_def_cfa sp, 16 -; ZVFH-NEXT: addi sp, sp, 16 -; ZVFH-NEXT: .cfi_def_cfa_offset 0 +; ZVFH-NEXT: vfncvt.f.xu.w v24, v8, v0.t +; ZVFH-NEXT: vmv8r.v v8, v24 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vuitofp_nxv32f16_nxv32i32: @@ -543,9 +527,9 @@ define @vuitofp_nxv32f16_nxv32i32( %va, ; ZVFHMIN-NEXT: vmv1r.v v7, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: srli a2, a1, 2 +; ZVFHMIN-NEXT: slli a1, a1, 1 ; ZVFHMIN-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 -; ZVFHMIN-NEXT: slli a1, a1, 1 ; ZVFHMIN-NEXT: sub a2, a0, a1 ; ZVFHMIN-NEXT: sltu a3, a0, a2 ; ZVFHMIN-NEXT: addi a3, a3, -1 @@ -577,9 +561,9 @@ define @vuitofp_nxv32f32_nxv32i32( %va, ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a2 -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll index 336d86d57f3e6..ddc27f7562cdb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll @@ -1377,9 +1377,9 @@ define @i1_zext( %va, %vb ; ; RV64-LABEL: i1_zext: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV64-NEXT: vadd.vi v8, v8, 1, v0.t ; RV64-NEXT: li a1, 42 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vadd.vi v8, v8, 1, v0.t ; RV64-NEXT: sh a1, 0(a0) ; RV64-NEXT: ret %vc = zext %va to diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll index 3b5541c1a2440..10e655c844540 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll @@ -154,9 +154,9 @@ define @vzext_nxv32i8_nxv32i32( %a, This Inner Loop Header: Depth=1 ; RV64I-NEXT: call bar @@ -324,19 +324,19 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sraw a0, a0, a1 ; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 4112 ; RV64I-NEXT: addiw s0, a1, 1365 +; RV64I-NEXT: addiw s1, a2, 819 +; RV64I-NEXT: addiw s2, a3, -241 +; RV64I-NEXT: addiw s3, a4, 257 ; RV64I-NEXT: slli a1, s0, 32 ; RV64I-NEXT: add s0, s0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw s1, a1, 819 ; RV64I-NEXT: slli a1, s1, 32 ; RV64I-NEXT: add s1, s1, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw s2, a1, -241 ; RV64I-NEXT: slli a1, s2, 32 ; RV64I-NEXT: add s2, s2, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw s3, a1, 257 ; RV64I-NEXT: slli a1, s3, 32 ; RV64I-NEXT: add s3, s3, a1 ; RV64I-NEXT: .LBB6_1: # %bb2 @@ -1037,36 +1037,36 @@ define signext i32 @bug(i32 signext %x) { ; CHECK-NEXT: srliw a2, a0, 16 
; CHECK-NEXT: seqz a1, a2 ; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sllw a1, a0, a1 -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: sllw a0, a0, a1 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: beqz a2, .LBB18_3 ; CHECK-NEXT: # %bb.2: # %if.end -; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: .LBB18_3: # %if.end -; CHECK-NEXT: srliw a2, a1, 24 +; CHECK-NEXT: srliw a2, a0, 24 ; CHECK-NEXT: seqz a2, a2 ; CHECK-NEXT: slli a3, a2, 3 -; CHECK-NEXT: sllw a1, a1, a3 ; CHECK-NEXT: negw a2, a2 +; CHECK-NEXT: sllw a0, a0, a3 ; CHECK-NEXT: andi a2, a2, -8 -; CHECK-NEXT: add a0, a0, a2 -; CHECK-NEXT: srliw a2, a1, 28 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: srliw a2, a0, 28 ; CHECK-NEXT: seqz a2, a2 ; CHECK-NEXT: slli a3, a2, 2 -; CHECK-NEXT: sllw a1, a1, a3 ; CHECK-NEXT: negw a2, a2 +; CHECK-NEXT: sllw a0, a0, a3 ; CHECK-NEXT: andi a2, a2, -4 -; CHECK-NEXT: add a0, a0, a2 -; CHECK-NEXT: srliw a2, a1, 30 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: srliw a2, a0, 30 ; CHECK-NEXT: seqz a2, a2 ; CHECK-NEXT: slli a3, a2, 1 -; CHECK-NEXT: sllw a1, a1, a3 ; CHECK-NEXT: negw a2, a2 +; CHECK-NEXT: sllw a0, a0, a3 ; CHECK-NEXT: andi a2, a2, -2 -; CHECK-NEXT: add a0, a0, a2 -; CHECK-NEXT: not a1, a1 -; CHECK-NEXT: srli a1, a1, 31 -; CHECK-NEXT: addw a0, a0, a1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: not a0, a0 +; CHECK-NEXT: srli a0, a0, 31 +; CHECK-NEXT: addw a0, a1, a0 ; CHECK-NEXT: .LBB18_4: # %cleanup ; CHECK-NEXT: ret ; @@ -1077,36 +1077,36 @@ define signext i32 @bug(i32 signext %x) { ; NOREMOVAL-NEXT: srliw a2, a0, 16 ; NOREMOVAL-NEXT: seqz a1, a2 ; NOREMOVAL-NEXT: slli a1, a1, 4 -; NOREMOVAL-NEXT: sllw a1, a0, a1 -; NOREMOVAL-NEXT: li a0, 16 +; NOREMOVAL-NEXT: sllw a0, a0, a1 +; NOREMOVAL-NEXT: li a1, 16 ; NOREMOVAL-NEXT: beqz a2, .LBB18_3 ; NOREMOVAL-NEXT: # %bb.2: # %if.end -; NOREMOVAL-NEXT: li a0, 32 +; NOREMOVAL-NEXT: li a1, 32 ; NOREMOVAL-NEXT: .LBB18_3: # %if.end -; NOREMOVAL-NEXT: srliw a2, a1, 24 +; NOREMOVAL-NEXT: srliw a2, a0, 24 ; NOREMOVAL-NEXT: seqz a2, a2 ; NOREMOVAL-NEXT: slli a3, a2, 3 -; NOREMOVAL-NEXT: sllw a1, a1, a3 ; NOREMOVAL-NEXT: negw a2, a2 +; NOREMOVAL-NEXT: sllw a0, a0, a3 ; NOREMOVAL-NEXT: andi a2, a2, -8 -; NOREMOVAL-NEXT: add a0, a0, a2 -; NOREMOVAL-NEXT: srliw a2, a1, 28 +; NOREMOVAL-NEXT: add a1, a1, a2 +; NOREMOVAL-NEXT: srliw a2, a0, 28 ; NOREMOVAL-NEXT: seqz a2, a2 ; NOREMOVAL-NEXT: slli a3, a2, 2 -; NOREMOVAL-NEXT: sllw a1, a1, a3 ; NOREMOVAL-NEXT: negw a2, a2 +; NOREMOVAL-NEXT: sllw a0, a0, a3 ; NOREMOVAL-NEXT: andi a2, a2, -4 -; NOREMOVAL-NEXT: add a0, a0, a2 -; NOREMOVAL-NEXT: srliw a2, a1, 30 +; NOREMOVAL-NEXT: add a1, a1, a2 +; NOREMOVAL-NEXT: srliw a2, a0, 30 ; NOREMOVAL-NEXT: seqz a2, a2 ; NOREMOVAL-NEXT: slli a3, a2, 1 -; NOREMOVAL-NEXT: sllw a1, a1, a3 ; NOREMOVAL-NEXT: negw a2, a2 +; NOREMOVAL-NEXT: sllw a0, a0, a3 ; NOREMOVAL-NEXT: andi a2, a2, -2 -; NOREMOVAL-NEXT: add a0, a0, a2 -; NOREMOVAL-NEXT: not a1, a1 -; NOREMOVAL-NEXT: srli a1, a1, 31 -; NOREMOVAL-NEXT: addw a0, a0, a1 +; NOREMOVAL-NEXT: add a1, a1, a2 +; NOREMOVAL-NEXT: not a0, a0 +; NOREMOVAL-NEXT: srli a0, a0, 31 +; NOREMOVAL-NEXT: addw a0, a1, a0 ; NOREMOVAL-NEXT: .LBB18_4: # %cleanup ; NOREMOVAL-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/shift-amount-mod.ll b/llvm/test/CodeGen/RISCV/shift-amount-mod.ll index 66a8b85313e9b..1e893d9baa494 100644 --- a/llvm/test/CodeGen/RISCV/shift-amount-mod.ll +++ b/llvm/test/CodeGen/RISCV/shift-amount-mod.ll @@ -250,21 +250,21 @@ define i64 @ashr_by_masked_complemented_64(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: li a2, 63 ; RV32I-NEXT: 
sub a2, a2, a0 -; RV32I-NEXT: andi a3, a2, 63 -; RV32I-NEXT: addi a4, a3, -32 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltz a4, .LBB9_2 +; RV32I-NEXT: andi a2, a2, 63 +; RV32I-NEXT: addi a3, a2, -32 +; RV32I-NEXT: bltz a3, .LBB9_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srai a1, a2, 31 -; RV32I-NEXT: sra a0, a2, a3 +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: sra a0, a0, a2 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB9_2: -; RV32I-NEXT: not a4, a0 -; RV32I-NEXT: sra a1, a2, a4 -; RV32I-NEXT: srl a0, a0, a4 -; RV32I-NEXT: not a3, a3 -; RV32I-NEXT: slli a2, a2, 1 -; RV32I-NEXT: sll a2, a2, a3 +; RV32I-NEXT: not a3, a0 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: slli a4, a1, 1 +; RV32I-NEXT: sra a1, a1, a3 +; RV32I-NEXT: srl a0, a0, a3 +; RV32I-NEXT: sll a2, a4, a2 ; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/shift-and.ll b/llvm/test/CodeGen/RISCV/shift-and.ll index 525ef624179c6..c9efeea980f5a 100644 --- a/llvm/test/CodeGen/RISCV/shift-and.ll +++ b/llvm/test/CodeGen/RISCV/shift-and.ll @@ -67,8 +67,8 @@ define i64 @test4(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a1, 26 ; RV32I-NEXT: srli a0, a0, 6 -; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: srli a1, a1, 6 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: lui a2, 1048572 ; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: ret @@ -105,8 +105,8 @@ define i64 @test6(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 26 ; RV32I-NEXT: slli a1, a1, 6 -; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: srli a0, a0, 10 +; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 16 ; RV32I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll index 2bec1fca5c43b..249dabba0cc28 100644 --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -161,37 +161,37 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: mv a6, sp ; RV32I-NEXT: sw a3, 0(sp) ; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: srli a1, a2, 3 +; RV32I-NEXT: andi a3, a2, 31 ; RV32I-NEXT: andi a1, a1, 12 -; RV32I-NEXT: mv a3, sp -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: add a1, a6, a1 +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: srl a3, a3, a2 -; RV32I-NEXT: slli a6, a4, 1 -; RV32I-NEXT: andi a7, a2, 31 -; RV32I-NEXT: xori a7, a7, 31 -; RV32I-NEXT: sll a6, a6, a7 -; RV32I-NEXT: or a3, a3, a6 ; RV32I-NEXT: srl a4, a4, a2 -; RV32I-NEXT: slli a6, a5, 1 -; RV32I-NEXT: sll a6, a6, a7 -; RV32I-NEXT: or a4, a4, a6 +; RV32I-NEXT: slli a7, a5, 1 ; RV32I-NEXT: srl a5, a5, a2 -; RV32I-NEXT: slli a6, a1, 1 -; RV32I-NEXT: sll a6, a6, a7 -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: slli t0, a6, 1 +; RV32I-NEXT: srl a6, a6, a2 +; RV32I-NEXT: srl a2, a1, a2 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: sll a7, a7, a3 +; RV32I-NEXT: sll t0, t0, a3 +; RV32I-NEXT: sll a1, a1, a3 +; RV32I-NEXT: or a3, a4, a7 +; RV32I-NEXT: or a4, a5, t0 +; RV32I-NEXT: or a1, a6, a1 ; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a4, 4(a0) -; RV32I-NEXT: sw a5, 8(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a1, 8(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; @@ -226,42 +226,42 @@ define i128 @ashr128(i128 %a, i128 
%b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a5, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: mv a6, sp ; RV32I-NEXT: sw a3, 0(sp) ; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: srli a3, a2, 3 +; RV32I-NEXT: andi a4, a2, 31 ; RV32I-NEXT: sw a1, 16(sp) ; RV32I-NEXT: sw a1, 20(sp) ; RV32I-NEXT: sw a1, 24(sp) ; RV32I-NEXT: sw a1, 28(sp) -; RV32I-NEXT: srli a1, a2, 3 -; RV32I-NEXT: andi a1, a1, 12 -; RV32I-NEXT: mv a3, sp -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a5, 8(a1) -; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: srl a3, a3, a2 -; RV32I-NEXT: slli a6, a4, 1 -; RV32I-NEXT: andi a7, a2, 31 -; RV32I-NEXT: xori a7, a7, 31 -; RV32I-NEXT: sll a6, a6, a7 -; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: andi a3, a3, 12 +; RV32I-NEXT: xori a1, a4, 31 +; RV32I-NEXT: add a3, a6, a3 +; RV32I-NEXT: lw a4, 0(a3) +; RV32I-NEXT: lw a5, 4(a3) +; RV32I-NEXT: lw a6, 8(a3) +; RV32I-NEXT: lw a3, 12(a3) ; RV32I-NEXT: srl a4, a4, a2 -; RV32I-NEXT: slli a6, a5, 1 -; RV32I-NEXT: sll a6, a6, a7 -; RV32I-NEXT: or a4, a4, a6 +; RV32I-NEXT: slli a7, a5, 1 ; RV32I-NEXT: srl a5, a5, a2 -; RV32I-NEXT: slli a6, a1, 1 -; RV32I-NEXT: sll a6, a6, a7 -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: sra a1, a1, a2 +; RV32I-NEXT: slli t0, a6, 1 +; RV32I-NEXT: srl a6, a6, a2 +; RV32I-NEXT: sra a2, a3, a2 +; RV32I-NEXT: slli a3, a3, 1 +; RV32I-NEXT: sll a7, a7, a1 +; RV32I-NEXT: sll t0, t0, a1 +; RV32I-NEXT: sll a1, a3, a1 +; RV32I-NEXT: or a3, a4, a7 +; RV32I-NEXT: or a4, a5, t0 +; RV32I-NEXT: or a1, a6, a1 ; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a4, 4(a0) -; RV32I-NEXT: sw a5, 8(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a1, 8(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; @@ -300,37 +300,37 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: addi a6, sp, 16 ; RV32I-NEXT: sw a3, 16(sp) ; RV32I-NEXT: sw a4, 20(sp) ; RV32I-NEXT: sw a5, 24(sp) ; RV32I-NEXT: sw a1, 28(sp) ; RV32I-NEXT: srli a1, a2, 3 +; RV32I-NEXT: andi a3, a2, 31 ; RV32I-NEXT: andi a1, a1, 12 -; RV32I-NEXT: addi a3, sp, 16 -; RV32I-NEXT: sub a3, a3, a1 -; RV32I-NEXT: lw a1, 4(a3) -; RV32I-NEXT: lw a4, 0(a3) -; RV32I-NEXT: lw a5, 8(a3) -; RV32I-NEXT: lw a3, 12(a3) -; RV32I-NEXT: sll a6, a1, a2 -; RV32I-NEXT: srli a7, a4, 1 -; RV32I-NEXT: andi t0, a2, 31 -; RV32I-NEXT: xori t0, t0, 31 -; RV32I-NEXT: srl a7, a7, t0 -; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: sub a1, a6, a1 +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: sll a7, a5, a2 -; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: srl a1, a1, t0 -; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: sll a3, a3, a2 +; RV32I-NEXT: srli t0, a4, 1 +; RV32I-NEXT: sll a1, a1, a2 +; RV32I-NEXT: sll a4, a4, a2 +; RV32I-NEXT: sll a2, a6, a2 ; RV32I-NEXT: srli a5, a5, 1 -; RV32I-NEXT: srl a5, a5, t0 -; RV32I-NEXT: or a3, a3, a5 -; RV32I-NEXT: sll a2, a4, a2 -; RV32I-NEXT: sw a2, 0(a0) +; RV32I-NEXT: srli a6, a6, 1 +; RV32I-NEXT: srl t0, t0, a3 +; RV32I-NEXT: srl a5, a5, a3 +; RV32I-NEXT: srl a3, a6, a3 +; RV32I-NEXT: or a6, a7, t0 +; RV32I-NEXT: or a2, a2, a5 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: sw a4, 0(a0) ; RV32I-NEXT: sw a6, 4(a0) -; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a2, 8(a0) +; 
RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; @@ -359,24 +359,24 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind { ; RV32I-LABEL: fshr64_minsize: ; RV32I: # %bb.0: -; RV32I-NEXT: andi a4, a2, 32 +; RV32I-NEXT: andi a5, a2, 32 ; RV32I-NEXT: mv a3, a0 -; RV32I-NEXT: beqz a4, .LBB9_2 +; RV32I-NEXT: beqz a5, .LBB9_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: .LBB9_2: -; RV32I-NEXT: srl a5, a3, a2 -; RV32I-NEXT: beqz a4, .LBB9_4 +; RV32I-NEXT: srl a4, a3, a2 +; RV32I-NEXT: beqz a5, .LBB9_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: .LBB9_4: ; RV32I-NEXT: slli a0, a1, 1 -; RV32I-NEXT: not a4, a2 -; RV32I-NEXT: sll a0, a0, a4 -; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: not a5, a2 ; RV32I-NEXT: srl a1, a1, a2 ; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: sll a2, a3, a4 +; RV32I-NEXT: sll a0, a0, a5 +; RV32I-NEXT: sll a2, a3, a5 +; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: ret ; @@ -395,90 +395,90 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV32I-LABEL: fshr128_minsize: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a2, 0(a2) -; RV32I-NEXT: lw t2, 0(a1) +; RV32I-NEXT: lw t1, 0(a1) ; RV32I-NEXT: lw a7, 4(a1) -; RV32I-NEXT: lw a3, 8(a1) +; RV32I-NEXT: lw a4, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: andi t1, a2, 64 +; RV32I-NEXT: andi t2, a2, 64 ; RV32I-NEXT: mv t0, a7 -; RV32I-NEXT: mv a4, t2 -; RV32I-NEXT: beqz t1, .LBB10_2 +; RV32I-NEXT: mv a3, t1 +; RV32I-NEXT: beqz t2, .LBB10_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv t0, a1 -; RV32I-NEXT: mv a4, a3 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: .LBB10_2: ; RV32I-NEXT: andi a6, a2, 32 -; RV32I-NEXT: mv a5, a4 +; RV32I-NEXT: mv a5, a3 ; RV32I-NEXT: bnez a6, .LBB10_13 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: bnez t1, .LBB10_14 +; RV32I-NEXT: bnez t2, .LBB10_14 ; RV32I-NEXT: .LBB10_4: ; RV32I-NEXT: beqz a6, .LBB10_6 ; RV32I-NEXT: .LBB10_5: -; RV32I-NEXT: mv t0, a3 +; RV32I-NEXT: mv t0, a4 ; RV32I-NEXT: .LBB10_6: ; RV32I-NEXT: slli t3, t0, 1 -; RV32I-NEXT: not t2, a2 -; RV32I-NEXT: beqz t1, .LBB10_8 +; RV32I-NEXT: not t1, a2 +; RV32I-NEXT: beqz t2, .LBB10_8 ; RV32I-NEXT: # %bb.7: ; RV32I-NEXT: mv a1, a7 ; RV32I-NEXT: .LBB10_8: ; RV32I-NEXT: srl a7, a5, a2 -; RV32I-NEXT: sll t1, t3, t2 +; RV32I-NEXT: sll t2, t3, t1 ; RV32I-NEXT: srl t0, t0, a2 ; RV32I-NEXT: beqz a6, .LBB10_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a4, a1 ; RV32I-NEXT: .LBB10_10: -; RV32I-NEXT: or a7, t1, a7 -; RV32I-NEXT: slli t1, a3, 1 -; RV32I-NEXT: sll t1, t1, t2 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: srl a3, a3, a2 +; RV32I-NEXT: or a7, t2, a7 +; RV32I-NEXT: slli t2, a4, 1 +; RV32I-NEXT: sll t2, t2, t1 +; RV32I-NEXT: or t0, t2, t0 +; RV32I-NEXT: srl a4, a4, a2 ; RV32I-NEXT: beqz a6, .LBB10_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: .LBB10_12: -; RV32I-NEXT: slli a4, a1, 1 -; RV32I-NEXT: sll a4, a4, t2 -; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli a3, a1, 1 ; RV32I-NEXT: srl a1, a1, a2 ; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: sll a2, a5, t2 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: sll a2, a3, t1 +; RV32I-NEXT: sll a3, a5, t1 +; RV32I-NEXT: or a2, a2, a4 +; RV32I-NEXT: or a1, a3, a1 ; RV32I-NEXT: sw a7, 0(a0) ; RV32I-NEXT: sw t0, 4(a0) -; RV32I-NEXT: sw a3, 8(a0) +; RV32I-NEXT: sw a2, 8(a0) ; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB10_13: ; RV32I-NEXT: mv a5, t0 -; RV32I-NEXT: beqz t1, .LBB10_4 +; RV32I-NEXT: 
beqz t2, .LBB10_4 ; RV32I-NEXT: .LBB10_14: -; RV32I-NEXT: mv a3, t2 +; RV32I-NEXT: mv a4, t1 ; RV32I-NEXT: bnez a6, .LBB10_5 ; RV32I-NEXT: j .LBB10_6 ; ; RV64I-LABEL: fshr128_minsize: ; RV64I: # %bb.0: -; RV64I-NEXT: andi a4, a2, 64 +; RV64I-NEXT: andi a5, a2, 64 ; RV64I-NEXT: mv a3, a0 -; RV64I-NEXT: beqz a4, .LBB10_2 +; RV64I-NEXT: beqz a5, .LBB10_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a3, a1 ; RV64I-NEXT: .LBB10_2: -; RV64I-NEXT: srl a5, a3, a2 -; RV64I-NEXT: beqz a4, .LBB10_4 +; RV64I-NEXT: srl a4, a3, a2 +; RV64I-NEXT: beqz a5, .LBB10_4 ; RV64I-NEXT: # %bb.3: ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: .LBB10_4: ; RV64I-NEXT: slli a0, a1, 1 -; RV64I-NEXT: not a4, a2 -; RV64I-NEXT: sll a0, a0, a4 -; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: not a5, a2 ; RV64I-NEXT: srl a1, a1, a2 ; RV64I-NEXT: slli a3, a3, 1 -; RV64I-NEXT: sll a2, a3, a4 +; RV64I-NEXT: sll a0, a0, a5 +; RV64I-NEXT: sll a2, a3, a5 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: ret %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %b) diff --git a/llvm/test/CodeGen/RISCV/shl-cttz.ll b/llvm/test/CodeGen/RISCV/shl-cttz.ll index 64be997c191be..500673cc29196 100644 --- a/llvm/test/CodeGen/RISCV/shl-cttz.ll +++ b/llvm/test/CodeGen/RISCV/shl-cttz.ll @@ -125,18 +125,18 @@ define i16 @shl_cttz_i16(i16 %x, i16 %y) { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: addi a2, a1, -1 ; RV32I-NEXT: not a1, a1 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: srli a2, a1, 1 ; RV32I-NEXT: lui a3, 5 -; RV32I-NEXT: addi a3, a3, 1365 -; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: addi a2, a3, 1365 +; RV32I-NEXT: srli a3, a1, 1 +; RV32I-NEXT: and a2, a3, a2 +; RV32I-NEXT: lui a3, 3 +; RV32I-NEXT: addi a3, a3, 819 ; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: lui a2, 3 -; RV32I-NEXT: addi a2, a2, 819 -; RV32I-NEXT: and a3, a1, a2 +; RV32I-NEXT: and a2, a1, a3 ; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: srli a2, a1, 4 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: andi a2, a1, 15 @@ -156,18 +156,18 @@ define i16 @shl_cttz_i16(i16 %x, i16 %y) { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: addi a2, a1, -1 ; RV64I-NEXT: not a1, a1 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: srli a2, a1, 1 ; RV64I-NEXT: lui a3, 5 -; RV64I-NEXT: addiw a3, a3, 1365 -; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: addiw a2, a3, 1365 +; RV64I-NEXT: srli a3, a1, 1 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: lui a3, 3 +; RV64I-NEXT: addiw a3, a3, 819 ; RV64I-NEXT: sub a1, a1, a2 -; RV64I-NEXT: lui a2, 3 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: and a2, a1, a3 ; RV64I-NEXT: srli a1, a1, 2 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: add a1, a2, a1 ; RV64I-NEXT: srli a2, a1, 4 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: andi a2, a1, 15 @@ -193,18 +193,18 @@ define i16 @shl_cttz_constant_i16(i16 %y) { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: addi a1, a0, -1 ; RV32I-NEXT: not a0, a0 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: lui a2, 5 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: addi a1, a2, 1365 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: lui a2, 3 +; RV32I-NEXT: addi a2, a2, 819 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 3 -; RV32I-NEXT: addi a1, a1, 819 -; 
RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: and a1, a0, a2 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: andi a1, a0, 15 @@ -226,18 +226,18 @@ define i16 @shl_cttz_constant_i16(i16 %y) { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: addi a1, a0, -1 ; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 5 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: addiw a1, a2, 1365 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: lui a2, 3 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 3 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: and a1, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: andi a1, a0, 15 diff --git a/llvm/test/CodeGen/RISCV/shlimm-addimm.ll b/llvm/test/CodeGen/RISCV/shlimm-addimm.ll index ead71bcbe113c..c842ba5da5208 100644 --- a/llvm/test/CodeGen/RISCV/shlimm-addimm.ll +++ b/llvm/test/CodeGen/RISCV/shlimm-addimm.ll @@ -48,10 +48,10 @@ define i64 @shl5_add1184_c(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 27 ; RV32I-NEXT: slli a1, a1, 5 +; RV32I-NEXT: slli a3, a0, 5 ; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: slli a2, a0, 5 -; RV32I-NEXT: addi a0, a2, 1184 -; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: addi a0, a3, 1184 +; RV32I-NEXT: sltu a2, a0, a3 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; @@ -112,12 +112,12 @@ define i64 @shl5_add101024_c(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 27 ; RV32I-NEXT: slli a1, a1, 5 +; RV32I-NEXT: slli a3, a0, 5 ; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: slli a2, a0, 5 ; RV32I-NEXT: lui a0, 25 ; RV32I-NEXT: addi a0, a0, -1376 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: sltu a2, a0, a3 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; @@ -180,12 +180,12 @@ define i64 @shl5_add47968_c(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 27 ; RV32I-NEXT: slli a1, a1, 5 +; RV32I-NEXT: slli a3, a0, 5 ; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: slli a2, a0, 5 ; RV32I-NEXT: lui a0, 12 ; RV32I-NEXT: addi a0, a0, -1184 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: sltu a2, a0, a3 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; @@ -248,12 +248,12 @@ define i64 @shl5_add47969_c(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 27 ; RV32I-NEXT: slli a1, a1, 5 +; RV32I-NEXT: slli a3, a0, 5 ; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: slli a2, a0, 5 ; RV32I-NEXT: lui a0, 12 ; RV32I-NEXT: addi a0, a0, -1183 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: sltu a2, a0, a3 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; @@ -316,12 +316,12 @@ define i64 @shl5_sub47968_c(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 27 ; RV32I-NEXT: slli a1, a1, 5 +; RV32I-NEXT: slli a3, a0, 5 ; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: slli a2, a0, 5 ; RV32I-NEXT: lui a0, 1048564 ; RV32I-NEXT: addi a0, a0, 1184 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: sltu a2, 
a0, a3 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: addi a1, a1, -1 ; RV32I-NEXT: ret @@ -385,12 +385,12 @@ define i64 @shl5_sub47969_c(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a0, 27 ; RV32I-NEXT: slli a1, a1, 5 +; RV32I-NEXT: slli a3, a0, 5 ; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: slli a2, a0, 5 ; RV32I-NEXT: lui a0, 1048564 ; RV32I-NEXT: addi a0, a0, 1183 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: sltu a2, a0, a3 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: addi a1, a1, -1 ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll index c0c11fefafb55..b7b88584f3bdb 100644 --- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll @@ -69,39 +69,39 @@ define signext i32 @test3(i32 signext %v, i32 signext %w, i32 signext %x, i32 si ; ; RV64SFB-LABEL: test3: ; RV64SFB: # %bb.0: -; RV64SFB-NEXT: bnez a4, .LBB2_2 +; RV64SFB-NEXT: beqz a4, .LBB2_2 ; RV64SFB-NEXT: # %bb.1: -; RV64SFB-NEXT: mv a0, a1 +; RV64SFB-NEXT: mv a2, a3 ; RV64SFB-NEXT: .LBB2_2: -; RV64SFB-NEXT: beqz a4, .LBB2_4 +; RV64SFB-NEXT: bnez a4, .LBB2_4 ; RV64SFB-NEXT: # %bb.3: -; RV64SFB-NEXT: mv a2, a3 +; RV64SFB-NEXT: mv a0, a1 ; RV64SFB-NEXT: .LBB2_4: ; RV64SFB-NEXT: addw a0, a0, a2 ; RV64SFB-NEXT: ret ; ; ZICOND-LABEL: test3: ; ZICOND: # %bb.0: -; ZICOND-NEXT: bnez a4, .LBB2_2 +; ZICOND-NEXT: beqz a4, .LBB2_2 ; ZICOND-NEXT: # %bb.1: -; ZICOND-NEXT: mv a0, a1 +; ZICOND-NEXT: mv a2, a3 ; ZICOND-NEXT: .LBB2_2: -; ZICOND-NEXT: beqz a4, .LBB2_4 +; ZICOND-NEXT: bnez a4, .LBB2_4 ; ZICOND-NEXT: # %bb.3: -; ZICOND-NEXT: mv a2, a3 +; ZICOND-NEXT: mv a0, a1 ; ZICOND-NEXT: .LBB2_4: ; ZICOND-NEXT: addw a0, a0, a2 ; ZICOND-NEXT: ret ; ; RV32SFB-LABEL: test3: ; RV32SFB: # %bb.0: -; RV32SFB-NEXT: bnez a4, .LBB2_2 +; RV32SFB-NEXT: beqz a4, .LBB2_2 ; RV32SFB-NEXT: # %bb.1: -; RV32SFB-NEXT: mv a0, a1 +; RV32SFB-NEXT: mv a2, a3 ; RV32SFB-NEXT: .LBB2_2: -; RV32SFB-NEXT: beqz a4, .LBB2_4 +; RV32SFB-NEXT: bnez a4, .LBB2_4 ; RV32SFB-NEXT: # %bb.3: -; RV32SFB-NEXT: mv a2, a3 +; RV32SFB-NEXT: mv a0, a1 ; RV32SFB-NEXT: .LBB2_4: ; RV32SFB-NEXT: add a0, a0, a2 ; RV32SFB-NEXT: ret @@ -566,18 +566,18 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3 ; RV64SFB-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64SFB-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64SFB-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64SFB-NEXT: mv s0, a1 +; RV64SFB-NEXT: mv s1, a1 ; RV64SFB-NEXT: andi a0, a0, 1 -; RV64SFB-NEXT: mv s1, a2 +; RV64SFB-NEXT: mv s0, a2 ; RV64SFB-NEXT: beqz a0, .LBB16_4 ; RV64SFB-NEXT: # %bb.3: # %bb -; RV64SFB-NEXT: addw s0, a1, a3 +; RV64SFB-NEXT: addw s1, a1, a3 ; RV64SFB-NEXT: .LBB16_4: # %bb ; RV64SFB-NEXT: .LBB16_1: # %bb2 ; RV64SFB-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64SFB-NEXT: mv a0, s0 +; RV64SFB-NEXT: mv a0, s1 ; RV64SFB-NEXT: call bar -; RV64SFB-NEXT: sllw s0, s0, s1 +; RV64SFB-NEXT: sllw s1, s1, s0 ; RV64SFB-NEXT: bnez a0, .LBB16_1 ; RV64SFB-NEXT: # %bb.2: # %bb7 ; RV64SFB-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -592,18 +592,18 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3 ; ZICOND-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; ZICOND-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; ZICOND-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; ZICOND-NEXT: mv s0, a1 +; ZICOND-NEXT: mv s1, a1 ; ZICOND-NEXT: andi a0, a0, 1 -; ZICOND-NEXT: mv s1, a2 +; ZICOND-NEXT: mv s0, a2 
; ZICOND-NEXT: beqz a0, .LBB16_4 ; ZICOND-NEXT: # %bb.3: # %bb -; ZICOND-NEXT: addw s0, a1, a3 +; ZICOND-NEXT: addw s1, a1, a3 ; ZICOND-NEXT: .LBB16_4: # %bb ; ZICOND-NEXT: .LBB16_1: # %bb2 ; ZICOND-NEXT: # =>This Inner Loop Header: Depth=1 -; ZICOND-NEXT: mv a0, s0 +; ZICOND-NEXT: mv a0, s1 ; ZICOND-NEXT: call bar -; ZICOND-NEXT: sllw s0, s0, s1 +; ZICOND-NEXT: sllw s1, s1, s0 ; ZICOND-NEXT: bnez a0, .LBB16_1 ; ZICOND-NEXT: # %bb.2: # %bb7 ; ZICOND-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -618,18 +618,18 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3 ; RV32SFB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32SFB-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32SFB-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32SFB-NEXT: mv s0, a1 +; RV32SFB-NEXT: mv s1, a1 ; RV32SFB-NEXT: andi a0, a0, 1 -; RV32SFB-NEXT: mv s1, a2 +; RV32SFB-NEXT: mv s0, a2 ; RV32SFB-NEXT: beqz a0, .LBB16_4 ; RV32SFB-NEXT: # %bb.3: # %bb -; RV32SFB-NEXT: add s0, a1, a3 +; RV32SFB-NEXT: add s1, a1, a3 ; RV32SFB-NEXT: .LBB16_4: # %bb ; RV32SFB-NEXT: .LBB16_1: # %bb2 ; RV32SFB-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32SFB-NEXT: mv a0, s0 +; RV32SFB-NEXT: mv a0, s1 ; RV32SFB-NEXT: call bar -; RV32SFB-NEXT: sll s0, s0, s1 +; RV32SFB-NEXT: sll s1, s1, s0 ; RV32SFB-NEXT: bnez a0, .LBB16_1 ; RV32SFB-NEXT: # %bb.2: # %bb7 ; RV32SFB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -1223,14 +1223,14 @@ define i64 @select_slli(i64 %A, i64 %C, i1 zeroext %cond) { ; RV32SFB-LABEL: select_slli: ; RV32SFB: # %bb.0: # %entry ; RV32SFB-NEXT: mv a1, a0 -; RV32SFB-NEXT: mv a0, a2 -; RV32SFB-NEXT: beqz a4, .LBB28_2 +; RV32SFB-NEXT: bnez a4, .LBB28_2 ; RV32SFB-NEXT: # %bb.1: # %entry -; RV32SFB-NEXT: mv a1, a3 +; RV32SFB-NEXT: li a2, 0 ; RV32SFB-NEXT: .LBB28_2: # %entry -; RV32SFB-NEXT: bnez a4, .LBB28_4 +; RV32SFB-NEXT: mv a0, a2 +; RV32SFB-NEXT: beqz a4, .LBB28_4 ; RV32SFB-NEXT: # %bb.3: # %entry -; RV32SFB-NEXT: li a0, 0 +; RV32SFB-NEXT: mv a1, a3 ; RV32SFB-NEXT: .LBB28_4: # %entry ; RV32SFB-NEXT: ret entry: @@ -1567,11 +1567,11 @@ define i64 @select_andn(i64 %A, i64 %B, i64 %C, i1 zeroext %cond) { ; RV32SFB: # %bb.0: # %entry ; RV32SFB-NEXT: bnez a6, .LBB36_2 ; RV32SFB-NEXT: # %bb.1: # %entry -; RV32SFB-NEXT: andn a4, a0, a2 +; RV32SFB-NEXT: andn a5, a1, a3 ; RV32SFB-NEXT: .LBB36_2: # %entry ; RV32SFB-NEXT: bnez a6, .LBB36_4 ; RV32SFB-NEXT: # %bb.3: # %entry -; RV32SFB-NEXT: andn a5, a1, a3 +; RV32SFB-NEXT: andn a4, a0, a2 ; RV32SFB-NEXT: .LBB36_4: # %entry ; RV32SFB-NEXT: mv a0, a4 ; RV32SFB-NEXT: mv a1, a5 @@ -1615,11 +1615,11 @@ define i64 @select_orn(i64 %A, i64 %B, i64 %C, i1 zeroext %cond) { ; RV32SFB: # %bb.0: # %entry ; RV32SFB-NEXT: bnez a6, .LBB37_2 ; RV32SFB-NEXT: # %bb.1: # %entry -; RV32SFB-NEXT: orn a4, a0, a2 +; RV32SFB-NEXT: orn a5, a1, a3 ; RV32SFB-NEXT: .LBB37_2: # %entry ; RV32SFB-NEXT: bnez a6, .LBB37_4 ; RV32SFB-NEXT: # %bb.3: # %entry -; RV32SFB-NEXT: orn a5, a1, a3 +; RV32SFB-NEXT: orn a4, a0, a2 ; RV32SFB-NEXT: .LBB37_4: # %entry ; RV32SFB-NEXT: mv a0, a4 ; RV32SFB-NEXT: mv a1, a5 @@ -1663,11 +1663,11 @@ define i64 @select_xnor(i64 %A, i64 %B, i64 %C, i1 zeroext %cond) { ; RV32SFB: # %bb.0: # %entry ; RV32SFB-NEXT: bnez a6, .LBB38_2 ; RV32SFB-NEXT: # %bb.1: # %entry -; RV32SFB-NEXT: xnor a4, a0, a2 +; RV32SFB-NEXT: xnor a5, a1, a3 ; RV32SFB-NEXT: .LBB38_2: # %entry ; RV32SFB-NEXT: bnez a6, .LBB38_4 ; RV32SFB-NEXT: # %bb.3: # %entry -; RV32SFB-NEXT: xnor a5, a1, a3 +; RV32SFB-NEXT: xnor a4, a0, a2 ; RV32SFB-NEXT: .LBB38_4: # %entry ; RV32SFB-NEXT: mv a0, a4 ; RV32SFB-NEXT: 
mv a1, a5 diff --git a/llvm/test/CodeGen/RISCV/signed-truncation-check.ll b/llvm/test/CodeGen/RISCV/signed-truncation-check.ll index 54b85fab757ca..d43dfd46d62fc 100644 --- a/llvm/test/CodeGen/RISCV/signed-truncation-check.ll +++ b/llvm/test/CodeGen/RISCV/signed-truncation-check.ll @@ -25,8 +25,8 @@ define i1 @shifts_eqcmp_i16_i8(i16 %x) nounwind { ; RV32I-LABEL: shifts_eqcmp_i16_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: srli a1, a1, 16 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srli a1, a1, 16 ; RV32I-NEXT: srai a0, a0, 8 ; RV32I-NEXT: srli a0, a0, 16 ; RV32I-NEXT: xor a0, a0, a1 @@ -36,8 +36,8 @@ define i1 @shifts_eqcmp_i16_i8(i16 %x) nounwind { ; RV64I-LABEL: shifts_eqcmp_i16_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 48 -; RV64I-NEXT: srli a1, a1, 48 ; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srli a1, a1, 48 ; RV64I-NEXT: srai a0, a0, 8 ; RV64I-NEXT: srli a0, a0, 48 ; RV64I-NEXT: xor a0, a0, a1 @@ -459,10 +459,10 @@ define i1 @add_ugecmp_i64_i8(i64 %x) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: addi a2, a0, -128 ; RV32I-NEXT: sltu a0, a2, a0 +; RV32I-NEXT: sltiu a2, a2, -256 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: seqz a0, a0 -; RV32I-NEXT: sltiu a1, a2, -256 -; RV32I-NEXT: xori a1, a1, 1 +; RV32I-NEXT: xori a1, a2, 1 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: ret ; @@ -794,9 +794,9 @@ define i1 @add_ultcmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind { ; RV32I-LABEL: add_ultcmp_bad_i16_i8_cmp: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a0, a0, 128 ; RV32I-NEXT: addi a2, a2, -1 ; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: addi a0, a0, 128 ; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: sltu a0, a0, a1 ; RV32I-NEXT: ret @@ -804,9 +804,9 @@ define i1 @add_ultcmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind { ; RV64I-LABEL: add_ultcmp_bad_i16_i8_cmp: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addi a0, a0, 128 ; RV64I-NEXT: addiw a2, a2, -1 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: addi a0, a0, 128 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: sltu a0, a0, a1 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll index 284b559eeec5f..8f5b044c3b3b8 100644 --- a/llvm/test/CodeGen/RISCV/split-offsets.ll +++ b/llvm/test/CodeGen/RISCV/split-offsets.ll @@ -11,32 +11,32 @@ define void @test1(ptr %sp, ptr %t, i32 %n) { ; RV32I-LABEL: test1: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: lui a2, 20 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: addi a2, a2, -1920 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: li a2, 2 -; RV32I-NEXT: li a3, 1 -; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a3, 0(a1) -; RV32I-NEXT: sw a2, 4(a1) +; RV32I-NEXT: li a2, 1 +; RV32I-NEXT: sw a3, 0(a0) +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a2, 0(a1) +; RV32I-NEXT: sw a3, 4(a1) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test1: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: lui a2, 20 +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: addiw a2, a2, -1920 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: add a0, a0, a2 -; RV64I-NEXT: li a2, 2 -; RV64I-NEXT: li a3, 1 -; RV64I-NEXT: sw a2, 0(a0) -; RV64I-NEXT: sw a3, 4(a0) -; RV64I-NEXT: sw a3, 0(a1) -; RV64I-NEXT: sw a2, 4(a1) +; RV64I-NEXT: li a2, 1 +; RV64I-NEXT: sw a3, 0(a0) +; RV64I-NEXT: sw a2, 4(a0) +; RV64I-NEXT: sw a2, 0(a1) +; RV64I-NEXT: sw a3, 4(a1) ; RV64I-NEXT: ret entry: %s = load ptr, ptr %sp @@ -125,23 +125,23 @@ define void 
@test3(ptr %t) { ; RV32I-LABEL: test3: ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lui a1, 20 +; RV32I-NEXT: li a2, 2 ; RV32I-NEXT: addi a1, a1, -1920 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: li a1, 2 -; RV32I-NEXT: li a2, 3 -; RV32I-NEXT: sw a1, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) +; RV32I-NEXT: li a1, 3 +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a1, 8(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test3: ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: lui a1, 20 +; RV64I-NEXT: li a2, 2 ; RV64I-NEXT: addiw a1, a1, -1920 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: li a1, 2 -; RV64I-NEXT: li a2, 3 -; RV64I-NEXT: sw a1, 4(a0) -; RV64I-NEXT: sw a2, 8(a0) +; RV64I-NEXT: li a1, 3 +; RV64I-NEXT: sw a2, 4(a0) +; RV64I-NEXT: sw a1, 8(a0) ; RV64I-NEXT: ret entry: %splitgep = getelementptr i8, ptr %t, i64 80000 diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll index 5fa802b7f27ca..83ae03452db5b 100644 --- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll @@ -8,52 +8,52 @@ define iXLen2 @test_udiv_3(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_3: ; RV32: # %bb.0: ; RV32-NEXT: add a2, a0, a1 -; RV32-NEXT: sltu a3, a2, a0 -; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: lui a3, 699051 -; RV32-NEXT: addi a4, a3, -1365 -; RV32-NEXT: mulhu a5, a2, a4 -; RV32-NEXT: srli a6, a5, 1 -; RV32-NEXT: andi a5, a5, -2 -; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: sub a2, a2, a5 -; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: sltu a4, a2, a0 +; RV32-NEXT: addi a5, a3, -1365 ; RV32-NEXT: addi a3, a3, -1366 -; RV32-NEXT: mul a3, a5, a3 -; RV32-NEXT: mulhu a6, a5, a4 -; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: add a2, a2, a4 +; RV32-NEXT: mulhu a4, a2, a5 +; RV32-NEXT: srli a6, a4, 1 +; RV32-NEXT: andi a4, a4, -2 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: sub a2, a2, a4 +; RV32-NEXT: sub a4, a0, a2 ; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mul a2, a4, a3 +; RV32-NEXT: mulhu a3, a4, a5 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a1, a1, a4 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: mul a0, a5, a4 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: mul a1, a1, a5 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: mul a0, a4, a5 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_3: ; RV64: # %bb.0: ; RV64-NEXT: add a2, a0, a1 -; RV64-NEXT: sltu a3, a2, a0 -; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: lui a3, 699051 +; RV64-NEXT: lui a4, %hi(.LCPI0_0) +; RV64-NEXT: sltu a5, a2, a0 ; RV64-NEXT: addiw a3, a3, -1365 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: mulhu a4, a2, a3 -; RV64-NEXT: srli a5, a4, 1 -; RV64-NEXT: andi a4, a4, -2 -; RV64-NEXT: lui a6, %hi(.LCPI0_0) -; RV64-NEXT: ld a6, %lo(.LCPI0_0)(a6) -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: sub a2, a2, a4 -; RV64-NEXT: sub a4, a0, a2 -; RV64-NEXT: mul a5, a4, a6 -; RV64-NEXT: mulhu a6, a4, a3 -; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: ld a4, %lo(.LCPI0_0)(a4) +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: mulhu a5, a2, a3 +; RV64-NEXT: srli a6, a5, 1 +; RV64-NEXT: andi a5, a5, -2 +; RV64-NEXT: add a5, a5, a6 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: sub a5, a0, a2 ; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: mul a2, a5, a4 +; RV64-NEXT: mulhu a4, a5, a3 ; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: add a2, a4, a2 ; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, a5, a1 -; RV64-NEXT: mul a0, a4, a3 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: mul a0, a5, a3 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 3 ret iXLen2 %a @@ -63,52 
+63,52 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_5: ; RV32: # %bb.0: ; RV32-NEXT: add a2, a0, a1 -; RV32-NEXT: sltu a3, a2, a0 -; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: lui a3, 838861 -; RV32-NEXT: addi a4, a3, -819 -; RV32-NEXT: mulhu a5, a2, a4 -; RV32-NEXT: srli a6, a5, 2 -; RV32-NEXT: andi a5, a5, -4 -; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: sub a2, a2, a5 -; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: sltu a4, a2, a0 +; RV32-NEXT: addi a5, a3, -819 ; RV32-NEXT: addi a3, a3, -820 -; RV32-NEXT: mul a3, a5, a3 -; RV32-NEXT: mulhu a6, a5, a4 -; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: add a2, a2, a4 +; RV32-NEXT: mulhu a4, a2, a5 +; RV32-NEXT: srli a6, a4, 2 +; RV32-NEXT: andi a4, a4, -4 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: sub a2, a2, a4 +; RV32-NEXT: sub a4, a0, a2 ; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mul a2, a4, a3 +; RV32-NEXT: mulhu a3, a4, a5 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a1, a1, a4 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: mul a0, a5, a4 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: mul a1, a1, a5 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: mul a0, a4, a5 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_5: ; RV64: # %bb.0: ; RV64-NEXT: add a2, a0, a1 -; RV64-NEXT: sltu a3, a2, a0 -; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: lui a3, 838861 +; RV64-NEXT: lui a4, %hi(.LCPI1_0) +; RV64-NEXT: sltu a5, a2, a0 ; RV64-NEXT: addiw a3, a3, -819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: mulhu a4, a2, a3 -; RV64-NEXT: srli a5, a4, 2 -; RV64-NEXT: andi a4, a4, -4 -; RV64-NEXT: lui a6, %hi(.LCPI1_0) -; RV64-NEXT: ld a6, %lo(.LCPI1_0)(a6) -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: sub a2, a2, a4 -; RV64-NEXT: sub a4, a0, a2 -; RV64-NEXT: mul a5, a4, a6 -; RV64-NEXT: mulhu a6, a4, a3 -; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: ld a4, %lo(.LCPI1_0)(a4) +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: mulhu a5, a2, a3 +; RV64-NEXT: srli a6, a5, 2 +; RV64-NEXT: andi a5, a5, -4 +; RV64-NEXT: add a5, a5, a6 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: sub a5, a0, a2 ; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: mul a2, a5, a4 +; RV64-NEXT: mulhu a4, a5, a3 ; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: add a2, a4, a2 ; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, a5, a1 -; RV64-NEXT: mul a0, a4, a3 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: mul a0, a5, a3 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 5 ret iXLen2 %a @@ -170,57 +170,57 @@ define iXLen2 @test_udiv_15(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_15: ; RV32: # %bb.0: ; RV32-NEXT: add a2, a0, a1 -; RV32-NEXT: sltu a3, a2, a0 -; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: lui a3, 559241 +; RV32-NEXT: lui a4, 978671 +; RV32-NEXT: sltu a5, a2, a0 ; RV32-NEXT: addi a3, a3, -1911 +; RV32-NEXT: addi a6, a4, -274 +; RV32-NEXT: addi a4, a4, -273 +; RV32-NEXT: add a2, a2, a5 ; RV32-NEXT: mulhu a3, a2, a3 ; RV32-NEXT: srli a3, a3, 3 -; RV32-NEXT: slli a4, a3, 4 -; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: slli a5, a3, 4 +; RV32-NEXT: sub a3, a3, a5 ; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: sub a3, a0, a2 -; RV32-NEXT: lui a4, 978671 -; RV32-NEXT: addi a5, a4, -274 -; RV32-NEXT: mul a5, a3, a5 -; RV32-NEXT: addi a4, a4, -273 -; RV32-NEXT: mulhu a6, a3, a4 -; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mul a2, a3, a6 +; RV32-NEXT: mulhu a5, a3, a4 ; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: add a2, a5, a2 ; RV32-NEXT: mul a1, a1, a4 -; RV32-NEXT: add a1, a5, a1 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: mul a0, a3, a4 ; 
RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_15: ; RV64: # %bb.0: ; RV64-NEXT: add a2, a0, a1 -; RV64-NEXT: sltu a3, a2, a0 -; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: lui a3, 559241 +; RV64-NEXT: lui a4, %hi(.LCPI4_0) +; RV64-NEXT: lui a5, 978671 +; RV64-NEXT: sltu a6, a2, a0 ; RV64-NEXT: addiw a3, a3, -1911 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: ld a4, %lo(.LCPI4_0)(a4) +; RV64-NEXT: addiw a5, a5, -273 +; RV64-NEXT: add a2, a2, a6 +; RV64-NEXT: slli a6, a3, 32 +; RV64-NEXT: add a3, a3, a6 +; RV64-NEXT: slli a6, a5, 32 +; RV64-NEXT: add a5, a5, a6 ; RV64-NEXT: mulhu a3, a2, a3 ; RV64-NEXT: srli a3, a3, 3 -; RV64-NEXT: slli a4, a3, 4 -; RV64-NEXT: lui a5, %hi(.LCPI4_0) -; RV64-NEXT: ld a5, %lo(.LCPI4_0)(a5) -; RV64-NEXT: sub a3, a3, a4 +; RV64-NEXT: slli a6, a3, 4 +; RV64-NEXT: sub a3, a3, a6 ; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: sub a3, a0, a2 -; RV64-NEXT: mul a4, a3, a5 -; RV64-NEXT: lui a5, 978671 -; RV64-NEXT: addiw a5, a5, -273 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: mulhu a6, a3, a5 -; RV64-NEXT: add a4, a6, a4 ; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: mul a2, a3, a4 +; RV64-NEXT: mulhu a4, a3, a5 ; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: add a2, a4, a2 ; RV64-NEXT: mul a1, a1, a5 -; RV64-NEXT: add a1, a4, a1 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: mul a0, a3, a5 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 15 @@ -231,52 +231,52 @@ define iXLen2 @test_udiv_17(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_17: ; RV32: # %bb.0: ; RV32-NEXT: add a2, a0, a1 -; RV32-NEXT: sltu a3, a2, a0 -; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: lui a3, 986895 -; RV32-NEXT: addi a4, a3, 241 -; RV32-NEXT: mulhu a5, a2, a4 -; RV32-NEXT: srli a6, a5, 4 -; RV32-NEXT: andi a5, a5, -16 -; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: sub a2, a2, a5 -; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: sltu a4, a2, a0 +; RV32-NEXT: addi a5, a3, 241 ; RV32-NEXT: addi a3, a3, 240 -; RV32-NEXT: mul a3, a5, a3 -; RV32-NEXT: mulhu a6, a5, a4 -; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: add a2, a2, a4 +; RV32-NEXT: mulhu a4, a2, a5 +; RV32-NEXT: srli a6, a4, 4 +; RV32-NEXT: andi a4, a4, -16 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: sub a2, a2, a4 +; RV32-NEXT: sub a4, a0, a2 ; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mul a2, a4, a3 +; RV32-NEXT: mulhu a3, a4, a5 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a1, a1, a4 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: mul a0, a5, a4 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: mul a1, a1, a5 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: mul a0, a4, a5 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_17: ; RV64: # %bb.0: ; RV64-NEXT: add a2, a0, a1 -; RV64-NEXT: sltu a3, a2, a0 -; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: lui a3, 986895 +; RV64-NEXT: lui a4, %hi(.LCPI5_0) +; RV64-NEXT: sltu a5, a2, a0 ; RV64-NEXT: addiw a3, a3, 241 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: mulhu a4, a2, a3 -; RV64-NEXT: srli a5, a4, 4 -; RV64-NEXT: andi a4, a4, -16 -; RV64-NEXT: lui a6, %hi(.LCPI5_0) -; RV64-NEXT: ld a6, %lo(.LCPI5_0)(a6) -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: sub a2, a2, a4 -; RV64-NEXT: sub a4, a0, a2 -; RV64-NEXT: mul a5, a4, a6 -; RV64-NEXT: mulhu a6, a4, a3 -; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: ld a4, %lo(.LCPI5_0)(a4) +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: mulhu a5, a2, a3 +; RV64-NEXT: srli a6, a5, 4 +; RV64-NEXT: andi a5, a5, -16 +; RV64-NEXT: add a5, a5, a6 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: sub a5, a0, a2 ; RV64-NEXT: sltu a0, a0, 
a2 +; RV64-NEXT: mul a2, a5, a4 +; RV64-NEXT: mulhu a4, a5, a3 ; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: add a2, a4, a2 ; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, a5, a1 -; RV64-NEXT: mul a0, a4, a3 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: mul a0, a5, a3 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 17 ret iXLen2 %a @@ -286,57 +286,57 @@ define iXLen2 @test_udiv_255(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_255: ; RV32: # %bb.0: ; RV32-NEXT: add a2, a0, a1 -; RV32-NEXT: sltu a3, a2, a0 -; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: lui a3, 526344 +; RV32-NEXT: lui a4, 1044464 +; RV32-NEXT: sltu a5, a2, a0 ; RV32-NEXT: addi a3, a3, 129 +; RV32-NEXT: addi a6, a4, -258 +; RV32-NEXT: addi a4, a4, -257 +; RV32-NEXT: add a2, a2, a5 ; RV32-NEXT: mulhu a3, a2, a3 ; RV32-NEXT: srli a3, a3, 7 -; RV32-NEXT: slli a4, a3, 8 -; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: slli a5, a3, 8 +; RV32-NEXT: sub a3, a3, a5 ; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: sub a3, a0, a2 -; RV32-NEXT: lui a4, 1044464 -; RV32-NEXT: addi a5, a4, -258 -; RV32-NEXT: mul a5, a3, a5 -; RV32-NEXT: addi a4, a4, -257 -; RV32-NEXT: mulhu a6, a3, a4 -; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mul a2, a3, a6 +; RV32-NEXT: mulhu a5, a3, a4 ; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: add a2, a5, a2 ; RV32-NEXT: mul a1, a1, a4 -; RV32-NEXT: add a1, a5, a1 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: mul a0, a3, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_255: ; RV64: # %bb.0: ; RV64-NEXT: add a2, a0, a1 -; RV64-NEXT: sltu a3, a2, a0 -; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: lui a3, 526344 +; RV64-NEXT: lui a4, %hi(.LCPI6_0) +; RV64-NEXT: lui a5, 1044464 +; RV64-NEXT: sltu a6, a2, a0 ; RV64-NEXT: addiw a3, a3, 129 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: ld a4, %lo(.LCPI6_0)(a4) +; RV64-NEXT: addiw a5, a5, -257 +; RV64-NEXT: add a2, a2, a6 +; RV64-NEXT: slli a6, a3, 32 +; RV64-NEXT: add a3, a3, a6 +; RV64-NEXT: slli a6, a5, 32 +; RV64-NEXT: add a5, a5, a6 ; RV64-NEXT: mulhu a3, a2, a3 ; RV64-NEXT: srli a3, a3, 7 -; RV64-NEXT: slli a4, a3, 8 -; RV64-NEXT: lui a5, %hi(.LCPI6_0) -; RV64-NEXT: ld a5, %lo(.LCPI6_0)(a5) -; RV64-NEXT: sub a3, a3, a4 +; RV64-NEXT: slli a6, a3, 8 +; RV64-NEXT: sub a3, a3, a6 ; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: sub a3, a0, a2 -; RV64-NEXT: mul a4, a3, a5 -; RV64-NEXT: lui a5, 1044464 -; RV64-NEXT: addiw a5, a5, -257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: mulhu a6, a3, a5 -; RV64-NEXT: add a4, a6, a4 ; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: mul a2, a3, a4 +; RV64-NEXT: mulhu a4, a3, a5 ; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: add a2, a4, a2 ; RV64-NEXT: mul a1, a1, a5 -; RV64-NEXT: add a1, a4, a1 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: mul a0, a3, a5 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 255 @@ -347,52 +347,52 @@ define iXLen2 @test_udiv_257(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_257: ; RV32: # %bb.0: ; RV32-NEXT: add a2, a0, a1 -; RV32-NEXT: sltu a3, a2, a0 -; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: lui a3, 1044496 -; RV32-NEXT: addi a4, a3, -255 -; RV32-NEXT: mulhu a5, a2, a4 -; RV32-NEXT: srli a6, a5, 8 -; RV32-NEXT: andi a5, a5, -256 -; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: sub a2, a2, a5 -; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: sltu a4, a2, a0 +; RV32-NEXT: addi a5, a3, -255 ; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: mul a3, a5, a3 -; RV32-NEXT: mulhu a6, a5, a4 -; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: add a2, a2, a4 +; RV32-NEXT: mulhu a4, a2, a5 +; RV32-NEXT: srli a6, a4, 8 +; RV32-NEXT: andi a4, 
a4, -256 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: sub a2, a2, a4 +; RV32-NEXT: sub a4, a0, a2 ; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mul a2, a4, a3 +; RV32-NEXT: mulhu a3, a4, a5 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a1, a1, a4 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: mul a0, a5, a4 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: mul a1, a1, a5 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: mul a0, a4, a5 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_257: ; RV64: # %bb.0: ; RV64-NEXT: add a2, a0, a1 -; RV64-NEXT: sltu a3, a2, a0 -; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: lui a3, 1044496 +; RV64-NEXT: lui a4, %hi(.LCPI7_0) +; RV64-NEXT: sltu a5, a2, a0 ; RV64-NEXT: addiw a3, a3, -255 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: mulhu a4, a2, a3 -; RV64-NEXT: srli a5, a4, 8 -; RV64-NEXT: andi a4, a4, -256 -; RV64-NEXT: lui a6, %hi(.LCPI7_0) -; RV64-NEXT: ld a6, %lo(.LCPI7_0)(a6) -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: sub a2, a2, a4 -; RV64-NEXT: sub a4, a0, a2 -; RV64-NEXT: mul a5, a4, a6 -; RV64-NEXT: mulhu a6, a4, a3 -; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: ld a4, %lo(.LCPI7_0)(a4) +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a3, 32 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: mulhu a5, a2, a3 +; RV64-NEXT: srli a6, a5, 8 +; RV64-NEXT: andi a5, a5, -256 +; RV64-NEXT: add a5, a5, a6 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: sub a5, a0, a2 ; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: mul a2, a5, a4 +; RV64-NEXT: mulhu a4, a5, a3 ; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: add a2, a4, a2 ; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, a5, a1 -; RV64-NEXT: mul a0, a4, a3 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: mul a0, a5, a3 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 257 ret iXLen2 %a @@ -402,63 +402,63 @@ define iXLen2 @test_udiv_65535(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_65535: ; RV32: # %bb.0: ; RV32-NEXT: add a2, a0, a1 -; RV32-NEXT: sltu a3, a2, a0 -; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: lui a3, 524296 +; RV32-NEXT: lui a4, 1048560 +; RV32-NEXT: sltu a5, a2, a0 ; RV32-NEXT: addi a3, a3, 1 +; RV32-NEXT: add a2, a2, a5 ; RV32-NEXT: mulhu a3, a2, a3 ; RV32-NEXT: srli a3, a3, 15 -; RV32-NEXT: slli a4, a3, 16 -; RV32-NEXT: sub a3, a3, a4 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: sub a3, a0, a2 -; RV32-NEXT: lui a4, 1048560 +; RV32-NEXT: slli a5, a3, 16 +; RV32-NEXT: sub a3, a3, a5 ; RV32-NEXT: addi a5, a4, -2 -; RV32-NEXT: mul a5, a3, a5 ; RV32-NEXT: addi a4, a4, -1 -; RV32-NEXT: mulhu a4, a3, a4 -; RV32-NEXT: add a4, a4, a5 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sub a3, a0, a2 ; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mul a2, a3, a5 +; RV32-NEXT: mulhu a4, a3, a4 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: slli a0, a1, 16 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: sub a1, a4, a0 ; RV32-NEXT: slli a0, a3, 16 -; RV32-NEXT: neg a2, a3 -; RV32-NEXT: sub a0, a2, a0 +; RV32-NEXT: neg a3, a3 +; RV32-NEXT: add a2, a4, a2 +; RV32-NEXT: slli a4, a1, 16 +; RV32-NEXT: add a1, a4, a1 +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: sub a0, a3, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_65535: ; RV64: # %bb.0: ; RV64-NEXT: add a2, a0, a1 -; RV64-NEXT: sltu a3, a2, a0 -; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: lui a3, 524296 -; RV64-NEXT: addiw a3, a3, 1 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: mulhu a3, a2, a3 -; RV64-NEXT: srli a3, a3, 15 -; RV64-NEXT: slli a4, a3, 16 -; RV64-NEXT: sub a3, a3, a4 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: sub a3, a0, a2 ; RV64-NEXT: lui a4, 983039 -; RV64-NEXT: slli a4, a4, 4 -; RV64-NEXT: 
addi a4, a4, -1 -; RV64-NEXT: slli a4, a4, 16 -; RV64-NEXT: addi a4, a4, -2 -; RV64-NEXT: mul a4, a3, a4 ; RV64-NEXT: lui a5, 1048560 +; RV64-NEXT: sltu a6, a2, a0 +; RV64-NEXT: addiw a3, a3, 1 +; RV64-NEXT: slli a4, a4, 4 ; RV64-NEXT: addiw a5, a5, -1 +; RV64-NEXT: add a2, a2, a6 +; RV64-NEXT: slli a6, a3, 32 +; RV64-NEXT: addi a4, a4, -1 +; RV64-NEXT: add a3, a3, a6 ; RV64-NEXT: slli a6, a5, 32 +; RV64-NEXT: slli a4, a4, 16 ; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: mulhu a6, a3, a5 -; RV64-NEXT: add a4, a6, a4 +; RV64-NEXT: mulhu a3, a2, a3 +; RV64-NEXT: addi a4, a4, -2 +; RV64-NEXT: srli a3, a3, 15 +; RV64-NEXT: slli a6, a3, 16 +; RV64-NEXT: sub a3, a3, a6 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: sub a3, a0, a2 ; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: mul a2, a3, a4 +; RV64-NEXT: mulhu a4, a3, a5 ; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: add a2, a4, a2 ; RV64-NEXT: mul a1, a1, a5 -; RV64-NEXT: add a1, a4, a1 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: mul a0, a3, a5 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 65535 @@ -469,54 +469,54 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_65537: ; RV32: # %bb.0: ; RV32-NEXT: add a2, a0, a1 -; RV32-NEXT: sltu a3, a2, a0 -; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: lui a3, 1048560 -; RV32-NEXT: addi a4, a3, 1 -; RV32-NEXT: mulhu a5, a2, a4 -; RV32-NEXT: and a3, a5, a3 -; RV32-NEXT: srli a5, a5, 16 -; RV32-NEXT: or a3, a3, a5 +; RV32-NEXT: sltu a4, a2, a0 +; RV32-NEXT: addi a5, a3, 1 +; RV32-NEXT: add a2, a2, a4 +; RV32-NEXT: mulhu a4, a2, a5 +; RV32-NEXT: and a3, a4, a3 +; RV32-NEXT: srli a4, a4, 16 +; RV32-NEXT: or a3, a3, a4 ; RV32-NEXT: sub a2, a2, a3 ; RV32-NEXT: sub a3, a0, a2 -; RV32-NEXT: mulhu a4, a3, a4 -; RV32-NEXT: slli a5, a3, 16 -; RV32-NEXT: sub a4, a4, a5 ; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mulhu a2, a3, a5 +; RV32-NEXT: slli a4, a3, 16 ; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: sub a2, a2, a4 ; RV32-NEXT: slli a0, a1, 16 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: add a1, a4, a1 -; RV32-NEXT: sub a0, a3, a5 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: sub a0, a3, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_65537: ; RV64: # %bb.0: ; RV64-NEXT: add a2, a0, a1 -; RV64-NEXT: sltu a3, a2, a0 -; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: lui a3, 1048560 -; RV64-NEXT: addiw a4, a3, 1 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: mulhu a5, a2, a4 -; RV64-NEXT: and a3, a5, a3 -; RV64-NEXT: srli a5, a5, 16 -; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: lui a4, 983041 +; RV64-NEXT: sltu a5, a2, a0 +; RV64-NEXT: addiw a6, a3, 1 +; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, a6, 32 +; RV64-NEXT: addi a4, a4, -1 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: slli a4, a4, 16 +; RV64-NEXT: mulhu a6, a2, a5 +; RV64-NEXT: and a3, a6, a3 +; RV64-NEXT: srli a6, a6, 16 +; RV64-NEXT: add a3, a3, a6 ; RV64-NEXT: sub a2, a2, a3 ; RV64-NEXT: sub a3, a0, a2 -; RV64-NEXT: lui a5, 983041 -; RV64-NEXT: slli a5, a5, 4 -; RV64-NEXT: addi a5, a5, -1 -; RV64-NEXT: slli a5, a5, 16 -; RV64-NEXT: mul a5, a3, a5 -; RV64-NEXT: mulhu a6, a3, a4 -; RV64-NEXT: add a5, a6, a5 ; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: mul a2, a3, a4 +; RV64-NEXT: mulhu a4, a3, a5 ; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: mul a1, a1, a4 -; RV64-NEXT: add a1, a5, a1 -; RV64-NEXT: mul a0, a3, a4 +; RV64-NEXT: add a2, a4, a2 +; RV64-NEXT: mul a1, a1, a5 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: mul a0, a3, a5 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 65537 ret iXLen2 %a @@ -527,59 +527,59 @@ define iXLen2 
@test_udiv_12(iXLen2 %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: slli a2, a1, 30 ; RV32-NEXT: srli a0, a0, 2 -; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: srli a1, a1, 2 -; RV32-NEXT: add a2, a0, a1 -; RV32-NEXT: sltu a3, a2, a0 -; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: lui a3, 699051 -; RV32-NEXT: addi a4, a3, -1365 -; RV32-NEXT: mulhu a5, a2, a4 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: addi a2, a3, -1365 +; RV32-NEXT: addi a3, a3, -1366 +; RV32-NEXT: add a4, a0, a1 +; RV32-NEXT: sltu a5, a4, a0 +; RV32-NEXT: add a4, a4, a5 +; RV32-NEXT: mulhu a5, a4, a2 ; RV32-NEXT: srli a6, a5, 1 ; RV32-NEXT: andi a5, a5, -2 ; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: sub a2, a2, a5 -; RV32-NEXT: sub a5, a0, a2 -; RV32-NEXT: addi a3, a3, -1366 +; RV32-NEXT: sub a4, a4, a5 +; RV32-NEXT: sub a5, a0, a4 +; RV32-NEXT: sltu a0, a0, a4 ; RV32-NEXT: mul a3, a5, a3 -; RV32-NEXT: mulhu a6, a5, a4 -; RV32-NEXT: add a3, a6, a3 -; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mulhu a4, a5, a2 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: mul a0, a5, a4 +; RV32-NEXT: mul a0, a5, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_12: ; RV64: # %bb.0: ; RV64-NEXT: slli a2, a1, 62 ; RV64-NEXT: srli a0, a0, 2 -; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: srli a1, a1, 2 -; RV64-NEXT: add a2, a0, a1 -; RV64-NEXT: sltu a3, a2, a0 -; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: lui a3, 699051 -; RV64-NEXT: addiw a3, a3, -1365 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: mulhu a4, a2, a3 -; RV64-NEXT: srli a5, a4, 1 -; RV64-NEXT: andi a4, a4, -2 -; RV64-NEXT: lui a6, %hi(.LCPI10_0) -; RV64-NEXT: ld a6, %lo(.LCPI10_0)(a6) -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: sub a2, a2, a4 -; RV64-NEXT: sub a4, a0, a2 -; RV64-NEXT: mul a5, a4, a6 -; RV64-NEXT: mulhu a6, a4, a3 -; RV64-NEXT: add a5, a6, a5 -; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: lui a4, %hi(.LCPI10_0) +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: addiw a2, a3, -1365 +; RV64-NEXT: ld a3, %lo(.LCPI10_0)(a4) +; RV64-NEXT: add a4, a0, a1 +; RV64-NEXT: slli a5, a2, 32 +; RV64-NEXT: sltu a6, a4, a0 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: add a4, a4, a6 +; RV64-NEXT: mulhu a5, a4, a2 +; RV64-NEXT: srli a6, a5, 1 +; RV64-NEXT: andi a5, a5, -2 +; RV64-NEXT: add a5, a5, a6 +; RV64-NEXT: sub a4, a4, a5 +; RV64-NEXT: sub a5, a0, a4 +; RV64-NEXT: sltu a0, a0, a4 +; RV64-NEXT: mul a3, a5, a3 +; RV64-NEXT: mulhu a4, a5, a2 ; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, a5, a1 -; RV64-NEXT: mul a0, a4, a3 +; RV64-NEXT: add a3, a4, a3 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, a3, a1 +; RV64-NEXT: mul a0, a5, a2 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 12 ret iXLen2 %a diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll index 8444520fcc771..ae8117c3ce0bd 100644 --- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll @@ -8,11 +8,11 @@ define iXLen2 @test_urem_3(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_3: ; RV32: # %bb.0: ; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: lui a2, 699051 ; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a2, a2, -1365 ; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: lui a1, 699051 -; RV32-NEXT: addi a1, a1, -1365 -; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: mulhu a1, a0, a2 ; RV32-NEXT: srli a2, a1, 1 ; RV32-NEXT: andi a1, a1, -2 ; RV32-NEXT: add a1, a1, a2 @@ -23,12 +23,12 @@ define iXLen2 
@test_urem_3(iXLen2 %x) nounwind { ; RV64-LABEL: test_urem_3: ; RV64: # %bb.0: ; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: lui a2, 699051 ; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: addiw a2, a2, -1365 ; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: lui a1, 699051 -; RV64-NEXT: addiw a1, a1, -1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a1, a2, 32 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: mulhu a1, a0, a1 ; RV64-NEXT: srli a2, a1, 1 ; RV64-NEXT: andi a1, a1, -2 @@ -44,11 +44,11 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_5: ; RV32: # %bb.0: ; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: lui a2, 838861 ; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a2, a2, -819 ; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: lui a1, 838861 -; RV32-NEXT: addi a1, a1, -819 -; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: mulhu a1, a0, a2 ; RV32-NEXT: srli a2, a1, 2 ; RV32-NEXT: andi a1, a1, -4 ; RV32-NEXT: add a1, a1, a2 @@ -59,12 +59,12 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind { ; RV64-LABEL: test_urem_5: ; RV64: # %bb.0: ; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: lui a2, 838861 ; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: addiw a2, a2, -819 ; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: lui a1, 838861 -; RV64-NEXT: addiw a1, a1, -819 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a1, a2, 32 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: mulhu a1, a0, a1 ; RV64-NEXT: srli a2, a1, 2 ; RV64-NEXT: andi a1, a1, -4 @@ -132,10 +132,10 @@ define iXLen2 @test_urem_15(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_15: ; RV32: # %bb.0: ; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: lui a2, 559241 ; RV32-NEXT: sltu a0, a1, a0 ; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: lui a1, 559241 -; RV32-NEXT: addi a1, a1, -1911 +; RV32-NEXT: addi a1, a2, -1911 ; RV32-NEXT: mulhu a1, a0, a1 ; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: slli a2, a1, 4 @@ -147,12 +147,12 @@ define iXLen2 @test_urem_15(iXLen2 %x) nounwind { ; RV64-LABEL: test_urem_15: ; RV64: # %bb.0: ; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: lui a2, 559241 ; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: addiw a2, a2, -1911 ; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: lui a1, 559241 -; RV64-NEXT: addiw a1, a1, -1911 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a1, a2, 32 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: mulhu a1, a0, a1 ; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: slli a2, a1, 4 @@ -168,11 +168,11 @@ define iXLen2 @test_urem_17(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_17: ; RV32: # %bb.0: ; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: lui a2, 986895 ; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a2, a2, 241 ; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: lui a1, 986895 -; RV32-NEXT: addi a1, a1, 241 -; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: mulhu a1, a0, a2 ; RV32-NEXT: srli a2, a1, 4 ; RV32-NEXT: andi a1, a1, -16 ; RV32-NEXT: add a1, a1, a2 @@ -183,12 +183,12 @@ define iXLen2 @test_urem_17(iXLen2 %x) nounwind { ; RV64-LABEL: test_urem_17: ; RV64: # %bb.0: ; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: lui a2, 986895 ; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: addiw a2, a2, 241 ; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: lui a1, 986895 -; RV64-NEXT: addiw a1, a1, 241 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a1, a2, 32 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: mulhu a1, a0, a1 ; RV64-NEXT: srli a2, a1, 4 ; RV64-NEXT: andi a1, a1, -16 @@ -204,10 +204,10 @@ define iXLen2 @test_urem_255(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_255: ; RV32: # %bb.0: ; 
RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: lui a2, 526344 ; RV32-NEXT: sltu a0, a1, a0 ; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: lui a1, 526344 -; RV32-NEXT: addi a1, a1, 129 +; RV32-NEXT: addi a1, a2, 129 ; RV32-NEXT: mulhu a1, a0, a1 ; RV32-NEXT: srli a1, a1, 7 ; RV32-NEXT: slli a2, a1, 8 @@ -219,12 +219,12 @@ define iXLen2 @test_urem_255(iXLen2 %x) nounwind { ; RV64-LABEL: test_urem_255: ; RV64: # %bb.0: ; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: lui a2, 526344 ; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: addiw a2, a2, 129 ; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: lui a1, 526344 -; RV64-NEXT: addiw a1, a1, 129 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a1, a2, 32 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: mulhu a1, a0, a1 ; RV64-NEXT: srli a1, a1, 7 ; RV64-NEXT: slli a2, a1, 8 @@ -240,11 +240,11 @@ define iXLen2 @test_urem_257(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_257: ; RV32: # %bb.0: ; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: lui a2, 1044496 ; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: addi a2, a2, -255 ; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: lui a1, 1044496 -; RV32-NEXT: addi a1, a1, -255 -; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: mulhu a1, a0, a2 ; RV32-NEXT: srli a2, a1, 8 ; RV32-NEXT: andi a1, a1, -256 ; RV32-NEXT: add a1, a1, a2 @@ -255,12 +255,12 @@ define iXLen2 @test_urem_257(iXLen2 %x) nounwind { ; RV64-LABEL: test_urem_257: ; RV64: # %bb.0: ; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: lui a2, 1044496 ; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: addiw a2, a2, -255 ; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: lui a1, 1044496 -; RV64-NEXT: addiw a1, a1, -255 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a1, a2, 32 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: mulhu a1, a0, a1 ; RV64-NEXT: srli a2, a1, 8 ; RV64-NEXT: andi a1, a1, -256 @@ -276,11 +276,11 @@ define iXLen2 @test_urem_65535(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_65535: ; RV32: # %bb.0: ; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: lui a2, 524296 ; RV32-NEXT: sltu a0, a1, a0 ; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: lui a1, 524296 -; RV32-NEXT: addi a1, a1, 1 -; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: addi a2, a2, 1 +; RV32-NEXT: mulhu a1, a0, a2 ; RV32-NEXT: srli a1, a1, 15 ; RV32-NEXT: slli a2, a1, 16 ; RV32-NEXT: sub a1, a1, a2 @@ -291,12 +291,12 @@ define iXLen2 @test_urem_65535(iXLen2 %x) nounwind { ; RV64-LABEL: test_urem_65535: ; RV64: # %bb.0: ; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: lui a2, 524296 ; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: addiw a2, a2, 1 ; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: lui a1, 524296 -; RV64-NEXT: addiw a1, a1, 1 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a1, a2, 32 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: mulhu a1, a0, a1 ; RV64-NEXT: srli a1, a1, 15 ; RV64-NEXT: slli a2, a1, 16 @@ -312,14 +312,14 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_65537: ; RV32: # %bb.0: ; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: lui a2, 1048560 ; RV32-NEXT: sltu a0, a1, a0 ; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: lui a1, 1048560 -; RV32-NEXT: addi a2, a1, 1 -; RV32-NEXT: mulhu a2, a0, a2 -; RV32-NEXT: and a1, a2, a1 -; RV32-NEXT: srli a2, a2, 16 -; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: addi a1, a2, 1 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: and a2, a1, a2 +; RV32-NEXT: srli a1, a1, 16 +; RV32-NEXT: or a1, a2, a1 ; RV32-NEXT: sub a0, a0, a1 ; RV32-NEXT: li a1, 0 ; RV32-NEXT: ret @@ -327,16 +327,16 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind { ; RV64-LABEL: 
test_urem_65537: ; RV64: # %bb.0: ; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: lui a2, 1048560 ; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: addiw a3, a2, 1 ; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: lui a1, 1048560 -; RV64-NEXT: addiw a2, a1, 1 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: mulhu a2, a0, a2 -; RV64-NEXT: and a1, a2, a1 -; RV64-NEXT: srli a2, a2, 16 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a1, a3, 32 +; RV64-NEXT: add a1, a3, a1 +; RV64-NEXT: mulhu a1, a0, a1 +; RV64-NEXT: and a2, a1, a2 +; RV64-NEXT: srli a1, a1, 16 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: li a1, 0 ; RV64-NEXT: ret @@ -349,14 +349,14 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: slli a2, a1, 30 ; RV32-NEXT: srli a3, a0, 2 -; RV32-NEXT: or a2, a3, a2 ; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: or a2, a3, a2 +; RV32-NEXT: lui a3, 699051 +; RV32-NEXT: addi a3, a3, -1365 ; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: sltu a2, a1, a2 ; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: lui a2, 699051 -; RV32-NEXT: addi a2, a2, -1365 -; RV32-NEXT: mulhu a2, a1, a2 +; RV32-NEXT: mulhu a2, a1, a3 ; RV32-NEXT: srli a3, a2, 1 ; RV32-NEXT: andi a2, a2, -2 ; RV32-NEXT: add a2, a2, a3 @@ -371,16 +371,16 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: slli a2, a1, 62 ; RV64-NEXT: srli a3, a0, 2 +; RV64-NEXT: lui a4, 699051 ; RV64-NEXT: or a2, a3, a2 +; RV64-NEXT: addiw a3, a4, -1365 +; RV64-NEXT: slli a4, a3, 32 +; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: sltu a2, a1, a2 ; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: lui a2, 699051 -; RV64-NEXT: addiw a2, a2, -1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: mulhu a2, a1, a2 +; RV64-NEXT: mulhu a2, a1, a3 ; RV64-NEXT: srli a3, a2, 1 ; RV64-NEXT: andi a2, a2, -2 ; RV64-NEXT: add a2, a2, a3 diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll index 3ccad02fbb2bf..4c9d9e5ffdf77 100644 --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -17,9 +17,9 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; RV32-NEXT: lui a1, 662 ; RV32-NEXT: addi a1, a1, -83 ; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: lui a1, 1324 ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: srli a0, a0, 3 -; RV32-NEXT: lui a1, 1324 ; RV32-NEXT: addi a1, a1, -165 ; RV32-NEXT: sltu a0, a0, a1 ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -36,9 +36,9 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; RV64-NEXT: lui a1, 662 ; RV64-NEXT: addi a1, a1, -83 ; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: lui a1, 1324 ; RV64-NEXT: slli a0, a0, 35 ; RV64-NEXT: srli a0, a0, 35 -; RV64-NEXT: lui a1, 1324 ; RV64-NEXT: addiw a1, a1, -165 ; RV64-NEXT: sltu a0, a0, a1 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -53,9 +53,9 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; RV32M-NEXT: lui a1, 662 ; RV32M-NEXT: addi a1, a1, -83 ; RV32M-NEXT: add a0, a0, a1 +; RV32M-NEXT: lui a1, 1324 ; RV32M-NEXT: slli a0, a0, 3 ; RV32M-NEXT: srli a0, a0, 3 -; RV32M-NEXT: lui a1, 1324 ; RV32M-NEXT: addi a1, a1, -165 ; RV32M-NEXT: sltu a0, a0, a1 ; RV32M-NEXT: ret @@ -68,9 +68,9 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; RV64M-NEXT: lui a1, 662 ; RV64M-NEXT: addi a1, a1, -83 ; RV64M-NEXT: add a0, a0, a1 +; RV64M-NEXT: lui a1, 1324 ; RV64M-NEXT: slli a0, a0, 35 ; RV64M-NEXT: srli a0, a0, 35 -; RV64M-NEXT: lui a1, 1324 ; RV64M-NEXT: 
addiw a1, a1, -165 ; RV64M-NEXT: sltu a0, a0, a1 ; RV64M-NEXT: ret @@ -83,9 +83,9 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; RV32MV-NEXT: lui a1, 662 ; RV32MV-NEXT: addi a1, a1, -83 ; RV32MV-NEXT: add a0, a0, a1 +; RV32MV-NEXT: lui a1, 1324 ; RV32MV-NEXT: slli a0, a0, 3 ; RV32MV-NEXT: srli a0, a0, 3 -; RV32MV-NEXT: lui a1, 1324 ; RV32MV-NEXT: addi a1, a1, -165 ; RV32MV-NEXT: sltu a0, a0, a1 ; RV32MV-NEXT: ret @@ -98,9 +98,9 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; RV64MV-NEXT: lui a1, 662 ; RV64MV-NEXT: addi a1, a1, -83 ; RV64MV-NEXT: add a0, a0, a1 +; RV64MV-NEXT: lui a1, 1324 ; RV64MV-NEXT: slli a0, a0, 35 ; RV64MV-NEXT: srli a0, a0, 35 -; RV64MV-NEXT: lui a1, 1324 ; RV64MV-NEXT: addiw a1, a1, -165 ; RV64MV-NEXT: sltu a0, a0, a1 ; RV64MV-NEXT: ret @@ -310,18 +310,18 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32-NEXT: lw a0, 0(a0) ; RV32-NEXT: slli a4, a1, 30 ; RV32-NEXT: srli s1, a2, 2 +; RV32-NEXT: slli a5, a2, 31 ; RV32-NEXT: or s1, s1, a4 -; RV32-NEXT: slli a4, a2, 31 -; RV32-NEXT: srli a5, a3, 1 -; RV32-NEXT: or s2, a5, a4 +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: or s2, a4, a5 ; RV32-NEXT: srli a1, a1, 2 -; RV32-NEXT: slli a1, a1, 31 -; RV32-NEXT: srai s3, a1, 31 ; RV32-NEXT: srli a2, a2, 1 +; RV32-NEXT: slli a3, a3, 31 +; RV32-NEXT: slli a1, a1, 31 ; RV32-NEXT: slli a2, a2, 31 +; RV32-NEXT: srai s3, a1, 31 ; RV32-NEXT: srai s4, a2, 31 -; RV32-NEXT: slli a1, a3, 31 -; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: srai a1, a3, 31 ; RV32-NEXT: li a2, 6 ; RV32-NEXT: li a3, 0 ; RV32-NEXT: call __moddi3 @@ -340,12 +340,12 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32-NEXT: mv a1, s3 ; RV32-NEXT: call __moddi3 ; RV32-NEXT: or a2, s5, s6 -; RV32-NEXT: snez a2, a2 ; RV32-NEXT: xori a0, a0, 2 +; RV32-NEXT: xori a3, s2, 1 +; RV32-NEXT: snez a2, a2 ; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: or a1, a3, s4 ; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: xori a1, s2, 1 -; RV32-NEXT: or a1, a1, s4 ; RV32-NEXT: seqz a1, a1 ; RV32-NEXT: neg a3, a2 ; RV32-NEXT: addi a1, a1, -1 @@ -355,8 +355,8 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32-NEXT: or a2, a5, a2 ; RV32-NEXT: srli a5, a1, 31 ; RV32-NEXT: andi a1, a1, 1 -; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: or a0, a5, a0 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: sw a3, 0(s0) @@ -384,18 +384,18 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: mv s0, a0 ; RV64-NEXT: lbu a0, 12(a0) -; RV64-NEXT: lwu a1, 8(s0) -; RV64-NEXT: ld a2, 0(s0) +; RV64-NEXT: ld a1, 0(s0) +; RV64-NEXT: lwu a2, 8(s0) ; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: or a0, a1, a0 -; RV64-NEXT: slli a0, a0, 29 -; RV64-NEXT: srai s1, a0, 31 -; RV64-NEXT: srli a0, a2, 2 -; RV64-NEXT: slli a1, a1, 62 -; RV64-NEXT: or a0, a1, a0 -; RV64-NEXT: srai a0, a0, 31 -; RV64-NEXT: slli a2, a2, 31 -; RV64-NEXT: srai s2, a2, 31 +; RV64-NEXT: srli a3, a1, 2 +; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: slli a2, a2, 62 +; RV64-NEXT: slli a1, a1, 31 +; RV64-NEXT: or a2, a2, a3 +; RV64-NEXT: slli s1, a0, 29 +; RV64-NEXT: srai a0, a2, 31 +; RV64-NEXT: srai s1, s1, 31 +; RV64-NEXT: srai s2, a1, 31 ; RV64-NEXT: li a1, 7 ; RV64-NEXT: call __moddi3 ; RV64-NEXT: mv s3, a0 @@ -410,32 +410,32 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV64-NEXT: mv a0, s2 ; RV64-NEXT: call __muldi3 ; RV64-NEXT: lui a1, %hi(.LCPI3_0) +; RV64-NEXT: addi s1, s1, -2 +; RV64-NEXT: addi s3, s3, -1 ; RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1) +; RV64-NEXT: seqz a2, s1 +; RV64-NEXT: seqz a3, s3 +; 
RV64-NEXT: addi a3, a3, -1 +; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: slli a4, a2, 2 +; RV64-NEXT: slli a5, a3, 31 +; RV64-NEXT: srli a5, a5, 62 ; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: slli a2, a0, 63 +; RV64-NEXT: or a4, a5, a4 +; RV64-NEXT: slli a5, a0, 63 ; RV64-NEXT: srli a0, a0, 1 -; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: or a0, a0, a5 +; RV64-NEXT: slli a2, a2, 29 +; RV64-NEXT: slli a3, a3, 33 +; RV64-NEXT: srli a2, a2, 61 ; RV64-NEXT: sltu a0, a1, a0 -; RV64-NEXT: addi s1, s1, -2 -; RV64-NEXT: seqz a1, s1 -; RV64-NEXT: addi s3, s3, -1 -; RV64-NEXT: seqz a2, s3 ; RV64-NEXT: neg a0, a0 -; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: slli a3, a1, 2 -; RV64-NEXT: slli a4, a2, 31 -; RV64-NEXT: srli a4, a4, 62 -; RV64-NEXT: or a3, a4, a3 -; RV64-NEXT: slli a1, a1, 29 -; RV64-NEXT: srli a1, a1, 61 ; RV64-NEXT: slli a0, a0, 31 ; RV64-NEXT: srli a0, a0, 31 -; RV64-NEXT: slli a2, a2, 33 -; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: or a0, a0, a3 ; RV64-NEXT: sd a0, 0(s0) -; RV64-NEXT: sw a3, 8(s0) -; RV64-NEXT: sb a1, 12(s0) +; RV64-NEXT: sw a4, 8(s0) +; RV64-NEXT: sb a2, 12(s0) ; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -462,18 +462,18 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32M-NEXT: lw a0, 0(a0) ; RV32M-NEXT: slli a4, a1, 30 ; RV32M-NEXT: srli s1, a2, 2 +; RV32M-NEXT: slli a5, a2, 31 ; RV32M-NEXT: or s1, s1, a4 -; RV32M-NEXT: slli a4, a2, 31 -; RV32M-NEXT: srli a5, a3, 1 -; RV32M-NEXT: or s2, a5, a4 +; RV32M-NEXT: srli a4, a3, 1 +; RV32M-NEXT: or s2, a4, a5 ; RV32M-NEXT: srli a1, a1, 2 -; RV32M-NEXT: slli a1, a1, 31 -; RV32M-NEXT: srai s3, a1, 31 ; RV32M-NEXT: srli a2, a2, 1 +; RV32M-NEXT: slli a3, a3, 31 +; RV32M-NEXT: slli a1, a1, 31 ; RV32M-NEXT: slli a2, a2, 31 +; RV32M-NEXT: srai s3, a1, 31 ; RV32M-NEXT: srai s4, a2, 31 -; RV32M-NEXT: slli a1, a3, 31 -; RV32M-NEXT: srai a1, a1, 31 +; RV32M-NEXT: srai a1, a3, 31 ; RV32M-NEXT: li a2, 6 ; RV32M-NEXT: li a3, 0 ; RV32M-NEXT: call __moddi3 @@ -492,12 +492,12 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32M-NEXT: mv a1, s3 ; RV32M-NEXT: call __moddi3 ; RV32M-NEXT: or a2, s5, s6 -; RV32M-NEXT: snez a2, a2 ; RV32M-NEXT: xori a0, a0, 2 +; RV32M-NEXT: xori a3, s2, 1 +; RV32M-NEXT: snez a2, a2 ; RV32M-NEXT: or a0, a0, a1 +; RV32M-NEXT: or a1, a3, s4 ; RV32M-NEXT: seqz a0, a0 -; RV32M-NEXT: xori a1, s2, 1 -; RV32M-NEXT: or a1, a1, s4 ; RV32M-NEXT: seqz a1, a1 ; RV32M-NEXT: neg a3, a2 ; RV32M-NEXT: addi a1, a1, -1 @@ -507,8 +507,8 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32M-NEXT: or a2, a5, a2 ; RV32M-NEXT: srli a5, a1, 31 ; RV32M-NEXT: andi a1, a1, 1 -; RV32M-NEXT: slli a1, a1, 1 ; RV32M-NEXT: slli a0, a0, 2 +; RV32M-NEXT: slli a1, a1, 1 ; RV32M-NEXT: or a0, a5, a0 ; RV32M-NEXT: or a0, a0, a1 ; RV32M-NEXT: sw a3, 0(s0) @@ -531,62 +531,62 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV64M-NEXT: ld a1, 0(a0) ; RV64M-NEXT: lwu a2, 8(a0) ; RV64M-NEXT: lbu a3, 12(a0) -; RV64M-NEXT: srli a4, a1, 2 -; RV64M-NEXT: slli a5, a2, 62 -; RV64M-NEXT: or a4, a5, a4 -; RV64M-NEXT: srai a4, a4, 31 +; RV64M-NEXT: lui a4, %hi(.LCPI3_0) +; RV64M-NEXT: lui a5, 699051 +; RV64M-NEXT: addiw a5, a5, -1365 +; RV64M-NEXT: slli a6, a5, 32 +; RV64M-NEXT: add a5, a5, a6 +; RV64M-NEXT: srli a6, a1, 2 +; RV64M-NEXT: slli a7, a2, 62 +; RV64M-NEXT: or a6, a7, a6 +; RV64M-NEXT: lui a7, %hi(.LCPI3_1) ; RV64M-NEXT: slli a3, a3, 32 ; RV64M-NEXT: or a2, a2, a3 -; RV64M-NEXT: slli a2, a2, 29 -; RV64M-NEXT: lui a3, 
%hi(.LCPI3_0) -; RV64M-NEXT: ld a3, %lo(.LCPI3_0)(a3) -; RV64M-NEXT: srai a2, a2, 31 +; RV64M-NEXT: lui a3, %hi(.LCPI3_2) +; RV64M-NEXT: ld a4, %lo(.LCPI3_0)(a4) +; RV64M-NEXT: ld a7, %lo(.LCPI3_1)(a7) +; RV64M-NEXT: ld a3, %lo(.LCPI3_2)(a3) ; RV64M-NEXT: slli a1, a1, 31 ; RV64M-NEXT: srai a1, a1, 31 -; RV64M-NEXT: mulh a3, a2, a3 -; RV64M-NEXT: srli a5, a3, 63 -; RV64M-NEXT: srai a3, a3, 1 -; RV64M-NEXT: add a3, a3, a5 -; RV64M-NEXT: lui a5, %hi(.LCPI3_1) -; RV64M-NEXT: ld a5, %lo(.LCPI3_1)(a5) -; RV64M-NEXT: add a2, a2, a3 -; RV64M-NEXT: slli a3, a3, 2 -; RV64M-NEXT: add a2, a2, a3 -; RV64M-NEXT: mulh a3, a4, a5 -; RV64M-NEXT: srli a5, a3, 63 -; RV64M-NEXT: srai a3, a3, 1 -; RV64M-NEXT: add a3, a3, a5 -; RV64M-NEXT: slli a5, a3, 3 -; RV64M-NEXT: add a3, a4, a3 -; RV64M-NEXT: sub a3, a3, a5 +; RV64M-NEXT: srai a6, a6, 31 +; RV64M-NEXT: slli a2, a2, 29 +; RV64M-NEXT: mul a1, a1, a5 +; RV64M-NEXT: srai a2, a2, 31 +; RV64M-NEXT: mulh a5, a6, a7 +; RV64M-NEXT: add a1, a1, a3 +; RV64M-NEXT: mulh a4, a2, a4 +; RV64M-NEXT: srli a7, a5, 63 +; RV64M-NEXT: srai a5, a5, 1 +; RV64M-NEXT: add a5, a5, a7 +; RV64M-NEXT: slli a7, a1, 63 +; RV64M-NEXT: srli a1, a1, 1 +; RV64M-NEXT: or a1, a1, a7 +; RV64M-NEXT: srli a7, a4, 63 +; RV64M-NEXT: srai a4, a4, 1 +; RV64M-NEXT: add a4, a4, a7 +; RV64M-NEXT: sltu a1, a3, a1 +; RV64M-NEXT: add a6, a6, a5 +; RV64M-NEXT: slli a5, a5, 3 +; RV64M-NEXT: add a2, a2, a4 +; RV64M-NEXT: slli a4, a4, 2 +; RV64M-NEXT: sub a3, a6, a5 +; RV64M-NEXT: neg a1, a1 +; RV64M-NEXT: add a2, a2, a4 ; RV64M-NEXT: addi a3, a3, -1 +; RV64M-NEXT: slli a1, a1, 31 ; RV64M-NEXT: seqz a3, a3 -; RV64M-NEXT: lui a4, 699051 -; RV64M-NEXT: addiw a4, a4, -1365 -; RV64M-NEXT: slli a5, a4, 32 -; RV64M-NEXT: add a4, a4, a5 -; RV64M-NEXT: lui a5, %hi(.LCPI3_2) -; RV64M-NEXT: ld a5, %lo(.LCPI3_2)(a5) ; RV64M-NEXT: addi a2, a2, -2 +; RV64M-NEXT: srli a1, a1, 31 ; RV64M-NEXT: seqz a2, a2 -; RV64M-NEXT: mul a1, a1, a4 -; RV64M-NEXT: add a1, a1, a5 -; RV64M-NEXT: slli a4, a1, 63 -; RV64M-NEXT: srli a1, a1, 1 -; RV64M-NEXT: or a1, a1, a4 -; RV64M-NEXT: sltu a1, a5, a1 -; RV64M-NEXT: addi a2, a2, -1 ; RV64M-NEXT: addi a3, a3, -1 -; RV64M-NEXT: neg a1, a1 +; RV64M-NEXT: addi a2, a2, -1 ; RV64M-NEXT: slli a4, a3, 33 -; RV64M-NEXT: slli a1, a1, 31 -; RV64M-NEXT: srli a1, a1, 31 +; RV64M-NEXT: slli a3, a3, 31 ; RV64M-NEXT: or a1, a1, a4 ; RV64M-NEXT: slli a4, a2, 2 -; RV64M-NEXT: slli a3, a3, 31 ; RV64M-NEXT: srli a3, a3, 62 -; RV64M-NEXT: or a3, a3, a4 ; RV64M-NEXT: slli a2, a2, 29 +; RV64M-NEXT: or a3, a3, a4 ; RV64M-NEXT: srli a2, a2, 61 ; RV64M-NEXT: sd a1, 0(a0) ; RV64M-NEXT: sw a3, 8(a0) @@ -606,28 +606,28 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: slli a1, a1, 1 ; RV32MV-NEXT: sub sp, sp, a1 ; RV32MV-NEXT: mv s0, a0 -; RV32MV-NEXT: lbu a1, 12(a0) -; RV32MV-NEXT: lw a2, 8(a0) +; RV32MV-NEXT: lw a1, 8(a0) +; RV32MV-NEXT: lbu a2, 12(a0) ; RV32MV-NEXT: lw a3, 4(a0) ; RV32MV-NEXT: lw a0, 0(a0) -; RV32MV-NEXT: slli a4, a1, 30 -; RV32MV-NEXT: srli s1, a2, 2 -; RV32MV-NEXT: or s1, s1, a4 -; RV32MV-NEXT: slli a4, a2, 31 +; RV32MV-NEXT: li a4, 1 +; RV32MV-NEXT: slli a5, a2, 30 +; RV32MV-NEXT: srli s1, a1, 2 +; RV32MV-NEXT: slli a6, a1, 31 +; RV32MV-NEXT: or s1, s1, a5 ; RV32MV-NEXT: srli a5, a3, 1 -; RV32MV-NEXT: or s2, a5, a4 -; RV32MV-NEXT: srli a1, a1, 2 -; RV32MV-NEXT: slli a1, a1, 31 -; RV32MV-NEXT: srai s3, a1, 31 -; RV32MV-NEXT: srli a2, a2, 1 +; RV32MV-NEXT: or s2, a5, a6 +; RV32MV-NEXT: li a5, -1 +; RV32MV-NEXT: srli a2, a2, 2 +; RV32MV-NEXT: srli a1, a1, 1 +; RV32MV-NEXT: slli a3, a3, 
31 ; RV32MV-NEXT: slli a2, a2, 31 -; RV32MV-NEXT: srai s4, a2, 31 -; RV32MV-NEXT: slli a1, a3, 31 -; RV32MV-NEXT: srai a1, a1, 31 -; RV32MV-NEXT: li a2, 1 -; RV32MV-NEXT: li a3, -1 -; RV32MV-NEXT: sw a3, 16(sp) -; RV32MV-NEXT: sw a2, 20(sp) +; RV32MV-NEXT: slli a6, a1, 31 +; RV32MV-NEXT: srai a1, a3, 31 +; RV32MV-NEXT: srai s3, a2, 31 +; RV32MV-NEXT: srai s4, a6, 31 +; RV32MV-NEXT: sw a5, 16(sp) +; RV32MV-NEXT: sw a4, 20(sp) ; RV32MV-NEXT: li a2, 6 ; RV32MV-NEXT: li a3, 0 ; RV32MV-NEXT: call __moddi3 @@ -681,33 +681,33 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: vmerge.vim v8, v8, -1, v0 ; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32MV-NEXT: vslidedown.vi v10, v8, 1 +; RV32MV-NEXT: vslidedown.vi v11, v8, 2 ; RV32MV-NEXT: vmv.x.s a0, v10 -; RV32MV-NEXT: vslidedown.vi v10, v8, 2 -; RV32MV-NEXT: vmv.x.s a1, v10 -; RV32MV-NEXT: slli a2, a1, 1 -; RV32MV-NEXT: sub a2, a2, a0 +; RV32MV-NEXT: vmv.x.s a1, v11 ; RV32MV-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32MV-NEXT: vslidedown.vi v10, v8, 4 -; RV32MV-NEXT: vmv.x.s a0, v10 -; RV32MV-NEXT: srli a3, a0, 30 +; RV32MV-NEXT: vmv.x.s a2, v10 ; RV32MV-NEXT: vslidedown.vi v10, v8, 5 -; RV32MV-NEXT: vmv.x.s a4, v10 -; RV32MV-NEXT: slli a4, a4, 2 -; RV32MV-NEXT: or a3, a4, a3 +; RV32MV-NEXT: vmv.x.s a3, v10 +; RV32MV-NEXT: slli a4, a1, 1 +; RV32MV-NEXT: sub a4, a4, a0 +; RV32MV-NEXT: srli a0, a2, 30 +; RV32MV-NEXT: slli a3, a3, 2 +; RV32MV-NEXT: or a0, a3, a0 ; RV32MV-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32MV-NEXT: vse32.v v8, (s0) -; RV32MV-NEXT: andi a3, a3, 7 -; RV32MV-NEXT: srli a1, a1, 31 ; RV32MV-NEXT: vslidedown.vi v8, v8, 3 -; RV32MV-NEXT: slli a0, a0, 2 -; RV32MV-NEXT: or a0, a1, a0 -; RV32MV-NEXT: vmv.x.s a1, v8 -; RV32MV-NEXT: andi a1, a1, 1 -; RV32MV-NEXT: slli a1, a1, 1 -; RV32MV-NEXT: or a0, a0, a1 -; RV32MV-NEXT: sw a2, 4(s0) -; RV32MV-NEXT: sw a0, 8(s0) -; RV32MV-NEXT: sb a3, 12(s0) +; RV32MV-NEXT: srli a1, a1, 31 +; RV32MV-NEXT: slli a2, a2, 2 +; RV32MV-NEXT: or a1, a1, a2 +; RV32MV-NEXT: vmv.x.s a2, v8 +; RV32MV-NEXT: andi a2, a2, 1 +; RV32MV-NEXT: slli a2, a2, 1 +; RV32MV-NEXT: andi a0, a0, 7 +; RV32MV-NEXT: or a1, a1, a2 +; RV32MV-NEXT: sw a4, 4(s0) +; RV32MV-NEXT: sw a1, 8(s0) +; RV32MV-NEXT: sb a0, 12(s0) ; RV32MV-NEXT: csrr a0, vlenb ; RV32MV-NEXT: slli a0, a0, 1 ; RV32MV-NEXT: add sp, sp, a0 @@ -722,79 +722,79 @@ define void @test_srem_vec(ptr %X) nounwind { ; ; RV64MV-LABEL: test_srem_vec: ; RV64MV: # %bb.0: -; RV64MV-NEXT: lbu a1, 12(a0) +; RV64MV-NEXT: ld a1, 0(a0) ; RV64MV-NEXT: lwu a2, 8(a0) -; RV64MV-NEXT: ld a3, 0(a0) -; RV64MV-NEXT: slli a1, a1, 32 -; RV64MV-NEXT: or a1, a2, a1 -; RV64MV-NEXT: slli a1, a1, 29 -; RV64MV-NEXT: srai a1, a1, 31 -; RV64MV-NEXT: srli a4, a3, 2 -; RV64MV-NEXT: slli a2, a2, 62 -; RV64MV-NEXT: or a2, a2, a4 +; RV64MV-NEXT: lbu a3, 12(a0) ; RV64MV-NEXT: lui a4, %hi(.LCPI3_0) +; RV64MV-NEXT: lui a5, %hi(.LCPI3_1) +; RV64MV-NEXT: lui a6, %hi(.LCPI3_2) +; RV64MV-NEXT: lui a7, 32 ; RV64MV-NEXT: ld a4, %lo(.LCPI3_0)(a4) +; RV64MV-NEXT: ld a5, %lo(.LCPI3_1)(a5) +; RV64MV-NEXT: ld a6, %lo(.LCPI3_2)(a6) +; RV64MV-NEXT: addi a7, a7, 256 +; RV64MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64MV-NEXT: vmv.s.x v8, a7 +; RV64MV-NEXT: slli a3, a3, 32 +; RV64MV-NEXT: srli a7, a1, 2 +; RV64MV-NEXT: or a3, a2, a3 +; RV64MV-NEXT: slli a2, a2, 62 +; RV64MV-NEXT: slli a1, a1, 31 +; RV64MV-NEXT: or a2, a2, a7 +; RV64MV-NEXT: srai a1, a1, 31 +; RV64MV-NEXT: slli a3, a3, 29 ; RV64MV-NEXT: srai a2, a2, 31 -; RV64MV-NEXT: slli a3, a3, 31 +; RV64MV-NEXT: mulh a5, a1, a5 ; RV64MV-NEXT: 
srai a3, a3, 31 ; RV64MV-NEXT: mulh a4, a2, a4 -; RV64MV-NEXT: srli a5, a4, 63 +; RV64MV-NEXT: srli a7, a5, 63 +; RV64MV-NEXT: add a5, a5, a7 +; RV64MV-NEXT: srli a7, a4, 63 ; RV64MV-NEXT: srai a4, a4, 1 -; RV64MV-NEXT: add a4, a4, a5 -; RV64MV-NEXT: lui a5, %hi(.LCPI3_1) -; RV64MV-NEXT: ld a5, %lo(.LCPI3_1)(a5) +; RV64MV-NEXT: mulh a6, a3, a6 +; RV64MV-NEXT: add a4, a4, a7 +; RV64MV-NEXT: slli a7, a5, 3 +; RV64MV-NEXT: slli a5, a5, 1 +; RV64MV-NEXT: sub a5, a5, a7 +; RV64MV-NEXT: srli a7, a6, 63 +; RV64MV-NEXT: srai a6, a6, 1 +; RV64MV-NEXT: add a6, a6, a7 ; RV64MV-NEXT: add a2, a2, a4 ; RV64MV-NEXT: slli a4, a4, 3 ; RV64MV-NEXT: sub a2, a2, a4 -; RV64MV-NEXT: mulh a4, a3, a5 -; RV64MV-NEXT: srli a5, a4, 63 -; RV64MV-NEXT: add a4, a4, a5 -; RV64MV-NEXT: slli a5, a4, 3 -; RV64MV-NEXT: slli a4, a4, 1 -; RV64MV-NEXT: sub a4, a4, a5 -; RV64MV-NEXT: lui a5, %hi(.LCPI3_2) -; RV64MV-NEXT: ld a5, %lo(.LCPI3_2)(a5) -; RV64MV-NEXT: add a3, a3, a4 -; RV64MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64MV-NEXT: vmv.v.x v8, a3 +; RV64MV-NEXT: add a1, a1, a5 +; RV64MV-NEXT: li a4, -1 +; RV64MV-NEXT: srli a4, a4, 31 +; RV64MV-NEXT: vsext.vf8 v10, v8 +; RV64MV-NEXT: add a3, a3, a6 +; RV64MV-NEXT: slli a6, a6, 2 +; RV64MV-NEXT: vmv.v.x v8, a1 +; RV64MV-NEXT: add a3, a3, a6 ; RV64MV-NEXT: vslide1down.vx v8, v8, a2 -; RV64MV-NEXT: mulh a2, a1, a5 -; RV64MV-NEXT: srli a3, a2, 63 -; RV64MV-NEXT: srai a2, a2, 1 -; RV64MV-NEXT: add a2, a2, a3 -; RV64MV-NEXT: slli a3, a2, 2 -; RV64MV-NEXT: add a1, a1, a2 -; RV64MV-NEXT: add a1, a1, a3 -; RV64MV-NEXT: vslide1down.vx v8, v8, a1 +; RV64MV-NEXT: vslide1down.vx v8, v8, a3 ; RV64MV-NEXT: vslidedown.vi v8, v8, 1 -; RV64MV-NEXT: li a1, -1 -; RV64MV-NEXT: srli a1, a1, 31 -; RV64MV-NEXT: vand.vx v8, v8, a1 -; RV64MV-NEXT: lui a2, 32 -; RV64MV-NEXT: addi a2, a2, 256 -; RV64MV-NEXT: vmv.s.x v10, a2 -; RV64MV-NEXT: vsext.vf8 v12, v10 -; RV64MV-NEXT: vmsne.vv v0, v8, v12 +; RV64MV-NEXT: vand.vx v8, v8, a4 +; RV64MV-NEXT: vmsne.vv v0, v8, v10 ; RV64MV-NEXT: vmv.v.i v8, 0 ; RV64MV-NEXT: vmerge.vim v8, v8, -1, v0 ; RV64MV-NEXT: vslidedown.vi v10, v8, 2 -; RV64MV-NEXT: vmv.x.s a2, v10 -; RV64MV-NEXT: slli a3, a2, 31 -; RV64MV-NEXT: srli a3, a3, 61 -; RV64MV-NEXT: vmv.x.s a4, v8 -; RV64MV-NEXT: and a1, a4, a1 +; RV64MV-NEXT: vmv.x.s a1, v8 ; RV64MV-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64MV-NEXT: vslidedown.vi v8, v8, 1 -; RV64MV-NEXT: vmv.x.s a4, v8 -; RV64MV-NEXT: slli a5, a4, 33 -; RV64MV-NEXT: or a1, a1, a5 +; RV64MV-NEXT: vmv.x.s a2, v10 +; RV64MV-NEXT: and a1, a1, a4 +; RV64MV-NEXT: vmv.x.s a3, v8 +; RV64MV-NEXT: slli a4, a2, 31 +; RV64MV-NEXT: slli a5, a3, 33 ; RV64MV-NEXT: slli a2, a2, 2 -; RV64MV-NEXT: slli a4, a4, 31 -; RV64MV-NEXT: srli a4, a4, 62 -; RV64MV-NEXT: or a2, a4, a2 +; RV64MV-NEXT: slli a3, a3, 31 +; RV64MV-NEXT: srli a4, a4, 61 +; RV64MV-NEXT: or a1, a1, a5 +; RV64MV-NEXT: srli a3, a3, 62 +; RV64MV-NEXT: or a2, a3, a2 ; RV64MV-NEXT: sd a1, 0(a0) ; RV64MV-NEXT: sw a2, 8(a0) -; RV64MV-NEXT: sb a3, 12(a0) +; RV64MV-NEXT: sb a4, 12(a0) ; RV64MV-NEXT: ret %ld = load <3 x i33>, ptr %X %srem = srem <3 x i33> %ld, diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index c7e57021b90dc..cf65d4e0cf805 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -53,50 +53,50 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: fold_srem_vec_1: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 0(a1) -; RV32IM-NEXT: lh a3, 4(a1) +; RV32IM-NEXT: lh 
a3, 0(a1) +; RV32IM-NEXT: lh a2, 4(a1) ; RV32IM-NEXT: lh a4, 8(a1) ; RV32IM-NEXT: lh a1, 12(a1) ; RV32IM-NEXT: lui a5, 706409 +; RV32IM-NEXT: lui a6, 507375 +; RV32IM-NEXT: lui a7, 342392 +; RV32IM-NEXT: lui t0, 780943 ; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a5, a2, a5 -; RV32IM-NEXT: add a5, a5, a2 -; RV32IM-NEXT: srli a6, a5, 31 -; RV32IM-NEXT: srli a5, a5, 6 -; RV32IM-NEXT: add a5, a5, a6 -; RV32IM-NEXT: li a6, 95 -; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a2, a2, a5 -; RV32IM-NEXT: lui a5, 507375 -; RV32IM-NEXT: addi a5, a5, 1981 +; RV32IM-NEXT: addi a6, a6, 1981 +; RV32IM-NEXT: addi a7, a7, 669 +; RV32IM-NEXT: addi t0, t0, 1809 ; RV32IM-NEXT: mulh a5, a3, a5 -; RV32IM-NEXT: sub a5, a5, a3 -; RV32IM-NEXT: srli a6, a5, 31 +; RV32IM-NEXT: mulh a6, a2, a6 +; RV32IM-NEXT: mulh a7, a4, a7 +; RV32IM-NEXT: mulh t0, a1, t0 +; RV32IM-NEXT: add a5, a5, a3 +; RV32IM-NEXT: sub a6, a6, a2 +; RV32IM-NEXT: srli t1, a7, 31 +; RV32IM-NEXT: srli a7, a7, 5 +; RV32IM-NEXT: add a7, a7, t1 +; RV32IM-NEXT: srli t1, t0, 31 +; RV32IM-NEXT: srli t0, t0, 8 +; RV32IM-NEXT: add t0, t0, t1 +; RV32IM-NEXT: srli t1, a5, 31 ; RV32IM-NEXT: srli a5, a5, 6 -; RV32IM-NEXT: add a5, a5, a6 -; RV32IM-NEXT: li a6, -124 -; RV32IM-NEXT: mul a5, a5, a6 +; RV32IM-NEXT: add a5, a5, t1 +; RV32IM-NEXT: srli t1, a6, 31 +; RV32IM-NEXT: srli a6, a6, 6 +; RV32IM-NEXT: add a6, a6, t1 +; RV32IM-NEXT: li t1, 98 +; RV32IM-NEXT: mul a7, a7, t1 +; RV32IM-NEXT: li t1, -1003 +; RV32IM-NEXT: mul t0, t0, t1 +; RV32IM-NEXT: li t1, 95 +; RV32IM-NEXT: mul a5, a5, t1 +; RV32IM-NEXT: li t1, -124 +; RV32IM-NEXT: mul a6, a6, t1 +; RV32IM-NEXT: sub a4, a4, a7 +; RV32IM-NEXT: sub a1, a1, t0 ; RV32IM-NEXT: sub a3, a3, a5 -; RV32IM-NEXT: lui a5, 342392 -; RV32IM-NEXT: addi a5, a5, 669 -; RV32IM-NEXT: mulh a5, a4, a5 -; RV32IM-NEXT: srli a6, a5, 31 -; RV32IM-NEXT: srli a5, a5, 5 -; RV32IM-NEXT: add a5, a5, a6 -; RV32IM-NEXT: li a6, 98 -; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a4, a4, a5 -; RV32IM-NEXT: lui a5, 780943 -; RV32IM-NEXT: addi a5, a5, 1809 -; RV32IM-NEXT: mulh a5, a1, a5 -; RV32IM-NEXT: srli a6, a5, 31 -; RV32IM-NEXT: srli a5, a5, 8 -; RV32IM-NEXT: add a5, a5, a6 -; RV32IM-NEXT: li a6, -1003 -; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a1, a1, a5 -; RV32IM-NEXT: sh a2, 0(a0) -; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sub a2, a2, a6 +; RV32IM-NEXT: sh a3, 0(a0) +; RV32IM-NEXT: sh a2, 2(a0) ; RV32IM-NEXT: sh a4, 4(a0) ; RV32IM-NEXT: sh a1, 6(a0) ; RV32IM-NEXT: ret @@ -145,51 +145,51 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_srem_vec_1: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lui a2, %hi(.LCPI0_0) -; RV64IM-NEXT: ld a2, %lo(.LCPI0_0)(a2) ; RV64IM-NEXT: lh a3, 0(a1) -; RV64IM-NEXT: lh a4, 8(a1) -; RV64IM-NEXT: lh a5, 16(a1) +; RV64IM-NEXT: lh a2, 8(a1) +; RV64IM-NEXT: lh a4, 16(a1) ; RV64IM-NEXT: lh a1, 24(a1) -; RV64IM-NEXT: mulh a2, a3, a2 -; RV64IM-NEXT: add a2, a2, a3 -; RV64IM-NEXT: srli a6, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a2, a2, a6 +; RV64IM-NEXT: lui a5, %hi(.LCPI0_0) ; RV64IM-NEXT: lui a6, %hi(.LCPI0_1) +; RV64IM-NEXT: lui a7, %hi(.LCPI0_2) +; RV64IM-NEXT: lui t0, %hi(.LCPI0_3) +; RV64IM-NEXT: ld a5, %lo(.LCPI0_0)(a5) ; RV64IM-NEXT: ld a6, %lo(.LCPI0_1)(a6) -; RV64IM-NEXT: li a7, 95 -; RV64IM-NEXT: mul a2, a2, a7 -; RV64IM-NEXT: subw a3, a3, a2 -; RV64IM-NEXT: mulh a2, a4, a6 -; RV64IM-NEXT: sub a2, a2, a4 -; RV64IM-NEXT: srli a6, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a2, a2, a6 -; RV64IM-NEXT: lui a6, %hi(.LCPI0_2) -; 
RV64IM-NEXT: ld a6, %lo(.LCPI0_2)(a6) -; RV64IM-NEXT: li a7, -124 -; RV64IM-NEXT: mul a2, a2, a7 -; RV64IM-NEXT: subw a4, a4, a2 -; RV64IM-NEXT: mulh a2, a5, a6 -; RV64IM-NEXT: srli a6, a2, 63 -; RV64IM-NEXT: srli a2, a2, 5 -; RV64IM-NEXT: add a2, a2, a6 -; RV64IM-NEXT: lui a6, %hi(.LCPI0_3) -; RV64IM-NEXT: ld a6, %lo(.LCPI0_3)(a6) -; RV64IM-NEXT: li a7, 98 -; RV64IM-NEXT: mul a2, a2, a7 -; RV64IM-NEXT: subw a5, a5, a2 -; RV64IM-NEXT: mulh a2, a1, a6 -; RV64IM-NEXT: srli a6, a2, 63 -; RV64IM-NEXT: srli a2, a2, 7 -; RV64IM-NEXT: add a2, a2, a6 -; RV64IM-NEXT: li a6, -1003 -; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: ld a7, %lo(.LCPI0_2)(a7) +; RV64IM-NEXT: ld t0, %lo(.LCPI0_3)(t0) +; RV64IM-NEXT: mulh a5, a3, a5 +; RV64IM-NEXT: mulh a6, a2, a6 +; RV64IM-NEXT: mulh a7, a4, a7 +; RV64IM-NEXT: mulh t0, a1, t0 +; RV64IM-NEXT: add a5, a5, a3 +; RV64IM-NEXT: sub a6, a6, a2 +; RV64IM-NEXT: srli t1, a7, 63 +; RV64IM-NEXT: srli a7, a7, 5 +; RV64IM-NEXT: add a7, a7, t1 +; RV64IM-NEXT: srli t1, t0, 63 +; RV64IM-NEXT: srli t0, t0, 7 +; RV64IM-NEXT: add t0, t0, t1 +; RV64IM-NEXT: srli t1, a5, 63 +; RV64IM-NEXT: srli a5, a5, 6 +; RV64IM-NEXT: add a5, a5, t1 +; RV64IM-NEXT: srli t1, a6, 63 +; RV64IM-NEXT: srli a6, a6, 6 +; RV64IM-NEXT: add a6, a6, t1 +; RV64IM-NEXT: li t1, 98 +; RV64IM-NEXT: mul a7, a7, t1 +; RV64IM-NEXT: li t1, -1003 +; RV64IM-NEXT: mul t0, t0, t1 +; RV64IM-NEXT: li t1, 95 +; RV64IM-NEXT: mul a5, a5, t1 +; RV64IM-NEXT: li t1, -124 +; RV64IM-NEXT: mul a6, a6, t1 +; RV64IM-NEXT: subw a4, a4, a7 +; RV64IM-NEXT: subw a1, a1, t0 +; RV64IM-NEXT: subw a3, a3, a5 +; RV64IM-NEXT: subw a2, a2, a6 ; RV64IM-NEXT: sh a3, 0(a0) -; RV64IM-NEXT: sh a4, 2(a0) -; RV64IM-NEXT: sh a5, 4(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a4, 4(a0) ; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, @@ -248,33 +248,33 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; RV32IM-NEXT: lui a5, 706409 ; RV32IM-NEXT: addi a5, a5, 389 ; RV32IM-NEXT: mulh a6, a2, a5 -; RV32IM-NEXT: add a6, a6, a2 -; RV32IM-NEXT: srli a7, a6, 31 -; RV32IM-NEXT: srli a6, a6, 6 -; RV32IM-NEXT: add a6, a6, a7 -; RV32IM-NEXT: li a7, 95 -; RV32IM-NEXT: mul a6, a6, a7 -; RV32IM-NEXT: sub a2, a2, a6 -; RV32IM-NEXT: mulh a6, a3, a5 -; RV32IM-NEXT: add a6, a6, a3 -; RV32IM-NEXT: srli t0, a6, 31 -; RV32IM-NEXT: srli a6, a6, 6 -; RV32IM-NEXT: add a6, a6, t0 -; RV32IM-NEXT: mul a6, a6, a7 -; RV32IM-NEXT: sub a3, a3, a6 -; RV32IM-NEXT: mulh a6, a4, a5 -; RV32IM-NEXT: add a6, a6, a4 -; RV32IM-NEXT: srli t0, a6, 31 -; RV32IM-NEXT: srli a6, a6, 6 -; RV32IM-NEXT: add a6, a6, t0 -; RV32IM-NEXT: mul a6, a6, a7 -; RV32IM-NEXT: sub a4, a4, a6 +; RV32IM-NEXT: mulh a7, a3, a5 +; RV32IM-NEXT: mulh t0, a4, a5 ; RV32IM-NEXT: mulh a5, a1, a5 +; RV32IM-NEXT: add a6, a6, a2 +; RV32IM-NEXT: add a7, a7, a3 +; RV32IM-NEXT: add t0, t0, a4 ; RV32IM-NEXT: add a5, a5, a1 -; RV32IM-NEXT: srli a6, a5, 31 +; RV32IM-NEXT: srli t1, a6, 31 +; RV32IM-NEXT: srli a6, a6, 6 +; RV32IM-NEXT: add a6, a6, t1 +; RV32IM-NEXT: srli t1, a7, 31 +; RV32IM-NEXT: srli a7, a7, 6 +; RV32IM-NEXT: add a7, a7, t1 +; RV32IM-NEXT: srli t1, t0, 31 +; RV32IM-NEXT: srli t0, t0, 6 +; RV32IM-NEXT: add t0, t0, t1 +; RV32IM-NEXT: srli t1, a5, 31 ; RV32IM-NEXT: srli a5, a5, 6 -; RV32IM-NEXT: add a5, a5, a6 -; RV32IM-NEXT: mul a5, a5, a7 +; RV32IM-NEXT: add a5, a5, t1 +; RV32IM-NEXT: li t1, 95 +; RV32IM-NEXT: mul a6, a6, t1 +; RV32IM-NEXT: mul a7, a7, t1 +; RV32IM-NEXT: mul t0, t0, t1 +; RV32IM-NEXT: mul a5, a5, t1 +; RV32IM-NEXT: sub a2, a2, 
a6 +; RV32IM-NEXT: sub a3, a3, a7 +; RV32IM-NEXT: sub a4, a4, t0 ; RV32IM-NEXT: sub a1, a1, a5 ; RV32IM-NEXT: sh a2, 0(a0) ; RV32IM-NEXT: sh a3, 2(a0) @@ -333,33 +333,33 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; RV64IM-NEXT: lh a5, 16(a1) ; RV64IM-NEXT: lh a1, 24(a1) ; RV64IM-NEXT: mulh a6, a3, a2 -; RV64IM-NEXT: add a6, a6, a3 -; RV64IM-NEXT: srli a7, a6, 63 -; RV64IM-NEXT: srli a6, a6, 6 -; RV64IM-NEXT: add a6, a6, a7 -; RV64IM-NEXT: li a7, 95 -; RV64IM-NEXT: mul a6, a6, a7 -; RV64IM-NEXT: subw a3, a3, a6 -; RV64IM-NEXT: mulh a6, a4, a2 -; RV64IM-NEXT: add a6, a6, a4 -; RV64IM-NEXT: srli t0, a6, 63 -; RV64IM-NEXT: srli a6, a6, 6 -; RV64IM-NEXT: add a6, a6, t0 -; RV64IM-NEXT: mul a6, a6, a7 -; RV64IM-NEXT: subw a4, a4, a6 -; RV64IM-NEXT: mulh a6, a5, a2 -; RV64IM-NEXT: add a6, a6, a5 -; RV64IM-NEXT: srli t0, a6, 63 -; RV64IM-NEXT: srli a6, a6, 6 -; RV64IM-NEXT: add a6, a6, t0 -; RV64IM-NEXT: mul a6, a6, a7 -; RV64IM-NEXT: subw a5, a5, a6 +; RV64IM-NEXT: mulh a7, a4, a2 +; RV64IM-NEXT: mulh t0, a5, a2 ; RV64IM-NEXT: mulh a2, a1, a2 +; RV64IM-NEXT: add a6, a6, a3 +; RV64IM-NEXT: add a7, a7, a4 +; RV64IM-NEXT: add t0, t0, a5 ; RV64IM-NEXT: add a2, a2, a1 -; RV64IM-NEXT: srli a6, a2, 63 +; RV64IM-NEXT: srli t1, a6, 63 +; RV64IM-NEXT: srli a6, a6, 6 +; RV64IM-NEXT: add a6, a6, t1 +; RV64IM-NEXT: srli t1, a7, 63 +; RV64IM-NEXT: srli a7, a7, 6 +; RV64IM-NEXT: add a7, a7, t1 +; RV64IM-NEXT: srli t1, t0, 63 +; RV64IM-NEXT: srli t0, t0, 6 +; RV64IM-NEXT: add t0, t0, t1 +; RV64IM-NEXT: srli t1, a2, 63 ; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a2, a2, a6 -; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: add a2, a2, t1 +; RV64IM-NEXT: li t1, 95 +; RV64IM-NEXT: mul a6, a6, t1 +; RV64IM-NEXT: mul a7, a7, t1 +; RV64IM-NEXT: mul t0, t0, t1 +; RV64IM-NEXT: mul a2, a2, t1 +; RV64IM-NEXT: subw a3, a3, a6 +; RV64IM-NEXT: subw a4, a4, a7 +; RV64IM-NEXT: subw a5, a5, t0 ; RV64IM-NEXT: subw a1, a1, a2 ; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: sh a4, 2(a0) @@ -445,49 +445,49 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: combine_srem_sdiv: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 12(a1) -; RV32IM-NEXT: lh a3, 0(a1) -; RV32IM-NEXT: lh a4, 4(a1) -; RV32IM-NEXT: lh a1, 8(a1) +; RV32IM-NEXT: lh a2, 0(a1) +; RV32IM-NEXT: lh a3, 4(a1) +; RV32IM-NEXT: lh a4, 8(a1) +; RV32IM-NEXT: lh a1, 12(a1) ; RV32IM-NEXT: lui a5, 706409 +; RV32IM-NEXT: li a6, 95 ; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a6, a2, a5 -; RV32IM-NEXT: add a6, a6, a2 -; RV32IM-NEXT: srli a7, a6, 31 -; RV32IM-NEXT: srai a6, a6, 6 -; RV32IM-NEXT: add a6, a6, a7 -; RV32IM-NEXT: li a7, 95 -; RV32IM-NEXT: mul t0, a6, a7 -; RV32IM-NEXT: mulh t1, a1, a5 -; RV32IM-NEXT: add t1, t1, a1 -; RV32IM-NEXT: srli t2, t1, 31 +; RV32IM-NEXT: mulh a7, a1, a5 +; RV32IM-NEXT: mulh t0, a4, a5 +; RV32IM-NEXT: mulh t1, a3, a5 +; RV32IM-NEXT: mulh a5, a2, a5 +; RV32IM-NEXT: add a7, a7, a1 +; RV32IM-NEXT: add t0, t0, a4 +; RV32IM-NEXT: add t1, t1, a3 +; RV32IM-NEXT: add a5, a5, a2 +; RV32IM-NEXT: srli t2, a7, 31 +; RV32IM-NEXT: srai a7, a7, 6 +; RV32IM-NEXT: srli t3, t0, 31 +; RV32IM-NEXT: srai t0, t0, 6 +; RV32IM-NEXT: srli t4, t1, 31 ; RV32IM-NEXT: srai t1, t1, 6 -; RV32IM-NEXT: add t1, t1, t2 -; RV32IM-NEXT: mul t2, t1, a7 -; RV32IM-NEXT: mulh t3, a4, a5 -; RV32IM-NEXT: add t3, t3, a4 -; RV32IM-NEXT: srli t4, t3, 31 -; RV32IM-NEXT: srai t3, t3, 6 -; RV32IM-NEXT: add t3, t3, t4 -; RV32IM-NEXT: mul t4, t3, a7 -; RV32IM-NEXT: mulh a5, a3, a5 -; RV32IM-NEXT: add a5, a5, a3 ; RV32IM-NEXT: srli t5, a5, 31 ; 
RV32IM-NEXT: srai a5, a5, 6 +; RV32IM-NEXT: add a7, a7, t2 +; RV32IM-NEXT: add t0, t0, t3 +; RV32IM-NEXT: add t1, t1, t4 ; RV32IM-NEXT: add a5, a5, t5 -; RV32IM-NEXT: mul a7, a5, a7 -; RV32IM-NEXT: add a3, a3, a5 -; RV32IM-NEXT: sub a3, a3, a7 -; RV32IM-NEXT: add a4, a4, t3 -; RV32IM-NEXT: sub a4, a4, t4 -; RV32IM-NEXT: add a1, a1, t1 +; RV32IM-NEXT: mul t2, a7, a6 +; RV32IM-NEXT: mul t3, t0, a6 +; RV32IM-NEXT: mul t4, t1, a6 +; RV32IM-NEXT: mul a6, a5, a6 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: add a3, a3, t1 +; RV32IM-NEXT: add a4, a4, t0 +; RV32IM-NEXT: add a1, a1, a7 +; RV32IM-NEXT: sub a2, a2, a6 +; RV32IM-NEXT: sub a3, a3, t4 +; RV32IM-NEXT: sub a4, a4, t3 ; RV32IM-NEXT: sub a1, a1, t2 -; RV32IM-NEXT: add a2, a2, a6 -; RV32IM-NEXT: sub a2, a2, t0 -; RV32IM-NEXT: sh a3, 0(a0) -; RV32IM-NEXT: sh a4, 2(a0) -; RV32IM-NEXT: sh a1, 4(a0) -; RV32IM-NEXT: sh a2, 6(a0) +; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a4, 4(a0) +; RV32IM-NEXT: sh a1, 6(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: combine_srem_sdiv: @@ -562,49 +562,49 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: combine_srem_sdiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a2, 24(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI2_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI2_0)(a3) -; RV64IM-NEXT: lh a4, 0(a1) -; RV64IM-NEXT: lh a5, 8(a1) -; RV64IM-NEXT: lh a1, 16(a1) -; RV64IM-NEXT: mulh a6, a2, a3 -; RV64IM-NEXT: add a6, a6, a2 -; RV64IM-NEXT: srli a7, a6, 63 -; RV64IM-NEXT: srai a6, a6, 6 -; RV64IM-NEXT: add a6, a6, a7 -; RV64IM-NEXT: li a7, 95 -; RV64IM-NEXT: mul t0, a6, a7 -; RV64IM-NEXT: mulh t1, a1, a3 +; RV64IM-NEXT: lh a2, 16(a1) +; RV64IM-NEXT: lh a3, 24(a1) +; RV64IM-NEXT: lui a4, %hi(.LCPI2_0) +; RV64IM-NEXT: ld a4, %lo(.LCPI2_0)(a4) +; RV64IM-NEXT: lh a5, 0(a1) +; RV64IM-NEXT: lh a1, 8(a1) +; RV64IM-NEXT: li a6, 95 +; RV64IM-NEXT: mulh a7, a3, a4 +; RV64IM-NEXT: mulh t0, a2, a4 +; RV64IM-NEXT: mulh t1, a1, a4 +; RV64IM-NEXT: mulh a4, a5, a4 +; RV64IM-NEXT: add a7, a7, a3 +; RV64IM-NEXT: add t0, t0, a2 ; RV64IM-NEXT: add t1, t1, a1 -; RV64IM-NEXT: srli t2, t1, 63 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: srli t2, a7, 63 +; RV64IM-NEXT: srai a7, a7, 6 +; RV64IM-NEXT: srli t3, t0, 63 +; RV64IM-NEXT: srai t0, t0, 6 +; RV64IM-NEXT: srli t4, t1, 63 ; RV64IM-NEXT: srai t1, t1, 6 -; RV64IM-NEXT: add t1, t1, t2 -; RV64IM-NEXT: mul t2, t1, a7 -; RV64IM-NEXT: mulh t3, a5, a3 -; RV64IM-NEXT: add t3, t3, a5 -; RV64IM-NEXT: srli t4, t3, 63 -; RV64IM-NEXT: srai t3, t3, 6 -; RV64IM-NEXT: add t3, t3, t4 -; RV64IM-NEXT: mul t4, t3, a7 -; RV64IM-NEXT: mulh a3, a4, a3 -; RV64IM-NEXT: add a3, a3, a4 -; RV64IM-NEXT: srli t5, a3, 63 -; RV64IM-NEXT: srai a3, a3, 6 -; RV64IM-NEXT: add a3, a3, t5 -; RV64IM-NEXT: mul a7, a3, a7 -; RV64IM-NEXT: add a3, a4, a3 -; RV64IM-NEXT: subw a3, a3, a7 -; RV64IM-NEXT: add a5, a5, t3 -; RV64IM-NEXT: subw a4, a5, t4 +; RV64IM-NEXT: srli t5, a4, 63 +; RV64IM-NEXT: srai a4, a4, 6 +; RV64IM-NEXT: add a7, a7, t2 +; RV64IM-NEXT: add t0, t0, t3 +; RV64IM-NEXT: add t1, t1, t4 +; RV64IM-NEXT: add a4, a4, t5 +; RV64IM-NEXT: mul t2, a7, a6 +; RV64IM-NEXT: mul t3, t0, a6 +; RV64IM-NEXT: mul t4, t1, a6 +; RV64IM-NEXT: mul a6, a4, a6 +; RV64IM-NEXT: add a4, a5, a4 ; RV64IM-NEXT: add a1, a1, t1 -; RV64IM-NEXT: subw a1, a1, t2 -; RV64IM-NEXT: add a2, a2, a6 -; RV64IM-NEXT: subw a2, a2, t0 -; RV64IM-NEXT: sh a3, 0(a0) -; RV64IM-NEXT: sh a4, 2(a0) -; RV64IM-NEXT: sh a1, 4(a0) -; RV64IM-NEXT: sh a2, 6(a0) +; RV64IM-NEXT: add a2, a2, t0 +; RV64IM-NEXT: add a3, a3, a7 +; 
RV64IM-NEXT: subw a4, a4, a6 +; RV64IM-NEXT: subw a1, a1, t4 +; RV64IM-NEXT: subw a2, a2, t3 +; RV64IM-NEXT: subw a3, a3, t2 +; RV64IM-NEXT: sh a4, 0(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a3, 6(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, @@ -666,21 +666,21 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; RV32IM-NEXT: srli a6, a5, 31 ; RV32IM-NEXT: srli a5, a5, 6 ; RV32IM-NEXT: add a5, a5, a6 +; RV32IM-NEXT: srli a6, a1, 26 +; RV32IM-NEXT: add a6, a1, a6 +; RV32IM-NEXT: andi a6, a6, -64 +; RV32IM-NEXT: sub a1, a1, a6 +; RV32IM-NEXT: srli a6, a2, 27 +; RV32IM-NEXT: add a6, a2, a6 +; RV32IM-NEXT: andi a6, a6, -32 +; RV32IM-NEXT: sub a2, a2, a6 +; RV32IM-NEXT: srli a6, a3, 29 +; RV32IM-NEXT: add a6, a3, a6 +; RV32IM-NEXT: andi a6, a6, -8 +; RV32IM-NEXT: sub a3, a3, a6 ; RV32IM-NEXT: li a6, 95 ; RV32IM-NEXT: mul a5, a5, a6 ; RV32IM-NEXT: sub a4, a4, a5 -; RV32IM-NEXT: srli a5, a1, 26 -; RV32IM-NEXT: add a5, a1, a5 -; RV32IM-NEXT: andi a5, a5, -64 -; RV32IM-NEXT: sub a1, a1, a5 -; RV32IM-NEXT: srli a5, a2, 27 -; RV32IM-NEXT: add a5, a2, a5 -; RV32IM-NEXT: andi a5, a5, -32 -; RV32IM-NEXT: sub a2, a2, a5 -; RV32IM-NEXT: srli a5, a3, 29 -; RV32IM-NEXT: add a5, a3, a5 -; RV32IM-NEXT: andi a5, a5, -8 -; RV32IM-NEXT: sub a3, a3, a5 ; RV32IM-NEXT: sh a1, 0(a0) ; RV32IM-NEXT: sh a2, 2(a0) ; RV32IM-NEXT: sh a3, 4(a0) @@ -728,36 +728,36 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_srem_power_of_two: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a2, 24(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI3_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI3_0)(a3) +; RV64IM-NEXT: lh a2, 0(a1) +; RV64IM-NEXT: lh a3, 8(a1) ; RV64IM-NEXT: lh a4, 16(a1) -; RV64IM-NEXT: lh a5, 8(a1) -; RV64IM-NEXT: lh a1, 0(a1) -; RV64IM-NEXT: mulh a3, a2, a3 -; RV64IM-NEXT: add a3, a3, a2 -; RV64IM-NEXT: srli a6, a3, 63 -; RV64IM-NEXT: srli a3, a3, 6 -; RV64IM-NEXT: add a3, a3, a6 +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: lui a5, %hi(.LCPI3_0) +; RV64IM-NEXT: ld a5, %lo(.LCPI3_0)(a5) +; RV64IM-NEXT: srli a6, a2, 58 +; RV64IM-NEXT: add a6, a2, a6 +; RV64IM-NEXT: andi a6, a6, -64 +; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: srli a6, a3, 59 +; RV64IM-NEXT: add a6, a3, a6 +; RV64IM-NEXT: andi a6, a6, -32 +; RV64IM-NEXT: subw a3, a3, a6 +; RV64IM-NEXT: srli a6, a4, 61 +; RV64IM-NEXT: mulh a5, a1, a5 +; RV64IM-NEXT: add a6, a4, a6 +; RV64IM-NEXT: add a5, a5, a1 +; RV64IM-NEXT: andi a6, a6, -8 +; RV64IM-NEXT: subw a4, a4, a6 +; RV64IM-NEXT: srli a6, a5, 63 +; RV64IM-NEXT: srli a5, a5, 6 +; RV64IM-NEXT: add a5, a5, a6 ; RV64IM-NEXT: li a6, 95 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: subw a2, a2, a3 -; RV64IM-NEXT: srli a3, a1, 58 -; RV64IM-NEXT: add a3, a1, a3 -; RV64IM-NEXT: andi a3, a3, -64 -; RV64IM-NEXT: subw a1, a1, a3 -; RV64IM-NEXT: srli a3, a5, 59 -; RV64IM-NEXT: add a3, a5, a3 -; RV64IM-NEXT: andi a3, a3, -32 -; RV64IM-NEXT: subw a5, a5, a3 -; RV64IM-NEXT: srli a3, a4, 61 -; RV64IM-NEXT: add a3, a4, a3 -; RV64IM-NEXT: andi a3, a3, -8 -; RV64IM-NEXT: subw a4, a4, a3 -; RV64IM-NEXT: sh a1, 0(a0) -; RV64IM-NEXT: sh a5, 2(a0) +; RV64IM-NEXT: mul a5, a5, a6 +; RV64IM-NEXT: subw a1, a1, a5 +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a3, 2(a0) ; RV64IM-NEXT: sh a4, 4(a0) -; RV64IM-NEXT: sh a2, 6(a0) +; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -807,35 +807,35 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind { ; RV32IM-NEXT: lh a3, 8(a1) ; 
RV32IM-NEXT: lh a1, 12(a1) ; RV32IM-NEXT: lui a4, 820904 +; RV32IM-NEXT: lui a5, 729444 +; RV32IM-NEXT: lui a6, 395996 ; RV32IM-NEXT: addi a4, a4, -1903 +; RV32IM-NEXT: addi a5, a5, 713 +; RV32IM-NEXT: addi a6, a6, -2009 ; RV32IM-NEXT: mulh a4, a2, a4 +; RV32IM-NEXT: mulh a5, a3, a5 +; RV32IM-NEXT: mulh a6, a1, a6 ; RV32IM-NEXT: add a4, a4, a2 -; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: add a5, a5, a3 +; RV32IM-NEXT: srli a7, a6, 31 +; RV32IM-NEXT: srli a6, a6, 11 +; RV32IM-NEXT: add a6, a6, a7 +; RV32IM-NEXT: srli a7, a4, 31 ; RV32IM-NEXT: srli a4, a4, 9 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: li a5, 654 -; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: add a4, a4, a7 +; RV32IM-NEXT: srli a7, a5, 31 +; RV32IM-NEXT: srli a5, a5, 4 +; RV32IM-NEXT: add a5, a5, a7 +; RV32IM-NEXT: lui a7, 1 +; RV32IM-NEXT: addi a7, a7, 1327 +; RV32IM-NEXT: mul a6, a6, a7 +; RV32IM-NEXT: li a7, 654 +; RV32IM-NEXT: mul a4, a4, a7 +; RV32IM-NEXT: li a7, 23 +; RV32IM-NEXT: mul a5, a5, a7 +; RV32IM-NEXT: sub a1, a1, a6 ; RV32IM-NEXT: sub a2, a2, a4 -; RV32IM-NEXT: lui a4, 729444 -; RV32IM-NEXT: addi a4, a4, 713 -; RV32IM-NEXT: mulh a4, a3, a4 -; RV32IM-NEXT: add a4, a4, a3 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 4 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: li a5, 23 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: lui a4, 395996 -; RV32IM-NEXT: addi a4, a4, -2009 -; RV32IM-NEXT: mulh a4, a1, a4 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 11 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: lui a5, 1 -; RV32IM-NEXT: addi a5, a5, 1327 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: sub a3, a3, a5 ; RV32IM-NEXT: sh zero, 0(a0) ; RV32IM-NEXT: sh a2, 2(a0) ; RV32IM-NEXT: sh a3, 4(a0) @@ -880,40 +880,40 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_srem_one: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lui a2, %hi(.LCPI4_0) -; RV64IM-NEXT: ld a2, %lo(.LCPI4_0)(a2) +; RV64IM-NEXT: lh a2, 8(a1) ; RV64IM-NEXT: lh a3, 16(a1) -; RV64IM-NEXT: lh a4, 8(a1) ; RV64IM-NEXT: lh a1, 24(a1) -; RV64IM-NEXT: mulh a2, a3, a2 -; RV64IM-NEXT: add a2, a2, a3 -; RV64IM-NEXT: srli a5, a2, 63 -; RV64IM-NEXT: srli a2, a2, 4 -; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: lui a4, %hi(.LCPI4_0) ; RV64IM-NEXT: lui a5, %hi(.LCPI4_1) +; RV64IM-NEXT: lui a6, %hi(.LCPI4_2) +; RV64IM-NEXT: ld a4, %lo(.LCPI4_0)(a4) ; RV64IM-NEXT: ld a5, %lo(.LCPI4_1)(a5) -; RV64IM-NEXT: li a6, 23 -; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: subw a3, a3, a2 -; RV64IM-NEXT: mulh a2, a4, a5 -; RV64IM-NEXT: srli a5, a2, 63 -; RV64IM-NEXT: srli a2, a2, 8 -; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: lui a5, %hi(.LCPI4_2) -; RV64IM-NEXT: ld a5, %lo(.LCPI4_2)(a5) -; RV64IM-NEXT: li a6, 654 -; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: subw a4, a4, a2 -; RV64IM-NEXT: mulh a2, a1, a5 -; RV64IM-NEXT: srli a5, a2, 63 -; RV64IM-NEXT: srli a2, a2, 11 -; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: lui a5, 1 -; RV64IM-NEXT: addi a5, a5, 1327 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: ld a6, %lo(.LCPI4_2)(a6) +; RV64IM-NEXT: mulh a4, a3, a4 +; RV64IM-NEXT: mulh a5, a2, a5 +; RV64IM-NEXT: mulh a6, a1, a6 +; RV64IM-NEXT: add a4, a4, a3 +; RV64IM-NEXT: srli a7, a5, 63 +; RV64IM-NEXT: srli a5, a5, 8 +; RV64IM-NEXT: add a5, a5, a7 +; RV64IM-NEXT: srli a7, a6, 63 +; RV64IM-NEXT: srli a6, a6, 11 +; RV64IM-NEXT: add a6, a6, a7 +; RV64IM-NEXT: srli a7, a4, 63 +; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: add a4, 
a4, a7 +; RV64IM-NEXT: li a7, 654 +; RV64IM-NEXT: mul a5, a5, a7 +; RV64IM-NEXT: lui a7, 1 +; RV64IM-NEXT: addi a7, a7, 1327 +; RV64IM-NEXT: mul a6, a6, a7 +; RV64IM-NEXT: li a7, 23 +; RV64IM-NEXT: mul a4, a4, a7 +; RV64IM-NEXT: subw a2, a2, a5 +; RV64IM-NEXT: subw a1, a1, a6 +; RV64IM-NEXT: subw a3, a3, a4 ; RV64IM-NEXT: sh zero, 0(a0) -; RV64IM-NEXT: sh a4, 2(a0) +; RV64IM-NEXT: sh a2, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) ; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: ret @@ -931,8 +931,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lh a2, 4(a1) ; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lh a2, 4(a1) ; RV32I-NEXT: lh a0, 8(a1) ; RV32I-NEXT: lh s1, 12(a1) ; RV32I-NEXT: srli a1, a2, 17 @@ -965,30 +965,30 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; RV32IM-NEXT: lh a3, 8(a1) ; RV32IM-NEXT: lh a1, 12(a1) ; RV32IM-NEXT: lui a4, 729444 +; RV32IM-NEXT: lui a5, 395996 +; RV32IM-NEXT: lui a6, 8 ; RV32IM-NEXT: addi a4, a4, 713 +; RV32IM-NEXT: addi a5, a5, -2009 ; RV32IM-NEXT: mulh a4, a3, a4 +; RV32IM-NEXT: mulh a5, a1, a5 +; RV32IM-NEXT: srli a7, a5, 31 +; RV32IM-NEXT: srli a5, a5, 11 +; RV32IM-NEXT: add a5, a5, a7 +; RV32IM-NEXT: srli a7, a2, 17 ; RV32IM-NEXT: add a4, a4, a3 -; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: add a7, a2, a7 +; RV32IM-NEXT: and a6, a7, a6 +; RV32IM-NEXT: srli a7, a4, 31 ; RV32IM-NEXT: srli a4, a4, 4 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: li a5, 23 -; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: add a4, a4, a7 +; RV32IM-NEXT: lui a7, 1 +; RV32IM-NEXT: addi a7, a7, 1327 +; RV32IM-NEXT: mul a5, a5, a7 +; RV32IM-NEXT: sub a2, a2, a6 +; RV32IM-NEXT: li a6, 23 +; RV32IM-NEXT: mul a4, a4, a6 +; RV32IM-NEXT: sub a1, a1, a5 ; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: lui a4, 395996 -; RV32IM-NEXT: addi a4, a4, -2009 -; RV32IM-NEXT: mulh a4, a1, a4 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 11 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: lui a5, 1 -; RV32IM-NEXT: addi a5, a5, 1327 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 -; RV32IM-NEXT: srli a4, a2, 17 -; RV32IM-NEXT: add a4, a2, a4 -; RV32IM-NEXT: lui a5, 8 -; RV32IM-NEXT: and a4, a4, a5 -; RV32IM-NEXT: sub a2, a2, a4 ; RV32IM-NEXT: sh zero, 0(a0) ; RV32IM-NEXT: sh a2, 2(a0) ; RV32IM-NEXT: sh a3, 4(a0) @@ -1003,8 +1003,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lh a2, 8(a1) ; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lh a2, 8(a1) ; RV64I-NEXT: lh a0, 16(a1) ; RV64I-NEXT: lh s1, 24(a1) ; RV64I-NEXT: srli a1, a2, 49 @@ -1033,36 +1033,36 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_urem_i16_smax: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lui a2, %hi(.LCPI5_0) -; RV64IM-NEXT: ld a2, %lo(.LCPI5_0)(a2) +; RV64IM-NEXT: lh a2, 8(a1) ; RV64IM-NEXT: lh a3, 16(a1) -; RV64IM-NEXT: lh a4, 8(a1) ; RV64IM-NEXT: lh a1, 24(a1) -; RV64IM-NEXT: mulh a2, a3, a2 -; RV64IM-NEXT: add a2, a2, a3 -; RV64IM-NEXT: srli a5, a2, 63 -; RV64IM-NEXT: srli a2, a2, 4 -; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: lui a4, %hi(.LCPI5_0) ; RV64IM-NEXT: lui a5, %hi(.LCPI5_1) ; RV64IM-NEXT: ld a5, %lo(.LCPI5_1)(a5) +; RV64IM-NEXT: lui a6, 8 +; RV64IM-NEXT: ld a4, %lo(.LCPI5_0)(a4) +; 
RV64IM-NEXT: srli a7, a2, 49 +; RV64IM-NEXT: mulh a5, a1, a5 +; RV64IM-NEXT: add a7, a2, a7 +; RV64IM-NEXT: and a6, a7, a6 +; RV64IM-NEXT: srli a7, a5, 63 +; RV64IM-NEXT: srli a5, a5, 11 +; RV64IM-NEXT: add a5, a5, a7 +; RV64IM-NEXT: mulh a4, a3, a4 +; RV64IM-NEXT: add a4, a4, a3 +; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: srli a6, a4, 63 +; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: add a4, a4, a6 +; RV64IM-NEXT: lui a6, 1 +; RV64IM-NEXT: addi a6, a6, 1327 +; RV64IM-NEXT: mul a5, a5, a6 ; RV64IM-NEXT: li a6, 23 -; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: subw a3, a3, a2 -; RV64IM-NEXT: mulh a2, a1, a5 -; RV64IM-NEXT: srli a5, a2, 63 -; RV64IM-NEXT: srli a2, a2, 11 -; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: lui a5, 1 -; RV64IM-NEXT: addi a5, a5, 1327 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: subw a1, a1, a2 -; RV64IM-NEXT: srli a2, a4, 49 -; RV64IM-NEXT: add a2, a4, a2 -; RV64IM-NEXT: lui a5, 8 -; RV64IM-NEXT: and a2, a2, a5 -; RV64IM-NEXT: subw a4, a4, a2 +; RV64IM-NEXT: mul a4, a4, a6 +; RV64IM-NEXT: subw a1, a1, a5 +; RV64IM-NEXT: subw a3, a3, a4 ; RV64IM-NEXT: sh zero, 0(a0) -; RV64IM-NEXT: sh a4, 2(a0) +; RV64IM-NEXT: sh a2, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) ; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: ret @@ -1250,40 +1250,40 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_srem_i64: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lui a2, %hi(.LCPI6_0) -; RV64IM-NEXT: ld a2, %lo(.LCPI6_0)(a2) +; RV64IM-NEXT: ld a2, 8(a1) ; RV64IM-NEXT: ld a3, 16(a1) -; RV64IM-NEXT: ld a4, 8(a1) ; RV64IM-NEXT: ld a1, 24(a1) -; RV64IM-NEXT: mulh a2, a3, a2 -; RV64IM-NEXT: add a2, a2, a3 -; RV64IM-NEXT: srli a5, a2, 63 -; RV64IM-NEXT: srai a2, a2, 4 -; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: lui a4, %hi(.LCPI6_0) ; RV64IM-NEXT: lui a5, %hi(.LCPI6_1) +; RV64IM-NEXT: lui a6, %hi(.LCPI6_2) +; RV64IM-NEXT: ld a4, %lo(.LCPI6_0)(a4) ; RV64IM-NEXT: ld a5, %lo(.LCPI6_1)(a5) -; RV64IM-NEXT: li a6, 23 -; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: sub a3, a3, a2 -; RV64IM-NEXT: mulh a2, a4, a5 -; RV64IM-NEXT: srli a5, a2, 63 -; RV64IM-NEXT: srai a2, a2, 8 -; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: lui a5, %hi(.LCPI6_2) -; RV64IM-NEXT: ld a5, %lo(.LCPI6_2)(a5) -; RV64IM-NEXT: li a6, 654 -; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: sub a4, a4, a2 -; RV64IM-NEXT: mulh a2, a1, a5 -; RV64IM-NEXT: srli a5, a2, 63 -; RV64IM-NEXT: srai a2, a2, 11 -; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: lui a5, 1 -; RV64IM-NEXT: addiw a5, a5, 1327 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a1, a1, a2 +; RV64IM-NEXT: ld a6, %lo(.LCPI6_2)(a6) +; RV64IM-NEXT: mulh a4, a3, a4 +; RV64IM-NEXT: mulh a5, a2, a5 +; RV64IM-NEXT: mulh a6, a1, a6 +; RV64IM-NEXT: add a4, a4, a3 +; RV64IM-NEXT: srli a7, a5, 63 +; RV64IM-NEXT: srai a5, a5, 8 +; RV64IM-NEXT: add a5, a5, a7 +; RV64IM-NEXT: srli a7, a6, 63 +; RV64IM-NEXT: srai a6, a6, 11 +; RV64IM-NEXT: add a6, a6, a7 +; RV64IM-NEXT: srli a7, a4, 63 +; RV64IM-NEXT: srai a4, a4, 4 +; RV64IM-NEXT: add a4, a4, a7 +; RV64IM-NEXT: li a7, 654 +; RV64IM-NEXT: mul a5, a5, a7 +; RV64IM-NEXT: lui a7, 1 +; RV64IM-NEXT: addiw a7, a7, 1327 +; RV64IM-NEXT: mul a6, a6, a7 +; RV64IM-NEXT: li a7, 23 +; RV64IM-NEXT: mul a4, a4, a7 +; RV64IM-NEXT: sub a2, a2, a5 +; RV64IM-NEXT: sub a1, a1, a6 +; RV64IM-NEXT: sub a3, a3, a4 ; RV64IM-NEXT: sd zero, 0(a0) -; RV64IM-NEXT: sd a4, 8(a0) +; RV64IM-NEXT: sd a2, 8(a0) ; RV64IM-NEXT: sd a3, 16(a0) ; RV64IM-NEXT: sd a1, 24(a0) ; RV64IM-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll 
b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll index d113afa769931..0499992b71778 100644 --- a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll @@ -44,11 +44,11 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind { ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: sext.w a0, a0 ; RV64IZbb-NEXT: mulw a1, a1, a2 +; RV64IZbb-NEXT: lui a2, 524288 ; RV64IZbb-NEXT: sub a0, a0, a1 -; RV64IZbb-NEXT: lui a1, 524288 -; RV64IZbb-NEXT: addiw a2, a1, -1 -; RV64IZbb-NEXT: min a0, a0, a2 -; RV64IZbb-NEXT: max a0, a0, a1 +; RV64IZbb-NEXT: addiw a1, a2, -1 +; RV64IZbb-NEXT: min a0, a0, a1 +; RV64IZbb-NEXT: max a0, a0, a2 ; RV64IZbb-NEXT: ret %a = mul i32 %y, %z %tmp = call i32 @llvm.ssub.sat.i32(i32 %x, i32 %a) @@ -98,13 +98,13 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; RV32I-LABEL: func16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 16 -; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: mul a1, a1, a2 +; RV32I-NEXT: lui a2, 8 +; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 8 -; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: addi a1, a2, -1 ; RV32I-NEXT: bge a0, a1, .LBB2_3 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: lui a1, 1048568 @@ -122,13 +122,13 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; RV64I-LABEL: func16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 48 -; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: mul a1, a1, a2 +; RV64I-NEXT: lui a2, 8 +; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: slli a1, a1, 48 ; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 8 -; RV64I-NEXT: addiw a1, a1, -1 +; RV64I-NEXT: addiw a1, a2, -1 ; RV64I-NEXT: bge a0, a1, .LBB2_3 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: lui a1, 1048568 @@ -147,11 +147,11 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; RV32IZbb: # %bb.0: ; RV32IZbb-NEXT: sext.h a0, a0 ; RV32IZbb-NEXT: mul a1, a1, a2 +; RV32IZbb-NEXT: lui a2, 8 ; RV32IZbb-NEXT: sext.h a1, a1 +; RV32IZbb-NEXT: addi a2, a2, -1 ; RV32IZbb-NEXT: sub a0, a0, a1 -; RV32IZbb-NEXT: lui a1, 8 -; RV32IZbb-NEXT: addi a1, a1, -1 -; RV32IZbb-NEXT: min a0, a0, a1 +; RV32IZbb-NEXT: min a0, a0, a2 ; RV32IZbb-NEXT: lui a1, 1048568 ; RV32IZbb-NEXT: max a0, a0, a1 ; RV32IZbb-NEXT: ret @@ -160,11 +160,11 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: sext.h a0, a0 ; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: lui a2, 8 ; RV64IZbb-NEXT: sext.h a1, a1 +; RV64IZbb-NEXT: addiw a2, a2, -1 ; RV64IZbb-NEXT: sub a0, a0, a1 -; RV64IZbb-NEXT: lui a1, 8 -; RV64IZbb-NEXT: addiw a1, a1, -1 -; RV64IZbb-NEXT: min a0, a0, a1 +; RV64IZbb-NEXT: min a0, a0, a2 ; RV64IZbb-NEXT: lui a1, 1048568 ; RV64IZbb-NEXT: max a0, a0, a1 ; RV64IZbb-NEXT: ret @@ -177,8 +177,8 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind { ; RV32I-LABEL: func8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: mul a1, a1, a2 +; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: sub a0, a0, a1 @@ -200,8 +200,8 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind { ; RV64I-LABEL: func8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 56 -; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: mul a1, a1, a2 +; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: slli a1, a1, 56 ; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: sub a0, a0, a1 @@ -224,10 +224,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind { ; RV32IZbb: # %bb.0: ; RV32IZbb-NEXT: sext.b a0, a0 ; 
RV32IZbb-NEXT: mul a1, a1, a2 +; RV32IZbb-NEXT: li a2, 127 ; RV32IZbb-NEXT: sext.b a1, a1 ; RV32IZbb-NEXT: sub a0, a0, a1 -; RV32IZbb-NEXT: li a1, 127 -; RV32IZbb-NEXT: min a0, a0, a1 +; RV32IZbb-NEXT: min a0, a0, a2 ; RV32IZbb-NEXT: li a1, -128 ; RV32IZbb-NEXT: max a0, a0, a1 ; RV32IZbb-NEXT: ret @@ -236,10 +236,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind { ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: sext.b a0, a0 ; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: li a2, 127 ; RV64IZbb-NEXT: sext.b a1, a1 ; RV64IZbb-NEXT: sub a0, a0, a1 -; RV64IZbb-NEXT: li a1, 127 -; RV64IZbb-NEXT: min a0, a0, a1 +; RV64IZbb-NEXT: min a0, a0, a2 ; RV64IZbb-NEXT: li a1, -128 ; RV64IZbb-NEXT: max a0, a0, a1 ; RV64IZbb-NEXT: ret @@ -252,8 +252,8 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind { ; RV32I-LABEL: func4: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 28 -; RV32I-NEXT: srai a0, a0, 28 ; RV32I-NEXT: mul a1, a1, a2 +; RV32I-NEXT: srai a0, a0, 28 ; RV32I-NEXT: slli a1, a1, 28 ; RV32I-NEXT: srai a1, a1, 28 ; RV32I-NEXT: sub a0, a0, a1 @@ -275,8 +275,8 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind { ; RV64I-LABEL: func4: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 60 -; RV64I-NEXT: srai a0, a0, 60 ; RV64I-NEXT: mul a1, a1, a2 +; RV64I-NEXT: srai a0, a0, 60 ; RV64I-NEXT: slli a1, a1, 60 ; RV64I-NEXT: srai a1, a1, 60 ; RV64I-NEXT: sub a0, a0, a1 @@ -298,13 +298,13 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind { ; RV32IZbb-LABEL: func4: ; RV32IZbb: # %bb.0: ; RV32IZbb-NEXT: slli a0, a0, 28 -; RV32IZbb-NEXT: srai a0, a0, 28 ; RV32IZbb-NEXT: mul a1, a1, a2 +; RV32IZbb-NEXT: li a2, 7 +; RV32IZbb-NEXT: srai a0, a0, 28 ; RV32IZbb-NEXT: slli a1, a1, 28 ; RV32IZbb-NEXT: srai a1, a1, 28 ; RV32IZbb-NEXT: sub a0, a0, a1 -; RV32IZbb-NEXT: li a1, 7 -; RV32IZbb-NEXT: min a0, a0, a1 +; RV32IZbb-NEXT: min a0, a0, a2 ; RV32IZbb-NEXT: li a1, -8 ; RV32IZbb-NEXT: max a0, a0, a1 ; RV32IZbb-NEXT: ret @@ -312,13 +312,13 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind { ; RV64IZbb-LABEL: func4: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: slli a0, a0, 60 -; RV64IZbb-NEXT: srai a0, a0, 60 ; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: li a2, 7 +; RV64IZbb-NEXT: srai a0, a0, 60 ; RV64IZbb-NEXT: slli a1, a1, 60 ; RV64IZbb-NEXT: srai a1, a1, 60 ; RV64IZbb-NEXT: sub a0, a0, a1 -; RV64IZbb-NEXT: li a1, 7 -; RV64IZbb-NEXT: min a0, a0, a1 +; RV64IZbb-NEXT: min a0, a0, a2 ; RV64IZbb-NEXT: li a1, -8 ; RV64IZbb-NEXT: max a0, a0, a1 ; RV64IZbb-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/stack-store-check.ll b/llvm/test/CodeGen/RISCV/stack-store-check.ll index 052ccbf6e06f3..cd1aebfea5ce4 100644 --- a/llvm/test/CodeGen/RISCV/stack-store-check.ll +++ b/llvm/test/CodeGen/RISCV/stack-store-check.ll @@ -266,13 +266,13 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: lw a4, 156(sp) ; CHECK-NEXT: lw a5, 160(sp) ; CHECK-NEXT: lw a6, 164(sp) -; CHECK-NEXT: lui a0, 786400 +; CHECK-NEXT: lui a2, 786400 +; CHECK-NEXT: addi a0, sp, 104 +; CHECK-NEXT: addi a1, sp, 88 ; CHECK-NEXT: sw zero, 72(sp) ; CHECK-NEXT: sw zero, 76(sp) ; CHECK-NEXT: sw zero, 80(sp) -; CHECK-NEXT: sw a0, 84(sp) -; CHECK-NEXT: addi a0, sp, 104 -; CHECK-NEXT: addi a1, sp, 88 +; CHECK-NEXT: sw a2, 84(sp) ; CHECK-NEXT: addi a2, sp, 72 ; CHECK-NEXT: sw a3, 88(sp) ; CHECK-NEXT: sw a4, 92(sp) diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll index f7ef01b0958d8..366b37ac5d472 100644 --- a/llvm/test/CodeGen/RISCV/tail-calls.ll +++ b/llvm/test/CodeGen/RISCV/tail-calls.ll @@ -43,9 +43,9 @@ define void @caller_extern(ptr 
%src) optsize { ; CHECK-LARGE-ZICFILP-NEXT: lpad 0 ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi1: ; CHECK-LARGE-ZICFILP-NEXT: auipc a1, %pcrel_hi(.LCPI1_0) -; CHECK-LARGE-ZICFILP-NEXT: lw a1, %pcrel_lo(.Lpcrel_hi1)(a1) ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi2: ; CHECK-LARGE-ZICFILP-NEXT: auipc a2, %pcrel_hi(.LCPI1_1) +; CHECK-LARGE-ZICFILP-NEXT: lw a1, %pcrel_lo(.Lpcrel_hi1)(a1) ; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi2)(a2) ; CHECK-LARGE-ZICFILP-NEXT: li a2, 7 ; CHECK-LARGE-ZICFILP-NEXT: mv a3, a0 @@ -75,9 +75,9 @@ define void @caller_extern_pgso(ptr %src) !prof !14 { ; CHECK-LARGE-ZICFILP-NEXT: lpad 0 ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi3: ; CHECK-LARGE-ZICFILP-NEXT: auipc a1, %pcrel_hi(.LCPI2_0) -; CHECK-LARGE-ZICFILP-NEXT: lw a1, %pcrel_lo(.Lpcrel_hi3)(a1) ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi4: ; CHECK-LARGE-ZICFILP-NEXT: auipc a2, %pcrel_hi(.LCPI2_1) +; CHECK-LARGE-ZICFILP-NEXT: lw a1, %pcrel_lo(.Lpcrel_hi3)(a1) ; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi4)(a2) ; CHECK-LARGE-ZICFILP-NEXT: li a2, 7 ; CHECK-LARGE-ZICFILP-NEXT: mv a3, a0 @@ -465,9 +465,9 @@ define void @caller_nostruct() nounwind { ; CHECK-LARGE-ZICFILP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi13: ; CHECK-LARGE-ZICFILP-NEXT: auipc a0, %pcrel_hi(.LCPI11_0) -; CHECK-LARGE-ZICFILP-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi13)(a0) ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi14: ; CHECK-LARGE-ZICFILP-NEXT: auipc a1, %pcrel_hi(.LCPI11_1) +; CHECK-LARGE-ZICFILP-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi13)(a0) ; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi14)(a1) ; CHECK-LARGE-ZICFILP-NEXT: jalr t2 ; CHECK-LARGE-ZICFILP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/trunc-nsw-nuw.ll b/llvm/test/CodeGen/RISCV/trunc-nsw-nuw.ll index f270775adcc15..9f81ff8c8d31a 100644 --- a/llvm/test/CodeGen/RISCV/trunc-nsw-nuw.ll +++ b/llvm/test/CodeGen/RISCV/trunc-nsw-nuw.ll @@ -16,12 +16,12 @@ define signext i32 @trunc_nuw_nsw_urem(i64 %x) nounwind { ; CHECK-LABEL: trunc_nuw_nsw_urem: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a1, 210 +; CHECK-NEXT: lui a2, 2 ; CHECK-NEXT: addiw a1, a1, -1167 ; CHECK-NEXT: slli a1, a1, 12 ; CHECK-NEXT: addi a1, a1, 1881 ; CHECK-NEXT: mul a1, a0, a1 ; CHECK-NEXT: srli a1, a1, 45 -; CHECK-NEXT: lui a2, 2 ; CHECK-NEXT: addi a2, a2, 1808 ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: subw a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll index 23875a7ec5621..409114f8a9612 100644 --- a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll @@ -105,28 +105,28 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; RV32I-LABEL: func16: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a3, 16 -; RV32I-NEXT: addi a3, a3, -1 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: mul a1, a1, a2 -; RV32I-NEXT: and a1, a1, a3 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: bltu a0, a3, .LBB2_2 +; RV32I-NEXT: mul a2, a1, a2 +; RV32I-NEXT: addi a1, a3, -1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: and a2, a2, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: bltu a0, a1, .LBB2_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: .LBB2_2: ; RV32I-NEXT: ret ; ; RV64I-LABEL: func16: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a3, 16 -; RV64I-NEXT: addiw a3, a3, -1 -; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: mul a1, a1, a2 -; RV64I-NEXT: and a1, a1, a3 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: bltu a0, a3, .LBB2_2 +; RV64I-NEXT: mul a2, a1, a2 +; 
RV64I-NEXT: addiw a1, a3, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: and a2, a2, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: bltu a0, a1, .LBB2_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: mv a0, a3 +; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: .LBB2_2: ; RV64I-NEXT: ret ; @@ -134,22 +134,22 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; RV32IZbb: # %bb.0: ; RV32IZbb-NEXT: zext.h a0, a0 ; RV32IZbb-NEXT: mul a1, a1, a2 +; RV32IZbb-NEXT: lui a2, 16 ; RV32IZbb-NEXT: zext.h a1, a1 ; RV32IZbb-NEXT: add a0, a0, a1 -; RV32IZbb-NEXT: lui a1, 16 -; RV32IZbb-NEXT: addi a1, a1, -1 -; RV32IZbb-NEXT: minu a0, a0, a1 +; RV32IZbb-NEXT: addi a2, a2, -1 +; RV32IZbb-NEXT: minu a0, a0, a2 ; RV32IZbb-NEXT: ret ; ; RV64IZbb-LABEL: func16: ; RV64IZbb: # %bb.0: ; RV64IZbb-NEXT: zext.h a0, a0 ; RV64IZbb-NEXT: mul a1, a1, a2 +; RV64IZbb-NEXT: lui a2, 16 ; RV64IZbb-NEXT: zext.h a1, a1 ; RV64IZbb-NEXT: add a0, a0, a1 -; RV64IZbb-NEXT: lui a1, 16 -; RV64IZbb-NEXT: addiw a1, a1, -1 -; RV64IZbb-NEXT: minu a0, a0, a1 +; RV64IZbb-NEXT: addiw a2, a2, -1 +; RV64IZbb-NEXT: minu a0, a0, a2 ; RV64IZbb-NEXT: ret %a = mul i16 %y, %z %tmp = call i16 @llvm.uadd.sat.i16(i16 %x, i16 %a) diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll index dde69667b8ec3..d6fd4f15c4e53 100644 --- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll @@ -10,99 +10,101 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill -; RISCV32-NEXT: lw a3, 0(a1) -; RISCV32-NEXT: lw t2, 4(a1) -; RISCV32-NEXT: lw a4, 8(a1) -; RISCV32-NEXT: lw a5, 12(a1) -; RISCV32-NEXT: lw a1, 0(a2) -; RISCV32-NEXT: lw t0, 4(a2) -; RISCV32-NEXT: lw a6, 8(a2) -; RISCV32-NEXT: lw a7, 12(a2) -; RISCV32-NEXT: mulhu a2, a3, a1 -; RISCV32-NEXT: mul t1, t2, a1 -; RISCV32-NEXT: add a2, t1, a2 -; RISCV32-NEXT: sltu t1, a2, t1 -; RISCV32-NEXT: mulhu t3, t2, a1 -; RISCV32-NEXT: add t4, t3, t1 -; RISCV32-NEXT: mul t1, a3, t0 -; RISCV32-NEXT: add a2, t1, a2 -; RISCV32-NEXT: sltu t1, a2, t1 -; RISCV32-NEXT: mulhu t3, a3, t0 -; RISCV32-NEXT: add t1, t3, t1 -; RISCV32-NEXT: add t5, t4, t1 -; RISCV32-NEXT: mul t6, t2, t0 -; RISCV32-NEXT: add s0, t6, t5 -; RISCV32-NEXT: mul t1, a6, a3 -; RISCV32-NEXT: mul s3, a4, a1 -; RISCV32-NEXT: add s4, s3, t1 -; RISCV32-NEXT: add t1, s0, s4 -; RISCV32-NEXT: sltu t3, t1, s0 -; RISCV32-NEXT: sltu s0, s0, t6 -; RISCV32-NEXT: sltu t4, t5, t4 -; RISCV32-NEXT: mulhu t5, t2, t0 -; RISCV32-NEXT: add t4, t5, t4 -; RISCV32-NEXT: add s0, t4, s0 -; RISCV32-NEXT: mul t4, t2, a6 -; RISCV32-NEXT: mul t5, a7, a3 -; RISCV32-NEXT: add t4, t5, t4 -; RISCV32-NEXT: mulhu s1, a6, a3 -; RISCV32-NEXT: add s2, s1, t4 -; RISCV32-NEXT: mul t4, t0, a4 -; RISCV32-NEXT: mul t5, a5, a1 -; RISCV32-NEXT: add t4, t5, t4 -; RISCV32-NEXT: mulhu t5, a4, a1 -; RISCV32-NEXT: add t6, t5, t4 -; RISCV32-NEXT: add t4, t6, s2 -; RISCV32-NEXT: sltu s3, s4, s3 -; RISCV32-NEXT: add t4, t4, s3 -; RISCV32-NEXT: add t4, s0, t4 -; RISCV32-NEXT: add t4, t4, t3 -; RISCV32-NEXT: beq t4, s0, .LBB0_2 +; RISCV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill +; RISCV32-NEXT: lw a4, 0(a1) +; RISCV32-NEXT: lw t0, 4(a1) +; RISCV32-NEXT: lw a3, 8(a1) +; RISCV32-NEXT: lw a1, 12(a1) +; RISCV32-NEXT: lw a6, 0(a2) +; RISCV32-NEXT: lw a5, 4(a2) +; RISCV32-NEXT: lw a7, 8(a2) +; RISCV32-NEXT: lw a2, 12(a2) +; 
RISCV32-NEXT: mulhu t1, a4, a6 +; RISCV32-NEXT: mul t2, t0, a6 +; RISCV32-NEXT: mulhu t3, t0, a6 +; RISCV32-NEXT: mul t4, a4, a5 +; RISCV32-NEXT: mulhu t5, a4, a5 +; RISCV32-NEXT: mul s2, t0, a5 +; RISCV32-NEXT: mul t6, a7, a4 +; RISCV32-NEXT: mul s3, a3, a6 +; RISCV32-NEXT: mul s0, t0, a7 +; RISCV32-NEXT: mul s1, a2, a4 +; RISCV32-NEXT: mul s4, a5, a3 +; RISCV32-NEXT: add s1, s1, s0 +; RISCV32-NEXT: mul s0, a1, a6 +; RISCV32-NEXT: add s4, s0, s4 +; RISCV32-NEXT: mulhu s5, t0, a5 +; RISCV32-NEXT: add t1, t2, t1 +; RISCV32-NEXT: sltu t2, t1, t2 +; RISCV32-NEXT: add t2, t3, t2 +; RISCV32-NEXT: mulhu s0, a7, a4 +; RISCV32-NEXT: add t1, t4, t1 +; RISCV32-NEXT: sltu t3, t1, t4 +; RISCV32-NEXT: add t3, t5, t3 +; RISCV32-NEXT: mulhu t5, a3, a6 +; RISCV32-NEXT: add t4, s3, t6 +; RISCV32-NEXT: add s1, s0, s1 +; RISCV32-NEXT: add t6, t5, s4 +; RISCV32-NEXT: sltu s3, t4, s3 +; RISCV32-NEXT: add t3, t2, t3 +; RISCV32-NEXT: sltu t2, t3, t2 +; RISCV32-NEXT: add s5, s5, t2 +; RISCV32-NEXT: add s4, t6, s1 +; RISCV32-NEXT: add t3, s2, t3 +; RISCV32-NEXT: add t2, t3, t4 +; RISCV32-NEXT: sltu s2, t3, s2 +; RISCV32-NEXT: sltu t4, t2, t3 +; RISCV32-NEXT: add s2, s5, s2 +; RISCV32-NEXT: add s3, s4, s3 +; RISCV32-NEXT: add t3, s2, s3 +; RISCV32-NEXT: add t3, t3, t4 +; RISCV32-NEXT: beq t3, s2, .LBB0_2 ; RISCV32-NEXT: # %bb.1: # %start -; RISCV32-NEXT: sltu t3, t4, s0 +; RISCV32-NEXT: sltu t4, t3, s2 ; RISCV32-NEXT: .LBB0_2: # %start -; RISCV32-NEXT: sltu s0, s2, s1 -; RISCV32-NEXT: snez s1, t2 -; RISCV32-NEXT: snez s2, a7 -; RISCV32-NEXT: and s1, s2, s1 -; RISCV32-NEXT: mulhu s2, a7, a3 -; RISCV32-NEXT: snez s2, s2 -; RISCV32-NEXT: or s1, s1, s2 -; RISCV32-NEXT: mulhu t2, t2, a6 -; RISCV32-NEXT: snez t2, t2 -; RISCV32-NEXT: or t2, s1, t2 -; RISCV32-NEXT: or t2, t2, s0 +; RISCV32-NEXT: sltu s0, s1, s0 +; RISCV32-NEXT: snez s1, t0 +; RISCV32-NEXT: snez s2, a2 ; RISCV32-NEXT: sltu t5, t6, t5 -; RISCV32-NEXT: snez t6, t0 -; RISCV32-NEXT: snez s0, a5 -; RISCV32-NEXT: and t6, s0, t6 -; RISCV32-NEXT: mulhu s0, a5, a1 -; RISCV32-NEXT: snez s0, s0 -; RISCV32-NEXT: or t6, t6, s0 -; RISCV32-NEXT: mulhu t0, t0, a4 +; RISCV32-NEXT: mulhu t6, a2, a4 +; RISCV32-NEXT: mulhu t0, t0, a7 +; RISCV32-NEXT: or a2, a7, a2 +; RISCV32-NEXT: snez a7, a5 +; RISCV32-NEXT: mul a4, a4, a6 +; RISCV32-NEXT: mulhu a6, a1, a6 +; RISCV32-NEXT: mulhu a5, a5, a3 +; RISCV32-NEXT: or a3, a3, a1 +; RISCV32-NEXT: snez a1, a1 +; RISCV32-NEXT: and s1, s2, s1 +; RISCV32-NEXT: snez t6, t6 ; RISCV32-NEXT: snez t0, t0 -; RISCV32-NEXT: or t0, t6, t0 -; RISCV32-NEXT: or t0, t0, t5 -; RISCV32-NEXT: or a6, a6, a7 +; RISCV32-NEXT: and a1, a1, a7 ; RISCV32-NEXT: snez a6, a6 -; RISCV32-NEXT: or a4, a4, a5 -; RISCV32-NEXT: snez a4, a4 -; RISCV32-NEXT: and a4, a4, a6 -; RISCV32-NEXT: or a4, a4, t0 -; RISCV32-NEXT: or a4, a4, t2 -; RISCV32-NEXT: or a4, a4, t3 -; RISCV32-NEXT: mul a1, a3, a1 -; RISCV32-NEXT: andi a4, a4, 1 -; RISCV32-NEXT: sw a1, 0(a0) -; RISCV32-NEXT: sw a2, 4(a0) -; RISCV32-NEXT: sw t1, 8(a0) -; RISCV32-NEXT: sw t4, 12(a0) -; RISCV32-NEXT: sb a4, 16(a0) +; RISCV32-NEXT: snez a5, a5 +; RISCV32-NEXT: snez a2, a2 +; RISCV32-NEXT: snez a3, a3 +; RISCV32-NEXT: or a7, s1, t6 +; RISCV32-NEXT: or a1, a1, a6 +; RISCV32-NEXT: and a2, a3, a2 +; RISCV32-NEXT: or a3, a7, t0 +; RISCV32-NEXT: or a1, a1, a5 +; RISCV32-NEXT: or a3, a3, s0 +; RISCV32-NEXT: or a1, a1, t5 +; RISCV32-NEXT: or a1, a2, a1 +; RISCV32-NEXT: or a1, a1, a3 +; RISCV32-NEXT: or a1, a1, t4 +; RISCV32-NEXT: andi a1, a1, 1 +; RISCV32-NEXT: sw a4, 0(a0) +; RISCV32-NEXT: sw t1, 4(a0) +; RISCV32-NEXT: 
sw t2, 8(a0) +; RISCV32-NEXT: sw t3, 12(a0) +; RISCV32-NEXT: sb a1, 16(a0) ; RISCV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload +; RISCV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload ; RISCV32-NEXT: addi sp, sp, 32 ; RISCV32-NEXT: ret start: diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll index a2f5e446b63bc..1cdfaa5c4154b 100644 --- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll +++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll @@ -133,50 +133,49 @@ define i64 @load_i64(ptr %p) { ; RV32I-LABEL: load_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a1, 1(a0) -; RV32I-NEXT: lbu a2, 0(a0) -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: lbu a4, 3(a0) +; RV32I-NEXT: lbu a2, 2(a0) +; RV32I-NEXT: lbu a3, 3(a0) +; RV32I-NEXT: lbu a4, 0(a0) ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a2, a4, a3 -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 5(a0) -; RV32I-NEXT: or a2, a2, a1 -; RV32I-NEXT: lbu a1, 6(a0) +; RV32I-NEXT: slli a2, a2, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu a5, 5(a0) +; RV32I-NEXT: or a2, a3, a2 +; RV32I-NEXT: lbu a3, 6(a0) ; RV32I-NEXT: lbu a0, 7(a0) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: or a1, a0, a3 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: or a3, a0, a3 +; RV32I-NEXT: or a0, a2, a1 +; RV32I-NEXT: or a1, a3, a4 ; RV32I-NEXT: ret ; ; RV64I-LABEL: load_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: lbu a1, 1(a0) -; RV64I-NEXT: lbu a2, 0(a0) -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: lbu a4, 3(a0) +; RV64I-NEXT: lbu a2, 2(a0) +; RV64I-NEXT: lbu a3, 3(a0) +; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: slli a3, a3, 16 -; RV64I-NEXT: slli a4, a4, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a2, 4(a0) -; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: slli a3, a3, 24 +; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: lbu a4, 4(a0) +; RV64I-NEXT: lbu a5, 5(a0) +; RV64I-NEXT: or a2, a3, a2 ; RV64I-NEXT: lbu a3, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a2, a4, a2 +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a3, a3, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -187,16 +186,16 @@ define i64 @load_i64(ptr %p) { ; RV32IZBKB-NEXT: lbu a2, 1(a0) ; RV32IZBKB-NEXT: lbu a3, 2(a0) ; RV32IZBKB-NEXT: lbu a4, 3(a0) -; RV32IZBKB-NEXT: lbu a5, 4(a0) -; RV32IZBKB-NEXT: lbu a6, 5(a0) -; RV32IZBKB-NEXT: lbu a7, 6(a0) -; RV32IZBKB-NEXT: lbu t0, 7(a0) -; RV32IZBKB-NEXT: packh a0, a3, a4 +; RV32IZBKB-NEXT: lbu a5, 5(a0) +; RV32IZBKB-NEXT: lbu a6, 6(a0) +; RV32IZBKB-NEXT: lbu a7, 7(a0) +; RV32IZBKB-NEXT: lbu a0, 4(a0) +; RV32IZBKB-NEXT: packh a3, a3, a4 ; RV32IZBKB-NEXT: packh a1, a1, a2 -; RV32IZBKB-NEXT: pack a0, a1, a0 -; 
RV32IZBKB-NEXT: packh a1, a7, t0 -; RV32IZBKB-NEXT: packh a2, a5, a6 -; RV32IZBKB-NEXT: pack a1, a2, a1 +; RV32IZBKB-NEXT: packh a2, a6, a7 +; RV32IZBKB-NEXT: packh a4, a0, a5 +; RV32IZBKB-NEXT: pack a0, a1, a3 +; RV32IZBKB-NEXT: pack a1, a4, a2 ; RV32IZBKB-NEXT: ret ; ; RV64IZBKB-LABEL: load_i64: @@ -205,18 +204,18 @@ define i64 @load_i64(ptr %p) { ; RV64IZBKB-NEXT: lbu a2, 5(a0) ; RV64IZBKB-NEXT: lbu a3, 6(a0) ; RV64IZBKB-NEXT: lbu a4, 7(a0) +; RV64IZBKB-NEXT: lbu a5, 0(a0) +; RV64IZBKB-NEXT: lbu a6, 1(a0) +; RV64IZBKB-NEXT: lbu a7, 2(a0) +; RV64IZBKB-NEXT: lbu a0, 3(a0) ; RV64IZBKB-NEXT: packh a1, a1, a2 ; RV64IZBKB-NEXT: packh a2, a3, a4 -; RV64IZBKB-NEXT: lbu a3, 0(a0) -; RV64IZBKB-NEXT: lbu a4, 1(a0) -; RV64IZBKB-NEXT: lbu a5, 2(a0) -; RV64IZBKB-NEXT: lbu a0, 3(a0) +; RV64IZBKB-NEXT: packh a3, a5, a6 +; RV64IZBKB-NEXT: packh a0, a7, a0 ; RV64IZBKB-NEXT: slli a2, a2, 16 -; RV64IZBKB-NEXT: or a1, a2, a1 -; RV64IZBKB-NEXT: packh a2, a3, a4 -; RV64IZBKB-NEXT: packh a0, a5, a0 ; RV64IZBKB-NEXT: slli a0, a0, 16 -; RV64IZBKB-NEXT: or a0, a0, a2 +; RV64IZBKB-NEXT: or a1, a2, a1 +; RV64IZBKB-NEXT: or a0, a0, a3 ; RV64IZBKB-NEXT: pack a0, a0, a1 ; RV64IZBKB-NEXT: ret ; @@ -306,17 +305,17 @@ define void @store_i64(ptr %p, i64 %v) { ; RV32I-NEXT: srli a3, a2, 24 ; RV32I-NEXT: srli a4, a2, 16 ; RV32I-NEXT: srli a5, a2, 8 +; RV32I-NEXT: srli a6, a1, 24 +; RV32I-NEXT: srli a7, a1, 16 ; RV32I-NEXT: sb a2, 4(a0) ; RV32I-NEXT: sb a5, 5(a0) ; RV32I-NEXT: sb a4, 6(a0) ; RV32I-NEXT: sb a3, 7(a0) -; RV32I-NEXT: srli a2, a1, 24 -; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: srli a4, a1, 8 +; RV32I-NEXT: srli a2, a1, 8 ; RV32I-NEXT: sb a1, 0(a0) -; RV32I-NEXT: sb a4, 1(a0) -; RV32I-NEXT: sb a3, 2(a0) -; RV32I-NEXT: sb a2, 3(a0) +; RV32I-NEXT: sb a2, 1(a0) +; RV32I-NEXT: sb a7, 2(a0) +; RV32I-NEXT: sb a6, 3(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: store_i64: @@ -325,17 +324,17 @@ define void @store_i64(ptr %p, i64 %v) { ; RV64I-NEXT: srli a3, a1, 48 ; RV64I-NEXT: srli a4, a1, 40 ; RV64I-NEXT: srli a5, a1, 32 +; RV64I-NEXT: srli a6, a1, 24 +; RV64I-NEXT: srli a7, a1, 16 ; RV64I-NEXT: sb a5, 4(a0) ; RV64I-NEXT: sb a4, 5(a0) ; RV64I-NEXT: sb a3, 6(a0) ; RV64I-NEXT: sb a2, 7(a0) -; RV64I-NEXT: srli a2, a1, 24 -; RV64I-NEXT: srli a3, a1, 16 -; RV64I-NEXT: srli a4, a1, 8 +; RV64I-NEXT: srli a2, a1, 8 ; RV64I-NEXT: sb a1, 0(a0) -; RV64I-NEXT: sb a4, 1(a0) -; RV64I-NEXT: sb a3, 2(a0) -; RV64I-NEXT: sb a2, 3(a0) +; RV64I-NEXT: sb a2, 1(a0) +; RV64I-NEXT: sb a7, 2(a0) +; RV64I-NEXT: sb a6, 3(a0) ; RV64I-NEXT: ret ; ; RV32IZBKB-LABEL: store_i64: @@ -343,17 +342,17 @@ define void @store_i64(ptr %p, i64 %v) { ; RV32IZBKB-NEXT: srli a3, a2, 24 ; RV32IZBKB-NEXT: srli a4, a2, 16 ; RV32IZBKB-NEXT: srli a5, a2, 8 +; RV32IZBKB-NEXT: srli a6, a1, 24 +; RV32IZBKB-NEXT: srli a7, a1, 16 ; RV32IZBKB-NEXT: sb a2, 4(a0) ; RV32IZBKB-NEXT: sb a5, 5(a0) ; RV32IZBKB-NEXT: sb a4, 6(a0) ; RV32IZBKB-NEXT: sb a3, 7(a0) -; RV32IZBKB-NEXT: srli a2, a1, 24 -; RV32IZBKB-NEXT: srli a3, a1, 16 -; RV32IZBKB-NEXT: srli a4, a1, 8 +; RV32IZBKB-NEXT: srli a2, a1, 8 ; RV32IZBKB-NEXT: sb a1, 0(a0) -; RV32IZBKB-NEXT: sb a4, 1(a0) -; RV32IZBKB-NEXT: sb a3, 2(a0) -; RV32IZBKB-NEXT: sb a2, 3(a0) +; RV32IZBKB-NEXT: sb a2, 1(a0) +; RV32IZBKB-NEXT: sb a7, 2(a0) +; RV32IZBKB-NEXT: sb a6, 3(a0) ; RV32IZBKB-NEXT: ret ; ; RV64IZBKB-LABEL: store_i64: @@ -362,17 +361,17 @@ define void @store_i64(ptr %p, i64 %v) { ; RV64IZBKB-NEXT: srli a3, a1, 48 ; RV64IZBKB-NEXT: srli a4, a1, 40 ; RV64IZBKB-NEXT: srli a5, a1, 32 +; RV64IZBKB-NEXT: srli a6, a1, 24 +; RV64IZBKB-NEXT: 
srli a7, a1, 16 ; RV64IZBKB-NEXT: sb a5, 4(a0) ; RV64IZBKB-NEXT: sb a4, 5(a0) ; RV64IZBKB-NEXT: sb a3, 6(a0) ; RV64IZBKB-NEXT: sb a2, 7(a0) -; RV64IZBKB-NEXT: srli a2, a1, 24 -; RV64IZBKB-NEXT: srli a3, a1, 16 -; RV64IZBKB-NEXT: srli a4, a1, 8 +; RV64IZBKB-NEXT: srli a2, a1, 8 ; RV64IZBKB-NEXT: sb a1, 0(a0) -; RV64IZBKB-NEXT: sb a4, 1(a0) -; RV64IZBKB-NEXT: sb a3, 2(a0) -; RV64IZBKB-NEXT: sb a2, 3(a0) +; RV64IZBKB-NEXT: sb a2, 1(a0) +; RV64IZBKB-NEXT: sb a7, 2(a0) +; RV64IZBKB-NEXT: sb a6, 3(a0) ; RV64IZBKB-NEXT: ret ; ; RV32I-FAST-LABEL: store_i64: @@ -546,25 +545,25 @@ define void @store_large_constant(ptr %x) { ; SLOW-NEXT: li a2, 220 ; SLOW-NEXT: li a3, 186 ; SLOW-NEXT: li a4, 152 +; SLOW-NEXT: li a5, 118 +; SLOW-NEXT: li a6, 84 +; SLOW-NEXT: li a7, 50 ; SLOW-NEXT: sb a4, 4(a0) ; SLOW-NEXT: sb a3, 5(a0) ; SLOW-NEXT: sb a2, 6(a0) ; SLOW-NEXT: sb a1, 7(a0) -; SLOW-NEXT: li a1, 118 -; SLOW-NEXT: li a2, 84 -; SLOW-NEXT: li a3, 50 -; SLOW-NEXT: li a4, 16 -; SLOW-NEXT: sb a4, 0(a0) -; SLOW-NEXT: sb a3, 1(a0) -; SLOW-NEXT: sb a2, 2(a0) -; SLOW-NEXT: sb a1, 3(a0) +; SLOW-NEXT: li a1, 16 +; SLOW-NEXT: sb a1, 0(a0) +; SLOW-NEXT: sb a7, 1(a0) +; SLOW-NEXT: sb a6, 2(a0) +; SLOW-NEXT: sb a5, 3(a0) ; SLOW-NEXT: ret ; ; RV32I-FAST-LABEL: store_large_constant: ; RV32I-FAST: # %bb.0: ; RV32I-FAST-NEXT: lui a1, 1043916 -; RV32I-FAST-NEXT: addi a1, a1, -1384 ; RV32I-FAST-NEXT: lui a2, 484675 +; RV32I-FAST-NEXT: addi a1, a1, -1384 ; RV32I-FAST-NEXT: addi a2, a2, 528 ; RV32I-FAST-NEXT: sw a2, 0(a0) ; RV32I-FAST-NEXT: sw a1, 4(a0) diff --git a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll index 22c0b798e1468..1517e524a7f78 100644 --- a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll @@ -208,10 +208,10 @@ define i64 @in64(i64 %x, i64 %y, i64 %mask) { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: andn a2, a2, a4 ; RV32ZBB-NEXT: and a0, a0, a4 -; RV32ZBB-NEXT: or a0, a0, a2 -; RV32ZBB-NEXT: andn a2, a3, a5 +; RV32ZBB-NEXT: andn a3, a3, a5 ; RV32ZBB-NEXT: and a1, a1, a5 -; RV32ZBB-NEXT: or a1, a1, a2 +; RV32ZBB-NEXT: or a0, a0, a2 +; RV32ZBB-NEXT: or a1, a1, a3 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: in64: diff --git a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll index 6530736304837..602df6831452c 100644 --- a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll +++ b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll @@ -11,17 +11,17 @@ define signext i32 @unroll_loop_cse() { ; CHECK-LABEL: unroll_loop_cse: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(x) -; CHECK-NEXT: lw a1, %lo(x)(a0) -; CHECK-NEXT: lui a0, %hi(check) -; CHECK-NEXT: lw a2, %lo(check)(a0) +; CHECK-NEXT: lui a1, %hi(check) +; CHECK-NEXT: lw a2, %lo(x)(a0) +; CHECK-NEXT: lw a1, %lo(check)(a1) ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: bne a1, a2, .LBB0_6 +; CHECK-NEXT: bne a2, a1, .LBB0_6 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a1, %hi(x) ; CHECK-NEXT: addi a1, a1, %lo(x) -; CHECK-NEXT: lw a3, 4(a1) ; CHECK-NEXT: lui a2, %hi(check) ; CHECK-NEXT: addi a2, a2, %lo(check) +; CHECK-NEXT: lw a3, 4(a1) ; CHECK-NEXT: lw a4, 4(a2) ; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.2: diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll index f83a933c0b5c8..af5121dfe180d 100644 --- a/llvm/test/CodeGen/RISCV/urem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -137,8 +137,8 @@ define i32 @combine_urem_udiv(i32 %x) nounwind { ; 
RV32IM-NEXT: sub a2, a0, a1 ; RV32IM-NEXT: srli a2, a2, 1 ; RV32IM-NEXT: add a1, a2, a1 -; RV32IM-NEXT: srli a1, a1, 6 ; RV32IM-NEXT: li a2, 95 +; RV32IM-NEXT: srli a1, a1, 6 ; RV32IM-NEXT: mul a2, a1, a2 ; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: sub a0, a0, a2 @@ -177,8 +177,8 @@ define i32 @combine_urem_udiv(i32 %x) nounwind { ; RV64IM-NEXT: subw a2, a0, a1 ; RV64IM-NEXT: srliw a2, a2, 1 ; RV64IM-NEXT: add a1, a2, a1 -; RV64IM-NEXT: srli a1, a1, 6 ; RV64IM-NEXT: li a2, 95 +; RV64IM-NEXT: srli a1, a1, 6 ; RV64IM-NEXT: mul a2, a1, a2 ; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: subw a0, a0, a2 diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index b887036372f7b..c73a18c8869d5 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -91,9 +91,9 @@ define i1 @test_urem_even(i27 %X) nounwind { ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: srli a0, a0, 6 ; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: lui a1, 2341 ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: srli a0, a0, 5 -; RV32-NEXT: lui a1, 2341 ; RV32-NEXT: addi a1, a1, -1755 ; RV32-NEXT: sltu a0, a0, a1 ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -111,9 +111,9 @@ define i1 @test_urem_even(i27 %X) nounwind { ; RV64-NEXT: slli a0, a0, 37 ; RV64-NEXT: srli a0, a0, 38 ; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: lui a1, 2341 ; RV64-NEXT: slli a0, a0, 37 ; RV64-NEXT: srli a0, a0, 37 -; RV64-NEXT: lui a1, 2341 ; RV64-NEXT: addiw a1, a1, -1755 ; RV64-NEXT: sltu a0, a0, a1 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -129,9 +129,9 @@ define i1 @test_urem_even(i27 %X) nounwind { ; RV32M-NEXT: slli a0, a0, 5 ; RV32M-NEXT: srli a0, a0, 6 ; RV32M-NEXT: or a0, a0, a1 +; RV32M-NEXT: lui a1, 2341 ; RV32M-NEXT: slli a0, a0, 5 ; RV32M-NEXT: srli a0, a0, 5 -; RV32M-NEXT: lui a1, 2341 ; RV32M-NEXT: addi a1, a1, -1755 ; RV32M-NEXT: sltu a0, a0, a1 ; RV32M-NEXT: ret @@ -145,9 +145,9 @@ define i1 @test_urem_even(i27 %X) nounwind { ; RV64M-NEXT: slli a0, a0, 37 ; RV64M-NEXT: srli a0, a0, 38 ; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: lui a1, 2341 ; RV64M-NEXT: slli a0, a0, 37 ; RV64M-NEXT: srli a0, a0, 37 -; RV64M-NEXT: lui a1, 2341 ; RV64M-NEXT: addiw a1, a1, -1755 ; RV64M-NEXT: sltu a0, a0, a1 ; RV64M-NEXT: ret @@ -161,9 +161,9 @@ define i1 @test_urem_even(i27 %X) nounwind { ; RV32MV-NEXT: slli a0, a0, 5 ; RV32MV-NEXT: srli a0, a0, 6 ; RV32MV-NEXT: or a0, a0, a1 +; RV32MV-NEXT: lui a1, 2341 ; RV32MV-NEXT: slli a0, a0, 5 ; RV32MV-NEXT: srli a0, a0, 5 -; RV32MV-NEXT: lui a1, 2341 ; RV32MV-NEXT: addi a1, a1, -1755 ; RV32MV-NEXT: sltu a0, a0, a1 ; RV32MV-NEXT: ret @@ -177,9 +177,9 @@ define i1 @test_urem_even(i27 %X) nounwind { ; RV64MV-NEXT: slli a0, a0, 37 ; RV64MV-NEXT: srli a0, a0, 38 ; RV64MV-NEXT: or a0, a0, a1 +; RV64MV-NEXT: lui a1, 2341 ; RV64MV-NEXT: slli a0, a0, 37 ; RV64MV-NEXT: srli a0, a0, 37 -; RV64MV-NEXT: lui a1, 2341 ; RV64MV-NEXT: addiw a1, a1, -1755 ; RV64MV-NEXT: sltu a0, a0, a1 ; RV64MV-NEXT: ret @@ -357,15 +357,15 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32-NEXT: mv a0, s2 ; RV32-NEXT: call __mulsi3 ; RV32-NEXT: addi a0, a0, -1463 -; RV32-NEXT: andi a0, a0, 2047 -; RV32-NEXT: sltiu a0, a0, 293 ; RV32-NEXT: addi s3, s3, -1 ; RV32-NEXT: addi s1, s1, -1 -; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: andi a0, a0, 2047 ; RV32-NEXT: andi a1, s3, 2047 +; RV32-NEXT: slli s1, s1, 22 +; RV32-NEXT: sltiu a0, a0, 293 +; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: andi a0, a0, 2047 ; RV32-NEXT: slli a0, a0, 11 -; 
RV32-NEXT: slli s1, s1, 22 ; RV32-NEXT: or a0, a0, s1 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: sw a0, 0(s0) @@ -413,14 +413,14 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64-NEXT: mv a0, s1 ; RV64-NEXT: call __muldi3 ; RV64-NEXT: addi a0, a0, -1638 -; RV64-NEXT: andi a0, a0, 2047 -; RV64-NEXT: sltiu a0, a0, 2 ; RV64-NEXT: addi s3, s3, -1 -; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: addi s2, s2, -1 +; RV64-NEXT: andi a0, a0, 2047 ; RV64-NEXT: andi a1, s3, 2047 ; RV64-NEXT: andi a2, s2, 2047 +; RV64-NEXT: sltiu a0, a0, 2 ; RV64-NEXT: slli a2, a2, 11 +; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: slli a0, a0, 22 ; RV64-NEXT: or a0, a2, a0 ; RV64-NEXT: or a0, a1, a0 @@ -440,39 +440,39 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32M: # %bb.0: ; RV32M-NEXT: lbu a1, 4(a0) ; RV32M-NEXT: lw a2, 0(a0) -; RV32M-NEXT: slli a1, a1, 10 -; RV32M-NEXT: srli a3, a2, 22 -; RV32M-NEXT: or a1, a3, a1 -; RV32M-NEXT: srli a3, a2, 11 -; RV32M-NEXT: andi a2, a2, 2047 -; RV32M-NEXT: li a4, 683 -; RV32M-NEXT: mul a2, a2, a4 -; RV32M-NEXT: slli a4, a2, 10 -; RV32M-NEXT: slli a2, a2, 21 -; RV32M-NEXT: srli a2, a2, 22 -; RV32M-NEXT: or a2, a2, a4 -; RV32M-NEXT: andi a2, a2, 2047 -; RV32M-NEXT: sltiu a2, a2, 342 +; RV32M-NEXT: li a3, 683 ; RV32M-NEXT: li a4, 819 +; RV32M-NEXT: slli a1, a1, 10 +; RV32M-NEXT: srli a5, a2, 22 +; RV32M-NEXT: or a1, a5, a1 +; RV32M-NEXT: andi a5, a2, 2047 +; RV32M-NEXT: mul a3, a5, a3 +; RV32M-NEXT: li a5, 1463 +; RV32M-NEXT: srli a2, a2, 11 +; RV32M-NEXT: mul a2, a2, a5 +; RV32M-NEXT: slli a5, a3, 10 +; RV32M-NEXT: slli a3, a3, 21 ; RV32M-NEXT: mul a1, a1, a4 +; RV32M-NEXT: addi a2, a2, -1463 +; RV32M-NEXT: srli a3, a3, 22 ; RV32M-NEXT: addi a1, a1, -1638 +; RV32M-NEXT: andi a2, a2, 2047 +; RV32M-NEXT: or a3, a3, a5 ; RV32M-NEXT: andi a1, a1, 2047 -; RV32M-NEXT: sltiu a1, a1, 2 -; RV32M-NEXT: xori a4, a1, 1 -; RV32M-NEXT: li a5, 1463 -; RV32M-NEXT: mul a3, a3, a5 -; RV32M-NEXT: addi a3, a3, -1463 +; RV32M-NEXT: sltiu a2, a2, 293 ; RV32M-NEXT: andi a3, a3, 2047 -; RV32M-NEXT: sltiu a3, a3, 293 +; RV32M-NEXT: sltiu a1, a1, 2 ; RV32M-NEXT: addi a2, a2, -1 +; RV32M-NEXT: sltiu a3, a3, 342 +; RV32M-NEXT: xori a4, a1, 1 ; RV32M-NEXT: addi a1, a1, -1 -; RV32M-NEXT: addi a3, a3, -1 ; RV32M-NEXT: andi a2, a2, 2047 -; RV32M-NEXT: andi a3, a3, 2047 -; RV32M-NEXT: slli a3, a3, 11 +; RV32M-NEXT: addi a3, a3, -1 +; RV32M-NEXT: slli a2, a2, 11 ; RV32M-NEXT: slli a1, a1, 22 -; RV32M-NEXT: or a1, a3, a1 +; RV32M-NEXT: andi a3, a3, 2047 ; RV32M-NEXT: or a1, a2, a1 +; RV32M-NEXT: or a1, a3, a1 ; RV32M-NEXT: sw a1, 0(a0) ; RV32M-NEXT: sb a4, 4(a0) ; RV32M-NEXT: ret @@ -481,38 +481,38 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64M: # %bb.0: ; RV64M-NEXT: lbu a1, 4(a0) ; RV64M-NEXT: lwu a2, 0(a0) +; RV64M-NEXT: li a3, 683 +; RV64M-NEXT: li a4, 1463 ; RV64M-NEXT: slli a1, a1, 32 ; RV64M-NEXT: or a1, a2, a1 -; RV64M-NEXT: srli a2, a1, 22 +; RV64M-NEXT: andi a2, a1, 2047 +; RV64M-NEXT: mul a2, a2, a3 ; RV64M-NEXT: srli a3, a1, 11 -; RV64M-NEXT: andi a1, a1, 2047 -; RV64M-NEXT: li a4, 683 -; RV64M-NEXT: mul a1, a1, a4 -; RV64M-NEXT: slli a4, a1, 10 -; RV64M-NEXT: slli a1, a1, 53 -; RV64M-NEXT: srli a1, a1, 54 -; RV64M-NEXT: or a1, a1, a4 -; RV64M-NEXT: andi a1, a1, 2047 -; RV64M-NEXT: sltiu a1, a1, 342 -; RV64M-NEXT: li a4, 1463 ; RV64M-NEXT: mul a3, a3, a4 +; RV64M-NEXT: li a4, 819 +; RV64M-NEXT: srli a1, a1, 22 +; RV64M-NEXT: mul a1, a1, a4 +; RV64M-NEXT: slli a4, a2, 10 +; RV64M-NEXT: slli a2, a2, 53 ; RV64M-NEXT: addi a3, a3, -1463 +; RV64M-NEXT: addi a1, a1, -1638 +; RV64M-NEXT: srli a2, 
a2, 54 ; RV64M-NEXT: andi a3, a3, 2047 +; RV64M-NEXT: andi a1, a1, 2047 +; RV64M-NEXT: or a2, a2, a4 ; RV64M-NEXT: sltiu a3, a3, 293 -; RV64M-NEXT: li a4, 819 -; RV64M-NEXT: mul a2, a2, a4 -; RV64M-NEXT: addi a2, a2, -1638 +; RV64M-NEXT: sltiu a1, a1, 2 ; RV64M-NEXT: andi a2, a2, 2047 -; RV64M-NEXT: sltiu a2, a2, 2 ; RV64M-NEXT: addi a1, a1, -1 -; RV64M-NEXT: addi a2, a2, -1 ; RV64M-NEXT: addi a3, a3, -1 -; RV64M-NEXT: andi a1, a1, 2047 +; RV64M-NEXT: sltiu a2, a2, 342 ; RV64M-NEXT: andi a3, a3, 2047 +; RV64M-NEXT: slli a1, a1, 22 +; RV64M-NEXT: addi a2, a2, -1 ; RV64M-NEXT: slli a3, a3, 11 -; RV64M-NEXT: slli a2, a2, 22 -; RV64M-NEXT: or a2, a3, a2 -; RV64M-NEXT: or a1, a1, a2 +; RV64M-NEXT: andi a2, a2, 2047 +; RV64M-NEXT: or a1, a3, a1 +; RV64M-NEXT: or a1, a2, a1 ; RV64M-NEXT: slli a2, a1, 31 ; RV64M-NEXT: srli a2, a2, 63 ; RV64M-NEXT: sw a1, 0(a0) @@ -523,58 +523,58 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32MV: # %bb.0: ; RV32MV-NEXT: lw a1, 0(a0) ; RV32MV-NEXT: lbu a2, 4(a0) -; RV32MV-NEXT: andi a3, a1, 2047 ; RV32MV-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV32MV-NEXT: vmv.v.x v8, a3 -; RV32MV-NEXT: slli a3, a1, 10 -; RV32MV-NEXT: srli a3, a3, 21 -; RV32MV-NEXT: vslide1down.vx v8, v8, a3 +; RV32MV-NEXT: vid.v v8 +; RV32MV-NEXT: lui a3, %hi(.LCPI4_0) +; RV32MV-NEXT: addi a3, a3, %lo(.LCPI4_0) +; RV32MV-NEXT: vle16.v v9, (a3) +; RV32MV-NEXT: andi a3, a1, 2047 ; RV32MV-NEXT: slli a2, a2, 10 -; RV32MV-NEXT: srli a1, a1, 22 -; RV32MV-NEXT: or a1, a1, a2 -; RV32MV-NEXT: andi a1, a1, 2047 -; RV32MV-NEXT: vslide1down.vx v8, v8, a1 -; RV32MV-NEXT: lui a1, %hi(.LCPI4_0) -; RV32MV-NEXT: addi a1, a1, %lo(.LCPI4_0) -; RV32MV-NEXT: vle16.v v9, (a1) -; RV32MV-NEXT: vslidedown.vi v8, v8, 1 -; RV32MV-NEXT: vid.v v10 -; RV32MV-NEXT: vsub.vv v8, v8, v10 -; RV32MV-NEXT: vmul.vv v8, v8, v9 -; RV32MV-NEXT: vadd.vv v9, v8, v8 -; RV32MV-NEXT: lui a1, 41121 -; RV32MV-NEXT: addi a1, a1, -1527 -; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32MV-NEXT: vmv.s.x v10, a1 -; RV32MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV32MV-NEXT: vsext.vf2 v11, v10 -; RV32MV-NEXT: vsll.vv v9, v9, v11 +; RV32MV-NEXT: vmv.v.x v10, a3 +; RV32MV-NEXT: srli a3, a1, 22 +; RV32MV-NEXT: or a2, a3, a2 +; RV32MV-NEXT: lui a3, 41121 +; RV32MV-NEXT: slli a1, a1, 10 +; RV32MV-NEXT: srli a1, a1, 21 +; RV32MV-NEXT: vslide1down.vx v10, v10, a1 ; RV32MV-NEXT: li a1, 2047 -; RV32MV-NEXT: vand.vx v8, v8, a1 ; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32MV-NEXT: vmv.v.i v10, 1 +; RV32MV-NEXT: vmv.v.i v11, 1 +; RV32MV-NEXT: andi a2, a2, 2047 ; RV32MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV32MV-NEXT: vsext.vf2 v11, v10 +; RV32MV-NEXT: vslide1down.vx v10, v10, a2 ; RV32MV-NEXT: lui a2, %hi(.LCPI4_1) ; RV32MV-NEXT: addi a2, a2, %lo(.LCPI4_1) +; RV32MV-NEXT: addi a3, a3, -1527 +; RV32MV-NEXT: vsext.vf2 v12, v11 +; RV32MV-NEXT: vslidedown.vi v10, v10, 1 +; RV32MV-NEXT: vsub.vv v8, v10, v8 +; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV32MV-NEXT: vmv.s.x v10, a3 +; RV32MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV32MV-NEXT: vsext.vf2 v11, v10 +; RV32MV-NEXT: vmul.vv v8, v8, v9 +; RV32MV-NEXT: vadd.vv v9, v8, v8 +; RV32MV-NEXT: vsll.vv v9, v9, v11 ; RV32MV-NEXT: vle16.v v10, (a2) -; RV32MV-NEXT: vsrl.vv v8, v8, v11 +; RV32MV-NEXT: vand.vx v8, v8, a1 +; RV32MV-NEXT: vsrl.vv v8, v8, v12 ; RV32MV-NEXT: vor.vv v8, v8, v9 ; RV32MV-NEXT: vand.vx v8, v8, a1 ; RV32MV-NEXT: vmsltu.vv v0, v10, v8 ; RV32MV-NEXT: vmv.v.i v8, 0 ; RV32MV-NEXT: vmerge.vim v8, v8, -1, v0 ; RV32MV-NEXT: vslidedown.vi v9, 
v8, 2 -; RV32MV-NEXT: vmv.x.s a1, v9 -; RV32MV-NEXT: slli a2, a1, 21 -; RV32MV-NEXT: srli a2, a2, 31 -; RV32MV-NEXT: vmv.x.s a3, v8 -; RV32MV-NEXT: andi a3, a3, 2047 +; RV32MV-NEXT: vmv.x.s a1, v8 ; RV32MV-NEXT: vslidedown.vi v8, v8, 1 -; RV32MV-NEXT: slli a1, a1, 22 -; RV32MV-NEXT: or a1, a3, a1 +; RV32MV-NEXT: vmv.x.s a2, v9 +; RV32MV-NEXT: andi a1, a1, 2047 +; RV32MV-NEXT: slli a3, a2, 22 +; RV32MV-NEXT: or a1, a1, a3 ; RV32MV-NEXT: vmv.x.s a3, v8 +; RV32MV-NEXT: slli a2, a2, 21 ; RV32MV-NEXT: andi a3, a3, 2047 +; RV32MV-NEXT: srli a2, a2, 31 ; RV32MV-NEXT: slli a3, a3, 11 ; RV32MV-NEXT: or a1, a1, a3 ; RV32MV-NEXT: sw a1, 0(a0) @@ -583,57 +583,57 @@ define void @test_urem_vec(ptr %X) nounwind { ; ; RV64MV-LABEL: test_urem_vec: ; RV64MV: # %bb.0: -; RV64MV-NEXT: lbu a1, 4(a0) -; RV64MV-NEXT: lwu a2, 0(a0) -; RV64MV-NEXT: slli a1, a1, 32 -; RV64MV-NEXT: or a1, a2, a1 +; RV64MV-NEXT: lwu a1, 0(a0) +; RV64MV-NEXT: lbu a2, 4(a0) +; RV64MV-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64MV-NEXT: vid.v v8 +; RV64MV-NEXT: lui a3, %hi(.LCPI4_0) +; RV64MV-NEXT: addi a3, a3, %lo(.LCPI4_0) +; RV64MV-NEXT: vle16.v v9, (a3) +; RV64MV-NEXT: lui a3, 41121 +; RV64MV-NEXT: slli a2, a2, 32 +; RV64MV-NEXT: or a1, a1, a2 +; RV64MV-NEXT: andi a2, a1, 2047 +; RV64MV-NEXT: vmv.v.x v10, a2 ; RV64MV-NEXT: slli a2, a1, 42 ; RV64MV-NEXT: srli a2, a2, 53 -; RV64MV-NEXT: andi a3, a1, 2047 -; RV64MV-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64MV-NEXT: vmv.v.x v8, a3 -; RV64MV-NEXT: vslide1down.vx v8, v8, a2 -; RV64MV-NEXT: srli a1, a1, 22 -; RV64MV-NEXT: vslide1down.vx v8, v8, a1 -; RV64MV-NEXT: lui a1, %hi(.LCPI4_0) -; RV64MV-NEXT: addi a1, a1, %lo(.LCPI4_0) -; RV64MV-NEXT: vle16.v v9, (a1) -; RV64MV-NEXT: vslidedown.vi v8, v8, 1 -; RV64MV-NEXT: vid.v v10 -; RV64MV-NEXT: vsub.vv v8, v8, v10 -; RV64MV-NEXT: vmul.vv v8, v8, v9 -; RV64MV-NEXT: vadd.vv v9, v8, v8 -; RV64MV-NEXT: lui a1, 41121 -; RV64MV-NEXT: addi a1, a1, -1527 +; RV64MV-NEXT: vslide1down.vx v10, v10, a2 +; RV64MV-NEXT: li a2, 2047 ; RV64MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64MV-NEXT: vmv.s.x v10, a1 +; RV64MV-NEXT: vmv.v.i v11, 1 +; RV64MV-NEXT: srli a1, a1, 22 ; RV64MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64MV-NEXT: vsext.vf2 v11, v10 -; RV64MV-NEXT: vsll.vv v9, v9, v11 -; RV64MV-NEXT: li a1, 2047 -; RV64MV-NEXT: vand.vx v8, v8, a1 +; RV64MV-NEXT: vslide1down.vx v10, v10, a1 +; RV64MV-NEXT: lui a1, %hi(.LCPI4_1) +; RV64MV-NEXT: addi a1, a1, %lo(.LCPI4_1) +; RV64MV-NEXT: addi a3, a3, -1527 +; RV64MV-NEXT: vsext.vf2 v12, v11 +; RV64MV-NEXT: vslidedown.vi v10, v10, 1 +; RV64MV-NEXT: vsub.vv v8, v10, v8 ; RV64MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64MV-NEXT: vmv.v.i v10, 1 +; RV64MV-NEXT: vmv.s.x v10, a3 ; RV64MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64MV-NEXT: vsext.vf2 v11, v10 -; RV64MV-NEXT: lui a2, %hi(.LCPI4_1) -; RV64MV-NEXT: addi a2, a2, %lo(.LCPI4_1) -; RV64MV-NEXT: vle16.v v10, (a2) -; RV64MV-NEXT: vsrl.vv v8, v8, v11 +; RV64MV-NEXT: vmul.vv v8, v8, v9 +; RV64MV-NEXT: vadd.vv v9, v8, v8 +; RV64MV-NEXT: vsll.vv v9, v9, v11 +; RV64MV-NEXT: vle16.v v10, (a1) +; RV64MV-NEXT: vand.vx v8, v8, a2 +; RV64MV-NEXT: vsrl.vv v8, v8, v12 ; RV64MV-NEXT: vor.vv v8, v8, v9 -; RV64MV-NEXT: vand.vx v8, v8, a1 +; RV64MV-NEXT: vand.vx v8, v8, a2 ; RV64MV-NEXT: vmsltu.vv v0, v10, v8 ; RV64MV-NEXT: vmv.v.i v8, 0 ; RV64MV-NEXT: vmerge.vim v8, v8, -1, v0 ; RV64MV-NEXT: vmv.x.s a1, v8 -; RV64MV-NEXT: andi a1, a1, 2047 ; RV64MV-NEXT: vslidedown.vi v9, v8, 1 -; RV64MV-NEXT: vmv.x.s a2, v9 -; RV64MV-NEXT: andi a2, a2, 2047 -; 
RV64MV-NEXT: slli a2, a2, 11 ; RV64MV-NEXT: vslidedown.vi v8, v8, 2 +; RV64MV-NEXT: andi a1, a1, 2047 +; RV64MV-NEXT: vmv.x.s a2, v9 ; RV64MV-NEXT: vmv.x.s a3, v8 +; RV64MV-NEXT: andi a2, a2, 2047 ; RV64MV-NEXT: slli a3, a3, 22 +; RV64MV-NEXT: slli a2, a2, 11 ; RV64MV-NEXT: or a1, a1, a3 ; RV64MV-NEXT: or a1, a1, a2 ; RV64MV-NEXT: slli a2, a1, 31 diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index 01f06474f78c2..988856ca70923 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -59,30 +59,30 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind { ; RV32IM-NEXT: lhu a4, 8(a1) ; RV32IM-NEXT: lhu a1, 12(a1) ; RV32IM-NEXT: lui a5, 8456 +; RV32IM-NEXT: lui a6, 11038 +; RV32IM-NEXT: li a7, 95 +; RV32IM-NEXT: lui t0, 10700 +; RV32IM-NEXT: li t1, 98 +; RV32IM-NEXT: addi a6, a6, -1465 +; RV32IM-NEXT: mulhu a6, a2, a6 +; RV32IM-NEXT: mul a6, a6, a7 +; RV32IM-NEXT: lui a7, 1045 +; RV32IM-NEXT: addi t0, t0, -1003 +; RV32IM-NEXT: mulhu t0, a4, t0 +; RV32IM-NEXT: mul t0, t0, t1 +; RV32IM-NEXT: li t1, 1003 ; RV32IM-NEXT: addi a5, a5, 1058 +; RV32IM-NEXT: addi a7, a7, 1801 ; RV32IM-NEXT: mulhu a5, a3, a5 -; RV32IM-NEXT: slli a6, a5, 7 +; RV32IM-NEXT: mulhu a7, a1, a7 +; RV32IM-NEXT: mul a7, a7, t1 +; RV32IM-NEXT: slli t1, a5, 7 ; RV32IM-NEXT: slli a5, a5, 2 -; RV32IM-NEXT: sub a5, a5, a6 +; RV32IM-NEXT: sub a5, a5, t1 +; RV32IM-NEXT: sub a2, a2, a6 +; RV32IM-NEXT: sub a4, a4, t0 +; RV32IM-NEXT: sub a1, a1, a7 ; RV32IM-NEXT: add a3, a3, a5 -; RV32IM-NEXT: lui a5, 11038 -; RV32IM-NEXT: addi a5, a5, -1465 -; RV32IM-NEXT: mulhu a5, a2, a5 -; RV32IM-NEXT: li a6, 95 -; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a2, a2, a5 -; RV32IM-NEXT: lui a5, 10700 -; RV32IM-NEXT: addi a5, a5, -1003 -; RV32IM-NEXT: mulhu a5, a4, a5 -; RV32IM-NEXT: li a6, 98 -; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a4, a4, a5 -; RV32IM-NEXT: lui a5, 1045 -; RV32IM-NEXT: addi a5, a5, 1801 -; RV32IM-NEXT: mulhu a5, a1, a5 -; RV32IM-NEXT: li a6, 1003 -; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a1, a1, a5 ; RV32IM-NEXT: sh a2, 0(a0) ; RV32IM-NEXT: sh a3, 2(a0) ; RV32IM-NEXT: sh a4, 4(a0) @@ -133,38 +133,38 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_urem_vec_1: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lui a2, %hi(.LCPI0_0) -; RV64IM-NEXT: ld a2, %lo(.LCPI0_0)(a2) +; RV64IM-NEXT: lhu a2, 0(a1) ; RV64IM-NEXT: lhu a3, 8(a1) -; RV64IM-NEXT: lhu a4, 0(a1) -; RV64IM-NEXT: lhu a5, 16(a1) +; RV64IM-NEXT: lhu a4, 16(a1) ; RV64IM-NEXT: lhu a1, 24(a1) -; RV64IM-NEXT: mulhu a2, a3, a2 -; RV64IM-NEXT: slli a6, a2, 7 -; RV64IM-NEXT: lui a7, %hi(.LCPI0_1) -; RV64IM-NEXT: ld a7, %lo(.LCPI0_1)(a7) -; RV64IM-NEXT: slli a2, a2, 2 -; RV64IM-NEXT: subw a2, a2, a6 -; RV64IM-NEXT: add a2, a3, a2 -; RV64IM-NEXT: mulhu a3, a4, a7 -; RV64IM-NEXT: lui a6, %hi(.LCPI0_2) -; RV64IM-NEXT: ld a6, %lo(.LCPI0_2)(a6) +; RV64IM-NEXT: lui a5, %hi(.LCPI0_0) +; RV64IM-NEXT: lui a6, %hi(.LCPI0_1) ; RV64IM-NEXT: li a7, 95 -; RV64IM-NEXT: mul a3, a3, a7 -; RV64IM-NEXT: subw a4, a4, a3 -; RV64IM-NEXT: mulhu a3, a5, a6 -; RV64IM-NEXT: lui a6, %hi(.LCPI0_3) -; RV64IM-NEXT: ld a6, %lo(.LCPI0_3)(a6) -; RV64IM-NEXT: li a7, 98 -; RV64IM-NEXT: mul a3, a3, a7 -; RV64IM-NEXT: subw a5, a5, a3 -; RV64IM-NEXT: mulhu a3, a1, a6 -; RV64IM-NEXT: li a6, 1003 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: subw a1, a1, a3 -; RV64IM-NEXT: sh a4, 0(a0) -; RV64IM-NEXT: sh a2, 2(a0) -; RV64IM-NEXT: sh a5, 4(a0) +; RV64IM-NEXT: ld a6, 
%lo(.LCPI0_1)(a6) +; RV64IM-NEXT: lui t0, %hi(.LCPI0_2) +; RV64IM-NEXT: li t1, 98 +; RV64IM-NEXT: ld t0, %lo(.LCPI0_2)(t0) +; RV64IM-NEXT: mulhu a6, a2, a6 +; RV64IM-NEXT: mul a6, a6, a7 +; RV64IM-NEXT: lui a7, %hi(.LCPI0_3) +; RV64IM-NEXT: ld a5, %lo(.LCPI0_0)(a5) +; RV64IM-NEXT: ld a7, %lo(.LCPI0_3)(a7) +; RV64IM-NEXT: mulhu t0, a4, t0 +; RV64IM-NEXT: mul t0, t0, t1 +; RV64IM-NEXT: li t1, 1003 +; RV64IM-NEXT: mulhu a5, a3, a5 +; RV64IM-NEXT: mulhu a7, a1, a7 +; RV64IM-NEXT: mul a7, a7, t1 +; RV64IM-NEXT: slli t1, a5, 7 +; RV64IM-NEXT: slli a5, a5, 2 +; RV64IM-NEXT: subw a5, a5, t1 +; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: subw a4, a4, t0 +; RV64IM-NEXT: subw a1, a1, a7 +; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a4, 4(a0) ; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, @@ -221,19 +221,19 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; RV32IM-NEXT: lhu a4, 8(a1) ; RV32IM-NEXT: lhu a1, 12(a1) ; RV32IM-NEXT: lui a5, 11038 +; RV32IM-NEXT: li a6, 95 ; RV32IM-NEXT: addi a5, a5, -1465 -; RV32IM-NEXT: mulhu a6, a2, a5 -; RV32IM-NEXT: li a7, 95 -; RV32IM-NEXT: mul a6, a6, a7 -; RV32IM-NEXT: sub a2, a2, a6 -; RV32IM-NEXT: mulhu a6, a3, a5 -; RV32IM-NEXT: mul a6, a6, a7 -; RV32IM-NEXT: sub a3, a3, a6 -; RV32IM-NEXT: mulhu a6, a4, a5 -; RV32IM-NEXT: mul a6, a6, a7 -; RV32IM-NEXT: sub a4, a4, a6 +; RV32IM-NEXT: mulhu a7, a2, a5 +; RV32IM-NEXT: mulhu t0, a3, a5 +; RV32IM-NEXT: mulhu t1, a4, a5 ; RV32IM-NEXT: mulhu a5, a1, a5 -; RV32IM-NEXT: mul a5, a5, a7 +; RV32IM-NEXT: mul a7, a7, a6 +; RV32IM-NEXT: mul t0, t0, a6 +; RV32IM-NEXT: mul t1, t1, a6 +; RV32IM-NEXT: mul a5, a5, a6 +; RV32IM-NEXT: sub a2, a2, a7 +; RV32IM-NEXT: sub a3, a3, t0 +; RV32IM-NEXT: sub a4, a4, t1 ; RV32IM-NEXT: sub a1, a1, a5 ; RV32IM-NEXT: sh a2, 0(a0) ; RV32IM-NEXT: sh a3, 2(a0) @@ -291,18 +291,18 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; RV64IM-NEXT: lhu a4, 8(a1) ; RV64IM-NEXT: lhu a5, 16(a1) ; RV64IM-NEXT: lhu a1, 24(a1) -; RV64IM-NEXT: mulhu a6, a3, a2 -; RV64IM-NEXT: li a7, 95 -; RV64IM-NEXT: mul a6, a6, a7 -; RV64IM-NEXT: subw a3, a3, a6 -; RV64IM-NEXT: mulhu a6, a4, a2 -; RV64IM-NEXT: mul a6, a6, a7 -; RV64IM-NEXT: subw a4, a4, a6 -; RV64IM-NEXT: mulhu a6, a5, a2 -; RV64IM-NEXT: mul a6, a6, a7 -; RV64IM-NEXT: subw a5, a5, a6 +; RV64IM-NEXT: li a6, 95 +; RV64IM-NEXT: mulhu a7, a3, a2 +; RV64IM-NEXT: mulhu t0, a4, a2 +; RV64IM-NEXT: mulhu t1, a5, a2 ; RV64IM-NEXT: mulhu a2, a1, a2 -; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: mul a7, a7, a6 +; RV64IM-NEXT: mul t0, t0, a6 +; RV64IM-NEXT: mul t1, t1, a6 +; RV64IM-NEXT: mul a2, a2, a6 +; RV64IM-NEXT: subw a3, a3, a7 +; RV64IM-NEXT: subw a4, a4, t0 +; RV64IM-NEXT: subw a5, a5, t1 ; RV64IM-NEXT: subw a1, a1, a2 ; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: sh a4, 2(a0) @@ -388,33 +388,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: combine_urem_udiv: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a2, 12(a1) -; RV32IM-NEXT: lhu a3, 0(a1) -; RV32IM-NEXT: lhu a4, 4(a1) -; RV32IM-NEXT: lhu a1, 8(a1) +; RV32IM-NEXT: lhu a2, 0(a1) +; RV32IM-NEXT: lhu a3, 4(a1) +; RV32IM-NEXT: lhu a4, 8(a1) +; RV32IM-NEXT: lhu a1, 12(a1) ; RV32IM-NEXT: lui a5, 11038 +; RV32IM-NEXT: li a6, 95 ; RV32IM-NEXT: addi a5, a5, -1465 -; RV32IM-NEXT: mulhu a6, a2, a5 -; RV32IM-NEXT: li a7, 95 -; RV32IM-NEXT: mul t0, a6, a7 -; RV32IM-NEXT: mulhu t1, a1, a5 -; RV32IM-NEXT: mul t2, t1, a7 -; RV32IM-NEXT: mulhu t3, a4, a5 -; RV32IM-NEXT: mul t4, t3, a7 -; 
RV32IM-NEXT: mulhu a5, a3, a5 -; RV32IM-NEXT: mul a7, a5, a7 -; RV32IM-NEXT: add a3, a3, a5 -; RV32IM-NEXT: sub a3, a3, a7 -; RV32IM-NEXT: add a4, a4, t3 -; RV32IM-NEXT: sub a4, a4, t4 -; RV32IM-NEXT: add a1, a1, t1 +; RV32IM-NEXT: mulhu a7, a1, a5 +; RV32IM-NEXT: mulhu t0, a4, a5 +; RV32IM-NEXT: mulhu t1, a3, a5 +; RV32IM-NEXT: mulhu a5, a2, a5 +; RV32IM-NEXT: mul t2, a7, a6 +; RV32IM-NEXT: mul t3, t0, a6 +; RV32IM-NEXT: mul t4, t1, a6 +; RV32IM-NEXT: mul a6, a5, a6 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: add a3, a3, t1 +; RV32IM-NEXT: add a4, a4, t0 +; RV32IM-NEXT: add a1, a1, a7 +; RV32IM-NEXT: sub a2, a2, a6 +; RV32IM-NEXT: sub a3, a3, t4 +; RV32IM-NEXT: sub a4, a4, t3 ; RV32IM-NEXT: sub a1, a1, t2 -; RV32IM-NEXT: add a2, a2, a6 -; RV32IM-NEXT: sub a2, a2, t0 -; RV32IM-NEXT: sh a3, 0(a0) -; RV32IM-NEXT: sh a4, 2(a0) -; RV32IM-NEXT: sh a1, 4(a0) -; RV32IM-NEXT: sh a2, 6(a0) +; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a4, 4(a0) +; RV32IM-NEXT: sh a1, 6(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: combine_urem_udiv: @@ -489,33 +489,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: combine_urem_udiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a2, 24(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI2_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI2_0)(a3) -; RV64IM-NEXT: lhu a4, 0(a1) -; RV64IM-NEXT: lhu a5, 8(a1) -; RV64IM-NEXT: lhu a1, 16(a1) -; RV64IM-NEXT: mulhu a6, a2, a3 -; RV64IM-NEXT: li a7, 95 -; RV64IM-NEXT: mul t0, a6, a7 -; RV64IM-NEXT: mulhu t1, a1, a3 -; RV64IM-NEXT: mul t2, t1, a7 -; RV64IM-NEXT: mulhu t3, a5, a3 -; RV64IM-NEXT: mul t4, t3, a7 -; RV64IM-NEXT: mulhu a3, a4, a3 -; RV64IM-NEXT: mul a7, a3, a7 -; RV64IM-NEXT: add a3, a4, a3 -; RV64IM-NEXT: subw a3, a3, a7 -; RV64IM-NEXT: add a5, a5, t3 -; RV64IM-NEXT: subw a4, a5, t4 +; RV64IM-NEXT: lhu a2, 16(a1) +; RV64IM-NEXT: lhu a3, 24(a1) +; RV64IM-NEXT: lui a4, %hi(.LCPI2_0) +; RV64IM-NEXT: ld a4, %lo(.LCPI2_0)(a4) +; RV64IM-NEXT: lhu a5, 0(a1) +; RV64IM-NEXT: lhu a1, 8(a1) +; RV64IM-NEXT: li a6, 95 +; RV64IM-NEXT: mulhu a7, a3, a4 +; RV64IM-NEXT: mulhu t0, a2, a4 +; RV64IM-NEXT: mulhu t1, a1, a4 +; RV64IM-NEXT: mulhu a4, a5, a4 +; RV64IM-NEXT: mul t2, a7, a6 +; RV64IM-NEXT: mul t3, t0, a6 +; RV64IM-NEXT: mul t4, t1, a6 +; RV64IM-NEXT: mul a6, a4, a6 +; RV64IM-NEXT: add a4, a5, a4 ; RV64IM-NEXT: add a1, a1, t1 -; RV64IM-NEXT: subw a1, a1, t2 -; RV64IM-NEXT: add a2, a2, a6 -; RV64IM-NEXT: subw a2, a2, t0 -; RV64IM-NEXT: sh a3, 0(a0) -; RV64IM-NEXT: sh a4, 2(a0) -; RV64IM-NEXT: sh a1, 4(a0) -; RV64IM-NEXT: sh a2, 6(a0) +; RV64IM-NEXT: add a2, a2, t0 +; RV64IM-NEXT: add a3, a3, a7 +; RV64IM-NEXT: subw a4, a4, a6 +; RV64IM-NEXT: subw a1, a1, t4 +; RV64IM-NEXT: subw a2, a2, t3 +; RV64IM-NEXT: subw a3, a3, t2 +; RV64IM-NEXT: sh a4, 0(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a3, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, @@ -558,23 +558,23 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: dont_fold_urem_power_of_two: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a2, 4(a1) -; RV32IM-NEXT: lhu a3, 8(a1) -; RV32IM-NEXT: lhu a4, 12(a1) -; RV32IM-NEXT: lhu a1, 0(a1) +; RV32IM-NEXT: lhu a2, 0(a1) +; RV32IM-NEXT: lhu a3, 4(a1) +; RV32IM-NEXT: lhu a4, 8(a1) +; RV32IM-NEXT: lhu a1, 12(a1) ; RV32IM-NEXT: lui a5, 11038 -; RV32IM-NEXT: addi a5, a5, -1465 -; RV32IM-NEXT: mulhu a5, a4, a5 ; RV32IM-NEXT: li a6, 95 +; RV32IM-NEXT: addi a5, a5, -1465 +; RV32IM-NEXT: mulhu a5, a1, a5 +; RV32IM-NEXT: 
andi a2, a2, 63 +; RV32IM-NEXT: andi a3, a3, 31 +; RV32IM-NEXT: andi a4, a4, 7 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a4, a4, a5 -; RV32IM-NEXT: andi a1, a1, 63 -; RV32IM-NEXT: andi a2, a2, 31 -; RV32IM-NEXT: andi a3, a3, 7 -; RV32IM-NEXT: sh a1, 0(a0) -; RV32IM-NEXT: sh a2, 2(a0) -; RV32IM-NEXT: sh a3, 4(a0) -; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sub a1, a1, a5 +; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a4, 4(a0) +; RV32IM-NEXT: sh a1, 6(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_urem_power_of_two: @@ -610,23 +610,23 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_urem_power_of_two: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a2, 24(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI3_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI3_0)(a3) +; RV64IM-NEXT: lhu a2, 0(a1) +; RV64IM-NEXT: lhu a3, 8(a1) ; RV64IM-NEXT: lhu a4, 16(a1) -; RV64IM-NEXT: lhu a5, 8(a1) -; RV64IM-NEXT: lhu a1, 0(a1) -; RV64IM-NEXT: mulhu a3, a2, a3 +; RV64IM-NEXT: lhu a1, 24(a1) +; RV64IM-NEXT: lui a5, %hi(.LCPI3_0) ; RV64IM-NEXT: li a6, 95 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: subw a2, a2, a3 -; RV64IM-NEXT: andi a1, a1, 63 -; RV64IM-NEXT: andi a5, a5, 31 +; RV64IM-NEXT: ld a5, %lo(.LCPI3_0)(a5) +; RV64IM-NEXT: andi a2, a2, 63 +; RV64IM-NEXT: andi a3, a3, 31 ; RV64IM-NEXT: andi a4, a4, 7 -; RV64IM-NEXT: sh a1, 0(a0) -; RV64IM-NEXT: sh a5, 2(a0) +; RV64IM-NEXT: mulhu a5, a1, a5 +; RV64IM-NEXT: mul a5, a5, a6 +; RV64IM-NEXT: subw a1, a1, a5 +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a3, 2(a0) ; RV64IM-NEXT: sh a4, 4(a0) -; RV64IM-NEXT: sh a2, 6(a0) +; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -676,24 +676,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind { ; RV32IM-NEXT: lhu a3, 8(a1) ; RV32IM-NEXT: lhu a1, 12(a1) ; RV32IM-NEXT: lui a4, 1603 +; RV32IM-NEXT: li a5, 654 +; RV32IM-NEXT: lui a6, 45590 +; RV32IM-NEXT: li a7, 23 ; RV32IM-NEXT: addi a4, a4, 1341 ; RV32IM-NEXT: mulhu a4, a2, a4 -; RV32IM-NEXT: li a5, 654 ; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: lui a5, 193 +; RV32IM-NEXT: addi a6, a6, 1069 +; RV32IM-NEXT: mulhu a6, a3, a6 +; RV32IM-NEXT: mul a6, a6, a7 +; RV32IM-NEXT: lui a7, 1 +; RV32IM-NEXT: addi a5, a5, 1464 +; RV32IM-NEXT: addi a7, a7, 1327 +; RV32IM-NEXT: mulhu a5, a1, a5 +; RV32IM-NEXT: mul a5, a5, a7 ; RV32IM-NEXT: sub a2, a2, a4 -; RV32IM-NEXT: lui a4, 45590 -; RV32IM-NEXT: addi a4, a4, 1069 -; RV32IM-NEXT: mulhu a4, a3, a4 -; RV32IM-NEXT: li a5, 23 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: lui a4, 193 -; RV32IM-NEXT: addi a4, a4, 1464 -; RV32IM-NEXT: mulhu a4, a1, a4 -; RV32IM-NEXT: lui a5, 1 -; RV32IM-NEXT: addi a5, a5, 1327 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: sub a3, a3, a6 +; RV32IM-NEXT: sub a1, a1, a5 ; RV32IM-NEXT: sh zero, 0(a0) ; RV32IM-NEXT: sh a2, 2(a0) ; RV32IM-NEXT: sh a3, 4(a0) @@ -738,31 +738,31 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_urem_one: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lui a2, %hi(.LCPI4_0) -; RV64IM-NEXT: ld a2, %lo(.LCPI4_0)(a2) -; RV64IM-NEXT: lhu a3, 8(a1) -; RV64IM-NEXT: lhu a4, 16(a1) +; RV64IM-NEXT: lhu a2, 8(a1) +; RV64IM-NEXT: lhu a3, 16(a1) ; RV64IM-NEXT: lhu a1, 24(a1) -; RV64IM-NEXT: mulhu a2, a3, a2 -; RV64IM-NEXT: lui a5, %hi(.LCPI4_1) -; RV64IM-NEXT: ld a5, %lo(.LCPI4_1)(a5) -; RV64IM-NEXT: li a6, 654 -; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: subw a3, a3, a2 -; 
RV64IM-NEXT: mulhu a2, a4, a5 +; RV64IM-NEXT: lui a4, %hi(.LCPI4_0) +; RV64IM-NEXT: li a5, 654 +; RV64IM-NEXT: ld a4, %lo(.LCPI4_0)(a4) +; RV64IM-NEXT: lui a6, %hi(.LCPI4_1) +; RV64IM-NEXT: li a7, 23 +; RV64IM-NEXT: ld a6, %lo(.LCPI4_1)(a6) +; RV64IM-NEXT: mulhu a4, a2, a4 +; RV64IM-NEXT: mul a4, a4, a5 ; RV64IM-NEXT: lui a5, %hi(.LCPI4_2) ; RV64IM-NEXT: ld a5, %lo(.LCPI4_2)(a5) -; RV64IM-NEXT: li a6, 23 -; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: subw a4, a4, a2 -; RV64IM-NEXT: mulhu a2, a1, a5 -; RV64IM-NEXT: lui a5, 1 -; RV64IM-NEXT: addi a5, a5, 1327 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: mulhu a6, a3, a6 +; RV64IM-NEXT: mul a6, a6, a7 +; RV64IM-NEXT: lui a7, 1 +; RV64IM-NEXT: addi a7, a7, 1327 +; RV64IM-NEXT: mulhu a5, a1, a5 +; RV64IM-NEXT: mul a5, a5, a7 +; RV64IM-NEXT: subw a2, a2, a4 +; RV64IM-NEXT: subw a3, a3, a6 +; RV64IM-NEXT: subw a1, a1, a5 ; RV64IM-NEXT: sh zero, 0(a0) -; RV64IM-NEXT: sh a3, 2(a0) -; RV64IM-NEXT: sh a4, 4(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a3, 4(a0) ; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, @@ -958,37 +958,37 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_urem_i64: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lui a2, %hi(.LCPI6_0) -; RV64IM-NEXT: ld a2, %lo(.LCPI6_0)(a2) +; RV64IM-NEXT: ld a2, 8(a1) ; RV64IM-NEXT: ld a3, 16(a1) -; RV64IM-NEXT: ld a4, 8(a1) ; RV64IM-NEXT: ld a1, 24(a1) -; RV64IM-NEXT: mulhu a2, a3, a2 -; RV64IM-NEXT: sub a5, a3, a2 -; RV64IM-NEXT: srli a5, a5, 1 -; RV64IM-NEXT: add a2, a5, a2 -; RV64IM-NEXT: srli a2, a2, 4 -; RV64IM-NEXT: li a5, 23 -; RV64IM-NEXT: lui a6, %hi(.LCPI6_1) -; RV64IM-NEXT: ld a6, %lo(.LCPI6_1)(a6) -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a3, a3, a2 -; RV64IM-NEXT: srli a2, a4, 1 -; RV64IM-NEXT: mulhu a2, a2, a6 -; RV64IM-NEXT: srli a2, a2, 7 -; RV64IM-NEXT: lui a5, %hi(.LCPI6_2) -; RV64IM-NEXT: ld a5, %lo(.LCPI6_2)(a5) +; RV64IM-NEXT: lui a4, %hi(.LCPI6_1) +; RV64IM-NEXT: ld a4, %lo(.LCPI6_1)(a4) +; RV64IM-NEXT: lui a5, %hi(.LCPI6_0) ; RV64IM-NEXT: li a6, 654 -; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: sub a4, a4, a2 -; RV64IM-NEXT: mulhu a2, a1, a5 -; RV64IM-NEXT: srli a2, a2, 12 -; RV64IM-NEXT: lui a5, 1 -; RV64IM-NEXT: addiw a5, a5, 1327 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a1, a1, a2 +; RV64IM-NEXT: srli a7, a2, 1 +; RV64IM-NEXT: mulhu a4, a7, a4 +; RV64IM-NEXT: lui a7, %hi(.LCPI6_2) +; RV64IM-NEXT: ld a5, %lo(.LCPI6_0)(a5) +; RV64IM-NEXT: ld a7, %lo(.LCPI6_2)(a7) +; RV64IM-NEXT: srli a4, a4, 7 +; RV64IM-NEXT: mul a4, a4, a6 +; RV64IM-NEXT: lui a6, 1 +; RV64IM-NEXT: addiw a6, a6, 1327 +; RV64IM-NEXT: mulhu a5, a3, a5 +; RV64IM-NEXT: mulhu a7, a1, a7 +; RV64IM-NEXT: srli a7, a7, 12 +; RV64IM-NEXT: mul a6, a7, a6 +; RV64IM-NEXT: sub a7, a3, a5 +; RV64IM-NEXT: srli a7, a7, 1 +; RV64IM-NEXT: add a5, a7, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sub a1, a1, a6 +; RV64IM-NEXT: li a4, 23 +; RV64IM-NEXT: srli a5, a5, 4 +; RV64IM-NEXT: mul a4, a5, a4 +; RV64IM-NEXT: sub a3, a3, a4 ; RV64IM-NEXT: sd zero, 0(a0) -; RV64IM-NEXT: sd a4, 8(a0) +; RV64IM-NEXT: sd a2, 8(a0) ; RV64IM-NEXT: sd a3, 16(a0) ; RV64IM-NEXT: sd a1, 24(a0) ; RV64IM-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll index c76a53468f768..b09ff9805eb97 100644 --- a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll @@ -108,9 +108,9 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; RV32I-LABEL: 
func16: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a3, 16 +; RV32I-NEXT: mul a1, a1, a2 ; RV32I-NEXT: addi a3, a3, -1 ; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: mul a1, a1, a2 ; RV32I-NEXT: and a1, a1, a3 ; RV32I-NEXT: sub a1, a0, a1 ; RV32I-NEXT: sltu a0, a0, a1 @@ -121,9 +121,9 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; RV64I-LABEL: func16: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a3, 16 +; RV64I-NEXT: mul a1, a1, a2 ; RV64I-NEXT: addiw a3, a3, -1 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: mul a1, a1, a2 ; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: sub a1, a0, a1 ; RV64I-NEXT: sltu a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/vararg-ilp32e.ll b/llvm/test/CodeGen/RISCV/vararg-ilp32e.ll index 281c19dc09712..91999444fa766 100644 --- a/llvm/test/CodeGen/RISCV/vararg-ilp32e.ll +++ b/llvm/test/CodeGen/RISCV/vararg-ilp32e.ll @@ -78,9 +78,9 @@ define void @va_double(i32 %n, ...) { ; ILP32E-NEXT: sw a3, 20(sp) ; ILP32E-NEXT: sw a4, 24(sp) ; ILP32E-NEXT: addi a0, sp, 12 +; ILP32E-NEXT: addi a1, sp, 19 ; ILP32E-NEXT: sw a0, 0(sp) -; ILP32E-NEXT: addi a0, sp, 19 -; ILP32E-NEXT: andi a1, a0, -8 +; ILP32E-NEXT: andi a1, a1, -8 ; ILP32E-NEXT: addi a0, a1, 8 ; ILP32E-NEXT: sw a0, 0(sp) ; ILP32E-NEXT: lw a0, 0(a1) @@ -116,9 +116,9 @@ define void @va_double(i32 %n, ...) { ; ILP32E-WITHFP-NEXT: sw a3, 12(s0) ; ILP32E-WITHFP-NEXT: sw a4, 16(s0) ; ILP32E-WITHFP-NEXT: addi a0, s0, 4 +; ILP32E-WITHFP-NEXT: addi a1, s0, 11 ; ILP32E-WITHFP-NEXT: sw a0, -12(s0) -; ILP32E-WITHFP-NEXT: addi a0, s0, 11 -; ILP32E-WITHFP-NEXT: andi a1, a0, -8 +; ILP32E-WITHFP-NEXT: andi a1, a1, -8 ; ILP32E-WITHFP-NEXT: addi a0, a1, 8 ; ILP32E-WITHFP-NEXT: sw a0, -12(s0) ; ILP32E-WITHFP-NEXT: lw a0, 0(a1) diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll index 318b6973b724e..895d84b38be32 100644 --- a/llvm/test/CodeGen/RISCV/vararg.ll +++ b/llvm/test/CodeGen/RISCV/vararg.ll @@ -841,11 +841,11 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ILP32-ILP32F-FPELIM-NEXT: sw a3, 28(sp) ; ILP32-ILP32F-FPELIM-NEXT: sw a4, 32(sp) ; ILP32-ILP32F-FPELIM-NEXT: addi a0, sp, 20 +; ILP32-ILP32F-FPELIM-NEXT: addi a1, sp, 27 +; ILP32-ILP32F-FPELIM-NEXT: addi a2, sp, 35 ; ILP32-ILP32F-FPELIM-NEXT: sw a0, 12(sp) -; ILP32-ILP32F-FPELIM-NEXT: addi a0, sp, 27 -; ILP32-ILP32F-FPELIM-NEXT: andi a1, a0, -8 -; ILP32-ILP32F-FPELIM-NEXT: addi a0, sp, 35 -; ILP32-ILP32F-FPELIM-NEXT: sw a0, 12(sp) +; ILP32-ILP32F-FPELIM-NEXT: andi a1, a1, -8 +; ILP32-ILP32F-FPELIM-NEXT: sw a2, 12(sp) ; ILP32-ILP32F-FPELIM-NEXT: lw a0, 0(a1) ; ILP32-ILP32F-FPELIM-NEXT: lw a1, 4(a1) ; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, 48 @@ -865,11 +865,11 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ILP32-ILP32F-WITHFP-NEXT: sw a3, 12(s0) ; ILP32-ILP32F-WITHFP-NEXT: sw a4, 16(s0) ; ILP32-ILP32F-WITHFP-NEXT: addi a0, s0, 4 +; ILP32-ILP32F-WITHFP-NEXT: addi a1, s0, 11 +; ILP32-ILP32F-WITHFP-NEXT: addi a2, s0, 19 ; ILP32-ILP32F-WITHFP-NEXT: sw a0, -12(s0) -; ILP32-ILP32F-WITHFP-NEXT: addi a0, s0, 11 -; ILP32-ILP32F-WITHFP-NEXT: andi a1, a0, -8 -; ILP32-ILP32F-WITHFP-NEXT: addi a0, s0, 19 -; ILP32-ILP32F-WITHFP-NEXT: sw a0, -12(s0) +; ILP32-ILP32F-WITHFP-NEXT: andi a1, a1, -8 +; ILP32-ILP32F-WITHFP-NEXT: sw a2, -12(s0) ; ILP32-ILP32F-WITHFP-NEXT: lw a0, 0(a1) ; ILP32-ILP32F-WITHFP-NEXT: lw a1, 4(a1) ; ILP32-ILP32F-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -888,11 +888,11 @@ define i64 @va2(ptr %fmt, ...) 
nounwind { ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 28(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a4, 32(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, sp, 20 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a1, sp, 27 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a2, sp, 35 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a0, 12(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, sp, 27 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a1, a0, -8 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, sp, 35 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a0, 12(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a1, a1, -8 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a2, 12(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 0(a1) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a1, 4(a1) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, 48 @@ -907,11 +907,11 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ILP32E-FPELIM-NEXT: sw a3, 16(sp) ; ILP32E-FPELIM-NEXT: sw a4, 20(sp) ; ILP32E-FPELIM-NEXT: addi a0, sp, 8 +; ILP32E-FPELIM-NEXT: addi a1, sp, 15 +; ILP32E-FPELIM-NEXT: addi a2, sp, 23 ; ILP32E-FPELIM-NEXT: sw a0, 0(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 15 -; ILP32E-FPELIM-NEXT: andi a1, a0, -8 -; ILP32E-FPELIM-NEXT: addi a0, sp, 23 -; ILP32E-FPELIM-NEXT: sw a0, 0(sp) +; ILP32E-FPELIM-NEXT: andi a1, a1, -8 +; ILP32E-FPELIM-NEXT: sw a2, 0(sp) ; ILP32E-FPELIM-NEXT: lw a0, 0(a1) ; ILP32E-FPELIM-NEXT: lw a1, 4(a1) ; ILP32E-FPELIM-NEXT: addi sp, sp, 28 @@ -929,11 +929,11 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ILP32E-WITHFP-NEXT: sw a3, 12(s0) ; ILP32E-WITHFP-NEXT: sw a4, 16(s0) ; ILP32E-WITHFP-NEXT: addi a0, s0, 4 +; ILP32E-WITHFP-NEXT: addi a1, s0, 11 +; ILP32E-WITHFP-NEXT: addi a2, s0, 19 ; ILP32E-WITHFP-NEXT: sw a0, -12(s0) -; ILP32E-WITHFP-NEXT: addi a0, s0, 11 -; ILP32E-WITHFP-NEXT: andi a1, a0, -8 -; ILP32E-WITHFP-NEXT: addi a0, s0, 19 -; ILP32E-WITHFP-NEXT: sw a0, -12(s0) +; ILP32E-WITHFP-NEXT: andi a1, a1, -8 +; ILP32E-WITHFP-NEXT: sw a2, -12(s0) ; ILP32E-WITHFP-NEXT: lw a0, 0(a1) ; ILP32E-WITHFP-NEXT: lw a1, 4(a1) ; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload @@ -1040,9 +1040,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; ILP32-ILP32F-FPELIM-NEXT: addi a0, sp, 27 ; ILP32-ILP32F-FPELIM-NEXT: andi a1, a0, -8 ; ILP32-ILP32F-FPELIM-NEXT: addi a0, a1, 4 +; ILP32-ILP32F-FPELIM-NEXT: addi a2, a1, 8 ; ILP32-ILP32F-FPELIM-NEXT: sw a0, 12(sp) ; ILP32-ILP32F-FPELIM-NEXT: lw a0, 0(a1) -; ILP32-ILP32F-FPELIM-NEXT: addi a2, a1, 8 ; ILP32-ILP32F-FPELIM-NEXT: sw a2, 12(sp) ; ILP32-ILP32F-FPELIM-NEXT: lw a1, 4(a1) ; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, 48 @@ -1064,9 +1064,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; ILP32-ILP32F-WITHFP-NEXT: addi a0, s0, 11 ; ILP32-ILP32F-WITHFP-NEXT: andi a1, a0, -8 ; ILP32-ILP32F-WITHFP-NEXT: addi a0, a1, 4 +; ILP32-ILP32F-WITHFP-NEXT: addi a2, a1, 8 ; ILP32-ILP32F-WITHFP-NEXT: sw a0, -12(s0) ; ILP32-ILP32F-WITHFP-NEXT: lw a0, 0(a1) -; ILP32-ILP32F-WITHFP-NEXT: addi a2, a1, 8 ; ILP32-ILP32F-WITHFP-NEXT: sw a2, -12(s0) ; ILP32-ILP32F-WITHFP-NEXT: lw a1, 4(a1) ; ILP32-ILP32F-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -1106,9 +1106,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) 
nounwind { ; ILP32E-FPELIM-NEXT: addi a0, sp, 15 ; ILP32E-FPELIM-NEXT: andi a1, a0, -8 ; ILP32E-FPELIM-NEXT: addi a0, a1, 4 +; ILP32E-FPELIM-NEXT: addi a2, a1, 8 ; ILP32E-FPELIM-NEXT: sw a0, 0(sp) ; ILP32E-FPELIM-NEXT: lw a0, 0(a1) -; ILP32E-FPELIM-NEXT: addi a2, a1, 8 ; ILP32E-FPELIM-NEXT: sw a2, 0(sp) ; ILP32E-FPELIM-NEXT: lw a1, 4(a1) ; ILP32E-FPELIM-NEXT: addi sp, sp, 28 @@ -1128,9 +1128,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; ILP32E-WITHFP-NEXT: addi a0, s0, 11 ; ILP32E-WITHFP-NEXT: andi a1, a0, -8 ; ILP32E-WITHFP-NEXT: addi a0, a1, 4 +; ILP32E-WITHFP-NEXT: addi a2, a1, 8 ; ILP32E-WITHFP-NEXT: sw a0, -12(s0) ; ILP32E-WITHFP-NEXT: lw a0, 0(a1) -; ILP32E-WITHFP-NEXT: addi a2, a1, 8 ; ILP32E-WITHFP-NEXT: sw a2, -12(s0) ; ILP32E-WITHFP-NEXT: lw a1, 4(a1) ; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload @@ -1343,15 +1343,15 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ILP32-ILP32F-FPELIM-NEXT: sw a5, 20(sp) ; ILP32-ILP32F-FPELIM-NEXT: sw a6, 24(sp) ; ILP32-ILP32F-FPELIM-NEXT: addi a0, sp, 12 +; ILP32-ILP32F-FPELIM-NEXT: addi a3, sp, 19 +; ILP32-ILP32F-FPELIM-NEXT: addi a4, sp, 27 ; ILP32-ILP32F-FPELIM-NEXT: sw a0, 4(sp) -; ILP32-ILP32F-FPELIM-NEXT: addi a0, sp, 19 -; ILP32-ILP32F-FPELIM-NEXT: andi a0, a0, -8 -; ILP32-ILP32F-FPELIM-NEXT: addi a3, sp, 27 -; ILP32-ILP32F-FPELIM-NEXT: sw a3, 4(sp) -; ILP32-ILP32F-FPELIM-NEXT: lw a3, 4(a0) -; ILP32-ILP32F-FPELIM-NEXT: lw a0, 0(a0) -; ILP32-ILP32F-FPELIM-NEXT: add a2, a2, a3 -; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a0 +; ILP32-ILP32F-FPELIM-NEXT: andi a3, a3, -8 +; ILP32-ILP32F-FPELIM-NEXT: sw a4, 4(sp) +; ILP32-ILP32F-FPELIM-NEXT: lw a0, 4(a3) +; ILP32-ILP32F-FPELIM-NEXT: lw a3, 0(a3) +; ILP32-ILP32F-FPELIM-NEXT: add a2, a2, a0 +; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a3 ; ILP32-ILP32F-FPELIM-NEXT: sltu a1, a0, a1 ; ILP32-ILP32F-FPELIM-NEXT: add a1, a2, a1 ; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, 32 @@ -1369,15 +1369,15 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ILP32-ILP32F-WITHFP-NEXT: sw a5, 12(s0) ; ILP32-ILP32F-WITHFP-NEXT: sw a6, 16(s0) ; ILP32-ILP32F-WITHFP-NEXT: addi a0, s0, 4 +; ILP32-ILP32F-WITHFP-NEXT: addi a3, s0, 11 +; ILP32-ILP32F-WITHFP-NEXT: addi a4, s0, 19 ; ILP32-ILP32F-WITHFP-NEXT: sw a0, -12(s0) -; ILP32-ILP32F-WITHFP-NEXT: addi a0, s0, 11 -; ILP32-ILP32F-WITHFP-NEXT: andi a0, a0, -8 -; ILP32-ILP32F-WITHFP-NEXT: addi a3, s0, 19 -; ILP32-ILP32F-WITHFP-NEXT: sw a3, -12(s0) -; ILP32-ILP32F-WITHFP-NEXT: lw a3, 4(a0) -; ILP32-ILP32F-WITHFP-NEXT: lw a0, 0(a0) -; ILP32-ILP32F-WITHFP-NEXT: add a2, a2, a3 -; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a0 +; ILP32-ILP32F-WITHFP-NEXT: andi a3, a3, -8 +; ILP32-ILP32F-WITHFP-NEXT: sw a4, -12(s0) +; ILP32-ILP32F-WITHFP-NEXT: lw a0, 4(a3) +; ILP32-ILP32F-WITHFP-NEXT: lw a3, 0(a3) +; ILP32-ILP32F-WITHFP-NEXT: add a2, a2, a0 +; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a3 ; ILP32-ILP32F-WITHFP-NEXT: sltu a1, a0, a1 ; ILP32-ILP32F-WITHFP-NEXT: add a1, a2, a1 ; ILP32-ILP32F-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload @@ -1394,15 +1394,15 @@ define i64 @va3(i32 %a, i64 %b, ...) 
nounwind { ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a5, 20(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a6, 24(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, sp, 12 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a3, sp, 19 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a4, sp, 27 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a0, 4(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, sp, 19 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a0, a0, -8 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a3, sp, 27 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 4(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 4(a0) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 0(a0) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, a2, a3 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a0 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a3, a3, -8 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a4, 4(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 4(a3) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 0(a3) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, a2, a0 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a3 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sltu a1, a0, a1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a2, a1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, 32 @@ -1415,15 +1415,15 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ILP32E-FPELIM-NEXT: sw a4, 12(sp) ; ILP32E-FPELIM-NEXT: sw a5, 16(sp) ; ILP32E-FPELIM-NEXT: addi a0, sp, 8 +; ILP32E-FPELIM-NEXT: addi a3, sp, 15 +; ILP32E-FPELIM-NEXT: addi a4, sp, 23 ; ILP32E-FPELIM-NEXT: sw a0, 0(sp) -; ILP32E-FPELIM-NEXT: addi a0, sp, 15 -; ILP32E-FPELIM-NEXT: andi a0, a0, -8 -; ILP32E-FPELIM-NEXT: addi a3, sp, 23 -; ILP32E-FPELIM-NEXT: sw a3, 0(sp) -; ILP32E-FPELIM-NEXT: lw a3, 4(a0) -; ILP32E-FPELIM-NEXT: lw a0, 0(a0) -; ILP32E-FPELIM-NEXT: add a2, a2, a3 -; ILP32E-FPELIM-NEXT: add a0, a1, a0 +; ILP32E-FPELIM-NEXT: andi a3, a3, -8 +; ILP32E-FPELIM-NEXT: sw a4, 0(sp) +; ILP32E-FPELIM-NEXT: lw a0, 4(a3) +; ILP32E-FPELIM-NEXT: lw a3, 0(a3) +; ILP32E-FPELIM-NEXT: add a2, a2, a0 +; ILP32E-FPELIM-NEXT: add a0, a1, a3 ; ILP32E-FPELIM-NEXT: sltu a1, a0, a1 ; ILP32E-FPELIM-NEXT: add a1, a2, a1 ; ILP32E-FPELIM-NEXT: addi sp, sp, 20 @@ -1439,15 +1439,15 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ILP32E-WITHFP-NEXT: sw a4, 8(s0) ; ILP32E-WITHFP-NEXT: sw a5, 12(s0) ; ILP32E-WITHFP-NEXT: addi a0, s0, 4 +; ILP32E-WITHFP-NEXT: addi a3, s0, 11 +; ILP32E-WITHFP-NEXT: addi a4, s0, 19 ; ILP32E-WITHFP-NEXT: sw a0, -12(s0) -; ILP32E-WITHFP-NEXT: addi a0, s0, 11 -; ILP32E-WITHFP-NEXT: andi a0, a0, -8 -; ILP32E-WITHFP-NEXT: addi a3, s0, 19 -; ILP32E-WITHFP-NEXT: sw a3, -12(s0) -; ILP32E-WITHFP-NEXT: lw a3, 4(a0) -; ILP32E-WITHFP-NEXT: lw a0, 0(a0) -; ILP32E-WITHFP-NEXT: add a2, a2, a3 -; ILP32E-WITHFP-NEXT: add a0, a1, a0 +; ILP32E-WITHFP-NEXT: andi a3, a3, -8 +; ILP32E-WITHFP-NEXT: sw a4, -12(s0) +; ILP32E-WITHFP-NEXT: lw a0, 4(a3) +; ILP32E-WITHFP-NEXT: lw a3, 0(a3) +; ILP32E-WITHFP-NEXT: add a2, a2, a0 +; ILP32E-WITHFP-NEXT: add a0, a1, a3 ; ILP32E-WITHFP-NEXT: sltu a1, a0, a1 ; ILP32E-WITHFP-NEXT: add a1, a2, a1 ; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload @@ -1549,9 +1549,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) 
nounwind { ; ILP32-ILP32F-FPELIM-NEXT: addi a0, sp, 19 ; ILP32-ILP32F-FPELIM-NEXT: andi a0, a0, -8 ; ILP32-ILP32F-FPELIM-NEXT: addi a3, a0, 4 +; ILP32-ILP32F-FPELIM-NEXT: addi a4, a0, 8 ; ILP32-ILP32F-FPELIM-NEXT: sw a3, 4(sp) ; ILP32-ILP32F-FPELIM-NEXT: lw a3, 0(a0) -; ILP32-ILP32F-FPELIM-NEXT: addi a4, a0, 8 ; ILP32-ILP32F-FPELIM-NEXT: sw a4, 4(sp) ; ILP32-ILP32F-FPELIM-NEXT: lw a4, 4(a0) ; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a3 @@ -1575,9 +1575,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; ILP32-ILP32F-WITHFP-NEXT: addi a0, s0, 11 ; ILP32-ILP32F-WITHFP-NEXT: andi a0, a0, -8 ; ILP32-ILP32F-WITHFP-NEXT: addi a3, a0, 4 +; ILP32-ILP32F-WITHFP-NEXT: addi a4, a0, 8 ; ILP32-ILP32F-WITHFP-NEXT: sw a3, -12(s0) ; ILP32-ILP32F-WITHFP-NEXT: lw a3, 0(a0) -; ILP32-ILP32F-WITHFP-NEXT: addi a4, a0, 8 ; ILP32-ILP32F-WITHFP-NEXT: sw a4, -12(s0) ; ILP32-ILP32F-WITHFP-NEXT: lw a4, 4(a0) ; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a3 @@ -1621,9 +1621,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; ILP32E-FPELIM-NEXT: addi a0, sp, 15 ; ILP32E-FPELIM-NEXT: andi a0, a0, -8 ; ILP32E-FPELIM-NEXT: addi a3, a0, 4 +; ILP32E-FPELIM-NEXT: addi a4, a0, 8 ; ILP32E-FPELIM-NEXT: sw a3, 0(sp) ; ILP32E-FPELIM-NEXT: lw a3, 0(a0) -; ILP32E-FPELIM-NEXT: addi a4, a0, 8 ; ILP32E-FPELIM-NEXT: sw a4, 0(sp) ; ILP32E-FPELIM-NEXT: lw a4, 4(a0) ; ILP32E-FPELIM-NEXT: add a0, a1, a3 @@ -1645,9 +1645,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; ILP32E-WITHFP-NEXT: addi a0, s0, 11 ; ILP32E-WITHFP-NEXT: andi a0, a0, -8 ; ILP32E-WITHFP-NEXT: addi a3, a0, 4 +; ILP32E-WITHFP-NEXT: addi a4, a0, 8 ; ILP32E-WITHFP-NEXT: sw a3, -12(s0) ; ILP32E-WITHFP-NEXT: lw a3, 0(a0) -; ILP32E-WITHFP-NEXT: addi a4, a0, 8 ; ILP32E-WITHFP-NEXT: sw a4, -12(s0) ; ILP32E-WITHFP-NEXT: lw a4, 4(a0) ; ILP32E-WITHFP-NEXT: add a0, a1, a3 @@ -1815,8 +1815,8 @@ define void @va3_caller() nounwind { ; LP64-LP64F-LP64D-FPELIM-NEXT: addi sp, sp, -16 ; LP64-LP64F-LP64D-FPELIM-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; LP64-LP64F-LP64D-FPELIM-NEXT: li a2, 1 -; LP64-LP64F-LP64D-FPELIM-NEXT: slli a2, a2, 62 ; LP64-LP64F-LP64D-FPELIM-NEXT: li a0, 2 +; LP64-LP64F-LP64D-FPELIM-NEXT: slli a2, a2, 62 ; LP64-LP64F-LP64D-FPELIM-NEXT: li a1, 1111 ; LP64-LP64F-LP64D-FPELIM-NEXT: call va3 ; LP64-LP64F-LP64D-FPELIM-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -1830,8 +1830,8 @@ define void @va3_caller() nounwind { ; LP64-LP64F-LP64D-WITHFP-NEXT: sd s0, 0(sp) # 8-byte Folded Spill ; LP64-LP64F-LP64D-WITHFP-NEXT: addi s0, sp, 16 ; LP64-LP64F-LP64D-WITHFP-NEXT: li a2, 1 -; LP64-LP64F-LP64D-WITHFP-NEXT: slli a2, a2, 62 ; LP64-LP64F-LP64D-WITHFP-NEXT: li a0, 2 +; LP64-LP64F-LP64D-WITHFP-NEXT: slli a2, a2, 62 ; LP64-LP64F-LP64D-WITHFP-NEXT: li a1, 1111 ; LP64-LP64F-LP64D-WITHFP-NEXT: call va3 ; LP64-LP64F-LP64D-WITHFP-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -1844,8 +1844,8 @@ define void @va3_caller() nounwind { ; LP64E-FPELIM-NEXT: addi sp, sp, -8 ; LP64E-FPELIM-NEXT: sd ra, 0(sp) # 8-byte Folded Spill ; LP64E-FPELIM-NEXT: li a2, 1 -; LP64E-FPELIM-NEXT: slli a2, a2, 62 ; LP64E-FPELIM-NEXT: li a0, 2 +; LP64E-FPELIM-NEXT: slli a2, a2, 62 ; LP64E-FPELIM-NEXT: li a1, 1111 ; LP64E-FPELIM-NEXT: call va3 ; LP64E-FPELIM-NEXT: ld ra, 0(sp) # 8-byte Folded Reload @@ -1859,8 +1859,8 @@ define void @va3_caller() nounwind { ; LP64E-WITHFP-NEXT: sd s0, 0(sp) # 8-byte Folded Spill ; LP64E-WITHFP-NEXT: addi s0, sp, 16 ; LP64E-WITHFP-NEXT: li a2, 1 -; LP64E-WITHFP-NEXT: slli a2, a2, 62 ; LP64E-WITHFP-NEXT: li a0, 2 +; LP64E-WITHFP-NEXT: slli a2, a2, 62 ; 
LP64E-WITHFP-NEXT: li a1, 1111 ; LP64E-WITHFP-NEXT: call va3 ; LP64E-WITHFP-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -1895,21 +1895,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind { ; ILP32-ILP32F-FPELIM-NEXT: addi a0, a0, 3 ; ILP32-ILP32F-FPELIM-NEXT: andi a0, a0, -4 ; ILP32-ILP32F-FPELIM-NEXT: addi a1, a0, 4 +; ILP32-ILP32F-FPELIM-NEXT: addi a2, a0, 7 ; ILP32-ILP32F-FPELIM-NEXT: sw a1, 4(sp) -; ILP32-ILP32F-FPELIM-NEXT: lw a1, 0(a0) -; ILP32-ILP32F-FPELIM-NEXT: addi a0, a0, 7 -; ILP32-ILP32F-FPELIM-NEXT: andi a0, a0, -4 -; ILP32-ILP32F-FPELIM-NEXT: addi a2, a0, 4 -; ILP32-ILP32F-FPELIM-NEXT: sw a2, 4(sp) -; ILP32-ILP32F-FPELIM-NEXT: lw a2, 0(a0) -; ILP32-ILP32F-FPELIM-NEXT: addi a0, a0, 7 -; ILP32-ILP32F-FPELIM-NEXT: andi a0, a0, -4 -; ILP32-ILP32F-FPELIM-NEXT: addi a3, a0, 4 -; ILP32-ILP32F-FPELIM-NEXT: sw a3, 4(sp) +; ILP32-ILP32F-FPELIM-NEXT: andi a2, a2, -4 ; ILP32-ILP32F-FPELIM-NEXT: lw a0, 0(a0) -; ILP32-ILP32F-FPELIM-NEXT: add a1, a1, s0 -; ILP32-ILP32F-FPELIM-NEXT: add a1, a1, a2 -; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a0 +; ILP32-ILP32F-FPELIM-NEXT: addi a1, a2, 4 +; ILP32-ILP32F-FPELIM-NEXT: addi a3, a2, 7 +; ILP32-ILP32F-FPELIM-NEXT: sw a1, 4(sp) +; ILP32-ILP32F-FPELIM-NEXT: andi a3, a3, -4 +; ILP32-ILP32F-FPELIM-NEXT: lw a1, 0(a2) +; ILP32-ILP32F-FPELIM-NEXT: addi a2, a3, 4 +; ILP32-ILP32F-FPELIM-NEXT: sw a2, 4(sp) +; ILP32-ILP32F-FPELIM-NEXT: lw a2, 0(a3) +; ILP32-ILP32F-FPELIM-NEXT: add a0, a0, s0 +; ILP32-ILP32F-FPELIM-NEXT: add a0, a0, a1 +; ILP32-ILP32F-FPELIM-NEXT: add a0, a0, a2 ; ILP32-ILP32F-FPELIM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; ILP32-ILP32F-FPELIM-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, 48 @@ -1938,21 +1938,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind { ; ILP32-ILP32F-WITHFP-NEXT: addi a0, a0, 3 ; ILP32-ILP32F-WITHFP-NEXT: andi a0, a0, -4 ; ILP32-ILP32F-WITHFP-NEXT: addi a1, a0, 4 +; ILP32-ILP32F-WITHFP-NEXT: addi a2, a0, 7 ; ILP32-ILP32F-WITHFP-NEXT: sw a1, -16(s0) -; ILP32-ILP32F-WITHFP-NEXT: lw a1, 0(a0) -; ILP32-ILP32F-WITHFP-NEXT: addi a0, a0, 7 -; ILP32-ILP32F-WITHFP-NEXT: andi a0, a0, -4 -; ILP32-ILP32F-WITHFP-NEXT: addi a2, a0, 4 -; ILP32-ILP32F-WITHFP-NEXT: sw a2, -16(s0) -; ILP32-ILP32F-WITHFP-NEXT: lw a2, 0(a0) -; ILP32-ILP32F-WITHFP-NEXT: addi a0, a0, 7 -; ILP32-ILP32F-WITHFP-NEXT: andi a0, a0, -4 -; ILP32-ILP32F-WITHFP-NEXT: addi a3, a0, 4 -; ILP32-ILP32F-WITHFP-NEXT: sw a3, -16(s0) +; ILP32-ILP32F-WITHFP-NEXT: andi a2, a2, -4 ; ILP32-ILP32F-WITHFP-NEXT: lw a0, 0(a0) -; ILP32-ILP32F-WITHFP-NEXT: add a1, a1, s1 -; ILP32-ILP32F-WITHFP-NEXT: add a1, a1, a2 -; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a0 +; ILP32-ILP32F-WITHFP-NEXT: addi a1, a2, 4 +; ILP32-ILP32F-WITHFP-NEXT: addi a3, a2, 7 +; ILP32-ILP32F-WITHFP-NEXT: sw a1, -16(s0) +; ILP32-ILP32F-WITHFP-NEXT: andi a3, a3, -4 +; ILP32-ILP32F-WITHFP-NEXT: lw a1, 0(a2) +; ILP32-ILP32F-WITHFP-NEXT: addi a2, a3, 4 +; ILP32-ILP32F-WITHFP-NEXT: sw a2, -16(s0) +; ILP32-ILP32F-WITHFP-NEXT: lw a2, 0(a3) +; ILP32-ILP32F-WITHFP-NEXT: add a0, a0, s1 +; ILP32-ILP32F-WITHFP-NEXT: add a0, a0, a1 +; ILP32-ILP32F-WITHFP-NEXT: add a0, a0, a2 ; ILP32-ILP32F-WITHFP-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; ILP32-ILP32F-WITHFP-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; ILP32-ILP32F-WITHFP-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1980,21 +1980,21 @@ define i32 @va4_va_copy(i32 %argno, ...) 
nounwind { ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, a0, 3 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a0, a0, -4 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a1, a0, 4 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a2, a0, 7 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a1, 4(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a1, 0(a0) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, a0, 7 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a0, a0, -4 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a2, a0, 4 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a2, 4(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a2, 0(a0) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, a0, 7 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a0, a0, -4 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a3, a0, 4 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 4(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a2, a2, -4 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 0(a0) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a1, s0 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a1, a2 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a0 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a1, a2, 4 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a3, a2, 7 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a1, 4(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a3, a3, -4 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a1, 0(a2) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a2, a3, 4 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a2, 4(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a2, 0(a3) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a0, s0 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a0, a1 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a0, a2 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, 48 @@ -2019,21 +2019,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind { ; ILP32E-FPELIM-NEXT: addi a0, a0, 3 ; ILP32E-FPELIM-NEXT: andi a0, a0, -4 ; ILP32E-FPELIM-NEXT: addi a1, a0, 4 +; ILP32E-FPELIM-NEXT: addi a2, a0, 7 ; ILP32E-FPELIM-NEXT: sw a1, 4(sp) -; ILP32E-FPELIM-NEXT: lw a1, 0(a0) -; ILP32E-FPELIM-NEXT: addi a0, a0, 7 -; ILP32E-FPELIM-NEXT: andi a0, a0, -4 -; ILP32E-FPELIM-NEXT: addi a2, a0, 4 -; ILP32E-FPELIM-NEXT: sw a2, 4(sp) -; ILP32E-FPELIM-NEXT: lw a2, 0(a0) -; ILP32E-FPELIM-NEXT: addi a0, a0, 7 -; ILP32E-FPELIM-NEXT: andi a0, a0, -4 -; ILP32E-FPELIM-NEXT: addi a3, a0, 4 -; ILP32E-FPELIM-NEXT: sw a3, 4(sp) +; ILP32E-FPELIM-NEXT: andi a2, a2, -4 ; ILP32E-FPELIM-NEXT: lw a0, 0(a0) -; ILP32E-FPELIM-NEXT: add a1, a1, s0 -; ILP32E-FPELIM-NEXT: add a1, a1, a2 -; ILP32E-FPELIM-NEXT: add a0, a1, a0 +; ILP32E-FPELIM-NEXT: addi a1, a2, 4 +; ILP32E-FPELIM-NEXT: addi a3, a2, 7 +; ILP32E-FPELIM-NEXT: sw a1, 4(sp) +; ILP32E-FPELIM-NEXT: andi a3, a3, -4 +; ILP32E-FPELIM-NEXT: lw a1, 0(a2) +; ILP32E-FPELIM-NEXT: addi a2, a3, 4 +; ILP32E-FPELIM-NEXT: sw a2, 4(sp) +; ILP32E-FPELIM-NEXT: lw a2, 0(a3) +; ILP32E-FPELIM-NEXT: add a0, a0, s0 +; ILP32E-FPELIM-NEXT: add a0, a0, a1 +; ILP32E-FPELIM-NEXT: add a0, a0, a2 ; ILP32E-FPELIM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; ILP32E-FPELIM-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; ILP32E-FPELIM-NEXT: addi sp, sp, 40 @@ -2060,21 +2060,21 @@ define i32 @va4_va_copy(i32 %argno, ...) 
nounwind { ; ILP32E-WITHFP-NEXT: addi a0, a0, 3 ; ILP32E-WITHFP-NEXT: andi a0, a0, -4 ; ILP32E-WITHFP-NEXT: addi a1, a0, 4 +; ILP32E-WITHFP-NEXT: addi a2, a0, 7 ; ILP32E-WITHFP-NEXT: sw a1, -16(s0) -; ILP32E-WITHFP-NEXT: lw a1, 0(a0) -; ILP32E-WITHFP-NEXT: addi a0, a0, 7 -; ILP32E-WITHFP-NEXT: andi a0, a0, -4 -; ILP32E-WITHFP-NEXT: addi a2, a0, 4 -; ILP32E-WITHFP-NEXT: sw a2, -16(s0) -; ILP32E-WITHFP-NEXT: lw a2, 0(a0) -; ILP32E-WITHFP-NEXT: addi a0, a0, 7 -; ILP32E-WITHFP-NEXT: andi a0, a0, -4 -; ILP32E-WITHFP-NEXT: addi a3, a0, 4 -; ILP32E-WITHFP-NEXT: sw a3, -16(s0) +; ILP32E-WITHFP-NEXT: andi a2, a2, -4 ; ILP32E-WITHFP-NEXT: lw a0, 0(a0) -; ILP32E-WITHFP-NEXT: add a1, a1, s1 -; ILP32E-WITHFP-NEXT: add a1, a1, a2 -; ILP32E-WITHFP-NEXT: add a0, a1, a0 +; ILP32E-WITHFP-NEXT: addi a1, a2, 4 +; ILP32E-WITHFP-NEXT: addi a3, a2, 7 +; ILP32E-WITHFP-NEXT: sw a1, -16(s0) +; ILP32E-WITHFP-NEXT: andi a3, a3, -4 +; ILP32E-WITHFP-NEXT: lw a1, 0(a2) +; ILP32E-WITHFP-NEXT: addi a2, a3, 4 +; ILP32E-WITHFP-NEXT: sw a2, -16(s0) +; ILP32E-WITHFP-NEXT: lw a2, 0(a3) +; ILP32E-WITHFP-NEXT: add a0, a0, s1 +; ILP32E-WITHFP-NEXT: add a0, a0, a1 +; ILP32E-WITHFP-NEXT: add a0, a0, a2 ; ILP32E-WITHFP-NEXT: lw ra, 16(sp) # 4-byte Folded Reload ; ILP32E-WITHFP-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; ILP32E-WITHFP-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -2102,21 +2102,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind { ; LP64-LP64F-LP64D-FPELIM-NEXT: addi a0, a0, 3 ; LP64-LP64F-LP64D-FPELIM-NEXT: andi a0, a0, -4 ; LP64-LP64F-LP64D-FPELIM-NEXT: addi a1, a0, 8 +; LP64-LP64F-LP64D-FPELIM-NEXT: addi a2, a0, 11 ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a1, 8(sp) -; LP64-LP64F-LP64D-FPELIM-NEXT: ld a1, 0(a0) -; LP64-LP64F-LP64D-FPELIM-NEXT: addi a0, a0, 11 -; LP64-LP64F-LP64D-FPELIM-NEXT: andi a0, a0, -4 -; LP64-LP64F-LP64D-FPELIM-NEXT: addi a2, a0, 8 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a2, 8(sp) -; LP64-LP64F-LP64D-FPELIM-NEXT: ld a2, 0(a0) -; LP64-LP64F-LP64D-FPELIM-NEXT: addi a0, a0, 11 -; LP64-LP64F-LP64D-FPELIM-NEXT: andi a0, a0, -4 -; LP64-LP64F-LP64D-FPELIM-NEXT: addi a3, a0, 8 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a3, 8(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: andi a2, a2, -4 ; LP64-LP64F-LP64D-FPELIM-NEXT: ld a0, 0(a0) -; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, a1, s0 -; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, a1, a2 -; LP64-LP64F-LP64D-FPELIM-NEXT: addw a0, a1, a0 +; LP64-LP64F-LP64D-FPELIM-NEXT: addi a1, a2, 8 +; LP64-LP64F-LP64D-FPELIM-NEXT: addi a3, a2, 11 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a1, 8(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: andi a3, a3, -4 +; LP64-LP64F-LP64D-FPELIM-NEXT: ld a1, 0(a2) +; LP64-LP64F-LP64D-FPELIM-NEXT: addi a2, a3, 8 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a2, 8(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: ld a2, 0(a3) +; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, a0, s0 +; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, a0, a1 +; LP64-LP64F-LP64D-FPELIM-NEXT: addw a0, a0, a2 ; LP64-LP64F-LP64D-FPELIM-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-FPELIM-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-FPELIM-NEXT: addi sp, sp, 96 @@ -2145,21 +2145,21 @@ define i32 @va4_va_copy(i32 %argno, ...) 
nounwind { ; LP64-LP64F-LP64D-WITHFP-NEXT: addi a0, a0, 3 ; LP64-LP64F-LP64D-WITHFP-NEXT: andi a0, a0, -4 ; LP64-LP64F-LP64D-WITHFP-NEXT: addi a1, a0, 8 +; LP64-LP64F-LP64D-WITHFP-NEXT: addi a2, a0, 11 ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a1, -32(s0) -; LP64-LP64F-LP64D-WITHFP-NEXT: ld a1, 0(a0) -; LP64-LP64F-LP64D-WITHFP-NEXT: addi a0, a0, 11 -; LP64-LP64F-LP64D-WITHFP-NEXT: andi a0, a0, -4 -; LP64-LP64F-LP64D-WITHFP-NEXT: addi a2, a0, 8 -; LP64-LP64F-LP64D-WITHFP-NEXT: sd a2, -32(s0) -; LP64-LP64F-LP64D-WITHFP-NEXT: ld a2, 0(a0) -; LP64-LP64F-LP64D-WITHFP-NEXT: addi a0, a0, 11 -; LP64-LP64F-LP64D-WITHFP-NEXT: andi a0, a0, -4 -; LP64-LP64F-LP64D-WITHFP-NEXT: addi a3, a0, 8 -; LP64-LP64F-LP64D-WITHFP-NEXT: sd a3, -32(s0) +; LP64-LP64F-LP64D-WITHFP-NEXT: andi a2, a2, -4 ; LP64-LP64F-LP64D-WITHFP-NEXT: ld a0, 0(a0) -; LP64-LP64F-LP64D-WITHFP-NEXT: add a1, a1, s1 -; LP64-LP64F-LP64D-WITHFP-NEXT: add a1, a1, a2 -; LP64-LP64F-LP64D-WITHFP-NEXT: addw a0, a1, a0 +; LP64-LP64F-LP64D-WITHFP-NEXT: addi a1, a2, 8 +; LP64-LP64F-LP64D-WITHFP-NEXT: addi a3, a2, 11 +; LP64-LP64F-LP64D-WITHFP-NEXT: sd a1, -32(s0) +; LP64-LP64F-LP64D-WITHFP-NEXT: andi a3, a3, -4 +; LP64-LP64F-LP64D-WITHFP-NEXT: ld a1, 0(a2) +; LP64-LP64F-LP64D-WITHFP-NEXT: addi a2, a3, 8 +; LP64-LP64F-LP64D-WITHFP-NEXT: sd a2, -32(s0) +; LP64-LP64F-LP64D-WITHFP-NEXT: ld a2, 0(a3) +; LP64-LP64F-LP64D-WITHFP-NEXT: add a0, a0, s1 +; LP64-LP64F-LP64D-WITHFP-NEXT: add a0, a0, a1 +; LP64-LP64F-LP64D-WITHFP-NEXT: addw a0, a0, a2 ; LP64-LP64F-LP64D-WITHFP-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-WITHFP-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-WITHFP-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -2185,21 +2185,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind { ; LP64E-FPELIM-NEXT: addi a0, a0, 3 ; LP64E-FPELIM-NEXT: andi a0, a0, -4 ; LP64E-FPELIM-NEXT: addi a1, a0, 8 +; LP64E-FPELIM-NEXT: addi a2, a0, 11 ; LP64E-FPELIM-NEXT: sd a1, 8(sp) -; LP64E-FPELIM-NEXT: ld a1, 0(a0) -; LP64E-FPELIM-NEXT: addi a0, a0, 11 -; LP64E-FPELIM-NEXT: andi a0, a0, -4 -; LP64E-FPELIM-NEXT: addi a2, a0, 8 -; LP64E-FPELIM-NEXT: sd a2, 8(sp) -; LP64E-FPELIM-NEXT: ld a2, 0(a0) -; LP64E-FPELIM-NEXT: addi a0, a0, 11 -; LP64E-FPELIM-NEXT: andi a0, a0, -4 -; LP64E-FPELIM-NEXT: addi a3, a0, 8 -; LP64E-FPELIM-NEXT: sd a3, 8(sp) +; LP64E-FPELIM-NEXT: andi a2, a2, -4 ; LP64E-FPELIM-NEXT: ld a0, 0(a0) -; LP64E-FPELIM-NEXT: add a1, a1, s0 -; LP64E-FPELIM-NEXT: add a1, a1, a2 -; LP64E-FPELIM-NEXT: addw a0, a1, a0 +; LP64E-FPELIM-NEXT: addi a1, a2, 8 +; LP64E-FPELIM-NEXT: addi a3, a2, 11 +; LP64E-FPELIM-NEXT: sd a1, 8(sp) +; LP64E-FPELIM-NEXT: andi a3, a3, -4 +; LP64E-FPELIM-NEXT: ld a1, 0(a2) +; LP64E-FPELIM-NEXT: addi a2, a3, 8 +; LP64E-FPELIM-NEXT: sd a2, 8(sp) +; LP64E-FPELIM-NEXT: ld a2, 0(a3) +; LP64E-FPELIM-NEXT: add a0, a0, s0 +; LP64E-FPELIM-NEXT: add a0, a0, a1 +; LP64E-FPELIM-NEXT: addw a0, a0, a2 ; LP64E-FPELIM-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; LP64E-FPELIM-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; LP64E-FPELIM-NEXT: addi sp, sp, 80 @@ -2226,21 +2226,21 @@ define i32 @va4_va_copy(i32 %argno, ...) 
nounwind { ; LP64E-WITHFP-NEXT: addi a0, a0, 3 ; LP64E-WITHFP-NEXT: andi a0, a0, -4 ; LP64E-WITHFP-NEXT: addi a1, a0, 8 +; LP64E-WITHFP-NEXT: addi a2, a0, 11 ; LP64E-WITHFP-NEXT: sd a1, -32(s0) -; LP64E-WITHFP-NEXT: ld a1, 0(a0) -; LP64E-WITHFP-NEXT: addi a0, a0, 11 -; LP64E-WITHFP-NEXT: andi a0, a0, -4 -; LP64E-WITHFP-NEXT: addi a2, a0, 8 -; LP64E-WITHFP-NEXT: sd a2, -32(s0) -; LP64E-WITHFP-NEXT: ld a2, 0(a0) -; LP64E-WITHFP-NEXT: addi a0, a0, 11 -; LP64E-WITHFP-NEXT: andi a0, a0, -4 -; LP64E-WITHFP-NEXT: addi a3, a0, 8 -; LP64E-WITHFP-NEXT: sd a3, -32(s0) +; LP64E-WITHFP-NEXT: andi a2, a2, -4 ; LP64E-WITHFP-NEXT: ld a0, 0(a0) -; LP64E-WITHFP-NEXT: add a1, a1, s1 -; LP64E-WITHFP-NEXT: add a1, a1, a2 -; LP64E-WITHFP-NEXT: addw a0, a1, a0 +; LP64E-WITHFP-NEXT: addi a1, a2, 8 +; LP64E-WITHFP-NEXT: addi a3, a2, 11 +; LP64E-WITHFP-NEXT: sd a1, -32(s0) +; LP64E-WITHFP-NEXT: andi a3, a3, -4 +; LP64E-WITHFP-NEXT: ld a1, 0(a2) +; LP64E-WITHFP-NEXT: addi a2, a3, 8 +; LP64E-WITHFP-NEXT: sd a2, -32(s0) +; LP64E-WITHFP-NEXT: ld a2, 0(a3) +; LP64E-WITHFP-NEXT: add a0, a0, s1 +; LP64E-WITHFP-NEXT: add a0, a0, a1 +; LP64E-WITHFP-NEXT: addw a0, a0, a2 ; LP64E-WITHFP-NEXT: ld ra, 32(sp) # 8-byte Folded Reload ; LP64E-WITHFP-NEXT: ld s0, 24(sp) # 8-byte Folded Reload ; LP64E-WITHFP-NEXT: ld s1, 16(sp) # 8-byte Folded Reload @@ -2275,36 +2275,36 @@ define void @va5_aligned_stack_caller() nounwind { ; ILP32-ILP32F-FPELIM: # %bb.0: ; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, -64 ; ILP32-ILP32F-FPELIM-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; ILP32-ILP32F-FPELIM-NEXT: li a0, 17 -; ILP32-ILP32F-FPELIM-NEXT: li a1, 16 -; ILP32-ILP32F-FPELIM-NEXT: sw a1, 20(sp) -; ILP32-ILP32F-FPELIM-NEXT: sw a0, 24(sp) -; ILP32-ILP32F-FPELIM-NEXT: li a0, 15 -; ILP32-ILP32F-FPELIM-NEXT: lui a1, 262236 -; ILP32-ILP32F-FPELIM-NEXT: addi a1, a1, 655 -; ILP32-ILP32F-FPELIM-NEXT: lui a2, 377487 -; ILP32-ILP32F-FPELIM-NEXT: addi a2, a2, 1475 -; ILP32-ILP32F-FPELIM-NEXT: li a3, 14 -; ILP32-ILP32F-FPELIM-NEXT: sw a3, 0(sp) -; ILP32-ILP32F-FPELIM-NEXT: sw a2, 8(sp) -; ILP32-ILP32F-FPELIM-NEXT: sw a1, 12(sp) -; ILP32-ILP32F-FPELIM-NEXT: sw a0, 16(sp) -; ILP32-ILP32F-FPELIM-NEXT: lui a0, 262153 -; ILP32-ILP32F-FPELIM-NEXT: addi a5, a0, 491 -; ILP32-ILP32F-FPELIM-NEXT: lui a0, 545260 -; ILP32-ILP32F-FPELIM-NEXT: addi t0, a0, -1967 -; ILP32-ILP32F-FPELIM-NEXT: lui a0, 964690 -; ILP32-ILP32F-FPELIM-NEXT: addi t1, a0, -328 -; ILP32-ILP32F-FPELIM-NEXT: lui a0, 335544 -; ILP32-ILP32F-FPELIM-NEXT: addi t2, a0, 1311 -; ILP32-ILP32F-FPELIM-NEXT: lui a0, 688509 -; ILP32-ILP32F-FPELIM-NEXT: addi a6, a0, -2048 +; ILP32-ILP32F-FPELIM-NEXT: li a4, 17 +; ILP32-ILP32F-FPELIM-NEXT: li a5, 16 +; ILP32-ILP32F-FPELIM-NEXT: li a6, 15 +; ILP32-ILP32F-FPELIM-NEXT: lui a7, 262236 +; ILP32-ILP32F-FPELIM-NEXT: lui t0, 377487 +; ILP32-ILP32F-FPELIM-NEXT: li t1, 14 +; ILP32-ILP32F-FPELIM-NEXT: lui t2, 262153 +; ILP32-ILP32F-FPELIM-NEXT: lui t3, 545260 +; ILP32-ILP32F-FPELIM-NEXT: lui t4, 964690 +; ILP32-ILP32F-FPELIM-NEXT: lui t5, 335544 +; ILP32-ILP32F-FPELIM-NEXT: lui t6, 688509 ; ILP32-ILP32F-FPELIM-NEXT: li a0, 1 ; ILP32-ILP32F-FPELIM-NEXT: li a1, 11 ; ILP32-ILP32F-FPELIM-NEXT: addi a2, sp, 32 ; ILP32-ILP32F-FPELIM-NEXT: li a3, 12 +; ILP32-ILP32F-FPELIM-NEXT: sw a5, 20(sp) +; ILP32-ILP32F-FPELIM-NEXT: sw a4, 24(sp) ; ILP32-ILP32F-FPELIM-NEXT: li a4, 13 +; ILP32-ILP32F-FPELIM-NEXT: addi a5, a7, 655 +; ILP32-ILP32F-FPELIM-NEXT: addi a7, t0, 1475 +; ILP32-ILP32F-FPELIM-NEXT: sw t1, 0(sp) +; ILP32-ILP32F-FPELIM-NEXT: sw a7, 8(sp) +; ILP32-ILP32F-FPELIM-NEXT: sw a5, 
12(sp) +; ILP32-ILP32F-FPELIM-NEXT: sw a6, 16(sp) ; ILP32-ILP32F-FPELIM-NEXT: li a7, 4 +; ILP32-ILP32F-FPELIM-NEXT: addi a5, t2, 491 +; ILP32-ILP32F-FPELIM-NEXT: addi t0, t3, -1967 +; ILP32-ILP32F-FPELIM-NEXT: addi t1, t4, -328 +; ILP32-ILP32F-FPELIM-NEXT: addi t2, t5, 1311 +; ILP32-ILP32F-FPELIM-NEXT: addi a6, t6, -2048 ; ILP32-ILP32F-FPELIM-NEXT: sw t2, 32(sp) ; ILP32-ILP32F-FPELIM-NEXT: sw t1, 36(sp) ; ILP32-ILP32F-FPELIM-NEXT: sw t0, 40(sp) @@ -2320,36 +2320,36 @@ define void @va5_aligned_stack_caller() nounwind { ; ILP32-ILP32F-WITHFP-NEXT: sw ra, 60(sp) # 4-byte Folded Spill ; ILP32-ILP32F-WITHFP-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; ILP32-ILP32F-WITHFP-NEXT: addi s0, sp, 64 -; ILP32-ILP32F-WITHFP-NEXT: li a0, 17 -; ILP32-ILP32F-WITHFP-NEXT: li a1, 16 -; ILP32-ILP32F-WITHFP-NEXT: sw a1, 20(sp) -; ILP32-ILP32F-WITHFP-NEXT: sw a0, 24(sp) -; ILP32-ILP32F-WITHFP-NEXT: li a0, 15 -; ILP32-ILP32F-WITHFP-NEXT: lui a1, 262236 -; ILP32-ILP32F-WITHFP-NEXT: addi a1, a1, 655 -; ILP32-ILP32F-WITHFP-NEXT: lui a2, 377487 -; ILP32-ILP32F-WITHFP-NEXT: addi a2, a2, 1475 -; ILP32-ILP32F-WITHFP-NEXT: li a3, 14 -; ILP32-ILP32F-WITHFP-NEXT: sw a3, 0(sp) -; ILP32-ILP32F-WITHFP-NEXT: sw a2, 8(sp) -; ILP32-ILP32F-WITHFP-NEXT: sw a1, 12(sp) -; ILP32-ILP32F-WITHFP-NEXT: sw a0, 16(sp) -; ILP32-ILP32F-WITHFP-NEXT: lui a0, 262153 -; ILP32-ILP32F-WITHFP-NEXT: addi a5, a0, 491 -; ILP32-ILP32F-WITHFP-NEXT: lui a0, 545260 -; ILP32-ILP32F-WITHFP-NEXT: addi t0, a0, -1967 -; ILP32-ILP32F-WITHFP-NEXT: lui a0, 964690 -; ILP32-ILP32F-WITHFP-NEXT: addi t1, a0, -328 -; ILP32-ILP32F-WITHFP-NEXT: lui a0, 335544 -; ILP32-ILP32F-WITHFP-NEXT: addi t2, a0, 1311 -; ILP32-ILP32F-WITHFP-NEXT: lui a0, 688509 -; ILP32-ILP32F-WITHFP-NEXT: addi a6, a0, -2048 +; ILP32-ILP32F-WITHFP-NEXT: li a4, 17 +; ILP32-ILP32F-WITHFP-NEXT: li a5, 16 +; ILP32-ILP32F-WITHFP-NEXT: li a6, 15 +; ILP32-ILP32F-WITHFP-NEXT: lui a7, 262236 +; ILP32-ILP32F-WITHFP-NEXT: lui t0, 377487 +; ILP32-ILP32F-WITHFP-NEXT: li t1, 14 +; ILP32-ILP32F-WITHFP-NEXT: lui t2, 262153 +; ILP32-ILP32F-WITHFP-NEXT: lui t3, 545260 +; ILP32-ILP32F-WITHFP-NEXT: lui t4, 964690 +; ILP32-ILP32F-WITHFP-NEXT: lui t5, 335544 +; ILP32-ILP32F-WITHFP-NEXT: lui t6, 688509 ; ILP32-ILP32F-WITHFP-NEXT: li a0, 1 ; ILP32-ILP32F-WITHFP-NEXT: li a1, 11 ; ILP32-ILP32F-WITHFP-NEXT: addi a2, s0, -32 ; ILP32-ILP32F-WITHFP-NEXT: li a3, 12 +; ILP32-ILP32F-WITHFP-NEXT: sw a5, 20(sp) +; ILP32-ILP32F-WITHFP-NEXT: sw a4, 24(sp) ; ILP32-ILP32F-WITHFP-NEXT: li a4, 13 +; ILP32-ILP32F-WITHFP-NEXT: addi a5, a7, 655 +; ILP32-ILP32F-WITHFP-NEXT: addi a7, t0, 1475 +; ILP32-ILP32F-WITHFP-NEXT: sw t1, 0(sp) +; ILP32-ILP32F-WITHFP-NEXT: sw a7, 8(sp) +; ILP32-ILP32F-WITHFP-NEXT: sw a5, 12(sp) +; ILP32-ILP32F-WITHFP-NEXT: sw a6, 16(sp) ; ILP32-ILP32F-WITHFP-NEXT: li a7, 4 +; ILP32-ILP32F-WITHFP-NEXT: addi a5, t2, 491 +; ILP32-ILP32F-WITHFP-NEXT: addi t0, t3, -1967 +; ILP32-ILP32F-WITHFP-NEXT: addi t1, t4, -328 +; ILP32-ILP32F-WITHFP-NEXT: addi t2, t5, 1311 +; ILP32-ILP32F-WITHFP-NEXT: addi a6, t6, -2048 ; ILP32-ILP32F-WITHFP-NEXT: sw t2, -32(s0) ; ILP32-ILP32F-WITHFP-NEXT: sw t1, -28(s0) ; ILP32-ILP32F-WITHFP-NEXT: sw t0, -24(s0) @@ -2364,36 +2364,36 @@ define void @va5_aligned_stack_caller() nounwind { ; RV32D-ILP32-ILP32F-ILP32D-FPELIM: # %bb.0: ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, -64 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a0, 262236 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, a0, 655 -; 
RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a1, 377487 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a1, a1, 1475 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a2, 17 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a3, 16 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 20(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a2, 24(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a2, 15 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a3, 14 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 0(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a1, 8(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a0, 12(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a2, 16(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a0, 262153 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a5, a0, 491 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a0, 545260 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi t0, a0, -1967 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a0, 964690 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi t1, a0, -328 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a0, 335544 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi t2, a0, 1311 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a0, 688509 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a6, a0, -2048 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a5, 262236 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a6, 377487 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a4, 17 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a7, 16 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li t0, 15 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li t1, 14 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t2, 262153 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t3, 545260 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t4, 964690 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t5, 335544 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t6, 688509 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a0, 1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a1, 11 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a2, sp, 32 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a3, 12 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a7, 20(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a4, 24(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a4, 13 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a5, a5, 655 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a6, a6, 1475 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t1, 0(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a6, 8(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a5, 12(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t0, 16(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a7, 4 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a5, t2, 491 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi t0, t3, -1967 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi t1, t4, -328 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi t2, t5, 1311 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a6, t6, -2048 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t2, 32(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t1, 36(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t0, 40(sp) @@ -2410,37 +2410,37 @@ define void @va5_aligned_stack_caller() nounwind { ; ILP32E-FPELIM-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; ILP32E-FPELIM-NEXT: addi s0, sp, 64 ; ILP32E-FPELIM-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-NEXT: li a0, 17 -; ILP32E-FPELIM-NEXT: li a1, 16 -; ILP32E-FPELIM-NEXT: li a2, 15 -; ILP32E-FPELIM-NEXT: sw a2, 16(sp) -; ILP32E-FPELIM-NEXT: sw a1, 20(sp) -; ILP32E-FPELIM-NEXT: sw a0, 24(sp) -; ILP32E-FPELIM-NEXT: lui a0, 262236 -; 
ILP32E-FPELIM-NEXT: addi a0, a0, 655 -; ILP32E-FPELIM-NEXT: lui a1, 377487 -; ILP32E-FPELIM-NEXT: addi a1, a1, 1475 -; ILP32E-FPELIM-NEXT: li a2, 14 -; ILP32E-FPELIM-NEXT: li a3, 4 -; ILP32E-FPELIM-NEXT: sw a3, 0(sp) -; ILP32E-FPELIM-NEXT: sw a2, 4(sp) -; ILP32E-FPELIM-NEXT: sw a1, 8(sp) -; ILP32E-FPELIM-NEXT: sw a0, 12(sp) -; ILP32E-FPELIM-NEXT: lui a0, 262153 -; ILP32E-FPELIM-NEXT: addi a6, a0, 491 -; ILP32E-FPELIM-NEXT: lui a0, 545260 -; ILP32E-FPELIM-NEXT: addi a7, a0, -1967 -; ILP32E-FPELIM-NEXT: lui a0, 964690 -; ILP32E-FPELIM-NEXT: addi t0, a0, -328 -; ILP32E-FPELIM-NEXT: lui a0, 335544 -; ILP32E-FPELIM-NEXT: addi t1, a0, 1311 -; ILP32E-FPELIM-NEXT: lui a0, 688509 -; ILP32E-FPELIM-NEXT: addi a5, a0, -2048 +; ILP32E-FPELIM-NEXT: li a3, 17 +; ILP32E-FPELIM-NEXT: li a4, 16 +; ILP32E-FPELIM-NEXT: li a5, 15 +; ILP32E-FPELIM-NEXT: lui a6, 262236 +; ILP32E-FPELIM-NEXT: lui a7, 377487 +; ILP32E-FPELIM-NEXT: li t0, 14 +; ILP32E-FPELIM-NEXT: li t1, 4 +; ILP32E-FPELIM-NEXT: lui t2, 262153 +; ILP32E-FPELIM-NEXT: lui t3, 545260 +; ILP32E-FPELIM-NEXT: lui t4, 964690 +; ILP32E-FPELIM-NEXT: lui t5, 335544 +; ILP32E-FPELIM-NEXT: lui t6, 688509 ; ILP32E-FPELIM-NEXT: li a0, 1 ; ILP32E-FPELIM-NEXT: li a1, 11 ; ILP32E-FPELIM-NEXT: addi a2, sp, 32 +; ILP32E-FPELIM-NEXT: sw a5, 16(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 24(sp) ; ILP32E-FPELIM-NEXT: li a3, 12 +; ILP32E-FPELIM-NEXT: addi a4, a6, 655 +; ILP32E-FPELIM-NEXT: addi a5, a7, 1475 +; ILP32E-FPELIM-NEXT: sw t1, 0(sp) +; ILP32E-FPELIM-NEXT: sw t0, 4(sp) +; ILP32E-FPELIM-NEXT: sw a5, 8(sp) +; ILP32E-FPELIM-NEXT: sw a4, 12(sp) ; ILP32E-FPELIM-NEXT: li a4, 13 +; ILP32E-FPELIM-NEXT: addi a6, t2, 491 +; ILP32E-FPELIM-NEXT: addi a7, t3, -1967 +; ILP32E-FPELIM-NEXT: addi t0, t4, -328 +; ILP32E-FPELIM-NEXT: addi t1, t5, 1311 +; ILP32E-FPELIM-NEXT: addi a5, t6, -2048 ; ILP32E-FPELIM-NEXT: sw t1, 32(sp) ; ILP32E-FPELIM-NEXT: sw t0, 36(sp) ; ILP32E-FPELIM-NEXT: sw a7, 40(sp) @@ -2459,37 +2459,37 @@ define void @va5_aligned_stack_caller() nounwind { ; ILP32E-WITHFP-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: addi s0, sp, 64 ; ILP32E-WITHFP-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-NEXT: li a0, 17 -; ILP32E-WITHFP-NEXT: li a1, 16 -; ILP32E-WITHFP-NEXT: li a2, 15 -; ILP32E-WITHFP-NEXT: sw a2, 16(sp) -; ILP32E-WITHFP-NEXT: sw a1, 20(sp) -; ILP32E-WITHFP-NEXT: sw a0, 24(sp) -; ILP32E-WITHFP-NEXT: lui a0, 262236 -; ILP32E-WITHFP-NEXT: addi a0, a0, 655 -; ILP32E-WITHFP-NEXT: lui a1, 377487 -; ILP32E-WITHFP-NEXT: addi a1, a1, 1475 -; ILP32E-WITHFP-NEXT: li a2, 14 -; ILP32E-WITHFP-NEXT: li a3, 4 -; ILP32E-WITHFP-NEXT: sw a3, 0(sp) -; ILP32E-WITHFP-NEXT: sw a2, 4(sp) -; ILP32E-WITHFP-NEXT: sw a1, 8(sp) -; ILP32E-WITHFP-NEXT: sw a0, 12(sp) -; ILP32E-WITHFP-NEXT: lui a0, 262153 -; ILP32E-WITHFP-NEXT: addi a6, a0, 491 -; ILP32E-WITHFP-NEXT: lui a0, 545260 -; ILP32E-WITHFP-NEXT: addi a7, a0, -1967 -; ILP32E-WITHFP-NEXT: lui a0, 964690 -; ILP32E-WITHFP-NEXT: addi t0, a0, -328 -; ILP32E-WITHFP-NEXT: lui a0, 335544 -; ILP32E-WITHFP-NEXT: addi t1, a0, 1311 -; ILP32E-WITHFP-NEXT: lui a0, 688509 -; ILP32E-WITHFP-NEXT: addi a5, a0, -2048 +; ILP32E-WITHFP-NEXT: li a3, 17 +; ILP32E-WITHFP-NEXT: li a4, 16 +; ILP32E-WITHFP-NEXT: li a5, 15 +; ILP32E-WITHFP-NEXT: lui a6, 262236 +; ILP32E-WITHFP-NEXT: lui a7, 377487 +; ILP32E-WITHFP-NEXT: li t0, 14 +; ILP32E-WITHFP-NEXT: li t1, 4 +; ILP32E-WITHFP-NEXT: lui t2, 262153 +; ILP32E-WITHFP-NEXT: lui t3, 545260 +; ILP32E-WITHFP-NEXT: lui t4, 964690 +; ILP32E-WITHFP-NEXT: lui t5, 335544 +; 
ILP32E-WITHFP-NEXT: lui t6, 688509 ; ILP32E-WITHFP-NEXT: li a0, 1 ; ILP32E-WITHFP-NEXT: li a1, 11 ; ILP32E-WITHFP-NEXT: addi a2, sp, 32 +; ILP32E-WITHFP-NEXT: sw a5, 16(sp) +; ILP32E-WITHFP-NEXT: sw a4, 20(sp) +; ILP32E-WITHFP-NEXT: sw a3, 24(sp) ; ILP32E-WITHFP-NEXT: li a3, 12 +; ILP32E-WITHFP-NEXT: addi a4, a6, 655 +; ILP32E-WITHFP-NEXT: addi a5, a7, 1475 +; ILP32E-WITHFP-NEXT: sw t1, 0(sp) +; ILP32E-WITHFP-NEXT: sw t0, 4(sp) +; ILP32E-WITHFP-NEXT: sw a5, 8(sp) +; ILP32E-WITHFP-NEXT: sw a4, 12(sp) ; ILP32E-WITHFP-NEXT: li a4, 13 +; ILP32E-WITHFP-NEXT: addi a6, t2, 491 +; ILP32E-WITHFP-NEXT: addi a7, t3, -1967 +; ILP32E-WITHFP-NEXT: addi t0, t4, -328 +; ILP32E-WITHFP-NEXT: addi t1, t5, 1311 +; ILP32E-WITHFP-NEXT: addi a5, t6, -2048 ; ILP32E-WITHFP-NEXT: sw t1, 32(sp) ; ILP32E-WITHFP-NEXT: sw t0, 36(sp) ; ILP32E-WITHFP-NEXT: sw a7, 40(sp) @@ -2508,21 +2508,21 @@ define void @va5_aligned_stack_caller() nounwind { ; LP64-LP64F-LP64D-FPELIM-NEXT: li t0, 17 ; LP64-LP64F-LP64D-FPELIM-NEXT: li t1, 16 ; LP64-LP64F-LP64D-FPELIM-NEXT: li t2, 15 -; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, %hi(.LCPI11_0) -; LP64-LP64F-LP64D-FPELIM-NEXT: ld t3, %lo(.LCPI11_0)(a0) -; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, %hi(.LCPI11_1) -; LP64-LP64F-LP64D-FPELIM-NEXT: ld a2, %lo(.LCPI11_1)(a0) -; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, %hi(.LCPI11_2) -; LP64-LP64F-LP64D-FPELIM-NEXT: ld a3, %lo(.LCPI11_2)(a0) -; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, 2384 -; LP64-LP64F-LP64D-FPELIM-NEXT: addiw a6, a0, 761 -; LP64-LP64F-LP64D-FPELIM-NEXT: slli a6, a6, 11 +; LP64-LP64F-LP64D-FPELIM-NEXT: lui a2, %hi(.LCPI11_0) +; LP64-LP64F-LP64D-FPELIM-NEXT: lui a3, %hi(.LCPI11_1) +; LP64-LP64F-LP64D-FPELIM-NEXT: lui a6, %hi(.LCPI11_2) +; LP64-LP64F-LP64D-FPELIM-NEXT: lui t3, 2384 ; LP64-LP64F-LP64D-FPELIM-NEXT: li a0, 1 ; LP64-LP64F-LP64D-FPELIM-NEXT: li a1, 11 ; LP64-LP64F-LP64D-FPELIM-NEXT: li a4, 12 ; LP64-LP64F-LP64D-FPELIM-NEXT: li a5, 13 ; LP64-LP64F-LP64D-FPELIM-NEXT: li a7, 14 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd t3, 0(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: ld t4, %lo(.LCPI11_0)(a2) +; LP64-LP64F-LP64D-FPELIM-NEXT: ld a2, %lo(.LCPI11_1)(a3) +; LP64-LP64F-LP64D-FPELIM-NEXT: ld a3, %lo(.LCPI11_2)(a6) +; LP64-LP64F-LP64D-FPELIM-NEXT: addiw a6, t3, 761 +; LP64-LP64F-LP64D-FPELIM-NEXT: slli a6, a6, 11 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd t4, 0(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd t2, 8(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd t1, 16(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd t0, 24(sp) @@ -2540,21 +2540,21 @@ define void @va5_aligned_stack_caller() nounwind { ; LP64-LP64F-LP64D-WITHFP-NEXT: li t0, 17 ; LP64-LP64F-LP64D-WITHFP-NEXT: li t1, 16 ; LP64-LP64F-LP64D-WITHFP-NEXT: li t2, 15 -; LP64-LP64F-LP64D-WITHFP-NEXT: lui a0, %hi(.LCPI11_0) -; LP64-LP64F-LP64D-WITHFP-NEXT: ld t3, %lo(.LCPI11_0)(a0) -; LP64-LP64F-LP64D-WITHFP-NEXT: lui a0, %hi(.LCPI11_1) -; LP64-LP64F-LP64D-WITHFP-NEXT: ld a2, %lo(.LCPI11_1)(a0) -; LP64-LP64F-LP64D-WITHFP-NEXT: lui a0, %hi(.LCPI11_2) -; LP64-LP64F-LP64D-WITHFP-NEXT: ld a3, %lo(.LCPI11_2)(a0) -; LP64-LP64F-LP64D-WITHFP-NEXT: lui a0, 2384 -; LP64-LP64F-LP64D-WITHFP-NEXT: addiw a6, a0, 761 -; LP64-LP64F-LP64D-WITHFP-NEXT: slli a6, a6, 11 +; LP64-LP64F-LP64D-WITHFP-NEXT: lui a2, %hi(.LCPI11_0) +; LP64-LP64F-LP64D-WITHFP-NEXT: lui a3, %hi(.LCPI11_1) +; LP64-LP64F-LP64D-WITHFP-NEXT: lui a6, %hi(.LCPI11_2) +; LP64-LP64F-LP64D-WITHFP-NEXT: lui t3, 2384 ; LP64-LP64F-LP64D-WITHFP-NEXT: li a0, 1 ; LP64-LP64F-LP64D-WITHFP-NEXT: li a1, 11 ; LP64-LP64F-LP64D-WITHFP-NEXT: li a4, 12 ; LP64-LP64F-LP64D-WITHFP-NEXT: li a5, 13 ; 
LP64-LP64F-LP64D-WITHFP-NEXT: li a7, 14 -; LP64-LP64F-LP64D-WITHFP-NEXT: sd t3, 0(sp) +; LP64-LP64F-LP64D-WITHFP-NEXT: ld t4, %lo(.LCPI11_0)(a2) +; LP64-LP64F-LP64D-WITHFP-NEXT: ld a2, %lo(.LCPI11_1)(a3) +; LP64-LP64F-LP64D-WITHFP-NEXT: ld a3, %lo(.LCPI11_2)(a6) +; LP64-LP64F-LP64D-WITHFP-NEXT: addiw a6, t3, 761 +; LP64-LP64F-LP64D-WITHFP-NEXT: slli a6, a6, 11 +; LP64-LP64F-LP64D-WITHFP-NEXT: sd t4, 0(sp) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd t2, 8(sp) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd t1, 16(sp) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd t0, 24(sp) @@ -2568,25 +2568,25 @@ define void @va5_aligned_stack_caller() nounwind { ; LP64E-FPELIM: # %bb.0: ; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: sd ra, 48(sp) # 8-byte Folded Spill -; LP64E-FPELIM-NEXT: li a0, 17 -; LP64E-FPELIM-NEXT: li a1, 16 -; LP64E-FPELIM-NEXT: sd a1, 32(sp) -; LP64E-FPELIM-NEXT: sd a0, 40(sp) +; LP64E-FPELIM-NEXT: li a2, 17 +; LP64E-FPELIM-NEXT: li a3, 16 ; LP64E-FPELIM-NEXT: li a6, 15 -; LP64E-FPELIM-NEXT: lui a0, %hi(.LCPI11_0) -; LP64E-FPELIM-NEXT: ld a7, %lo(.LCPI11_0)(a0) +; LP64E-FPELIM-NEXT: lui a7, %hi(.LCPI11_0) ; LP64E-FPELIM-NEXT: li t0, 14 -; LP64E-FPELIM-NEXT: lui a0, 2384 -; LP64E-FPELIM-NEXT: addiw a0, a0, 761 -; LP64E-FPELIM-NEXT: slli t1, a0, 11 -; LP64E-FPELIM-NEXT: lui a0, %hi(.LCPI11_1) -; LP64E-FPELIM-NEXT: ld a2, %lo(.LCPI11_1)(a0) -; LP64E-FPELIM-NEXT: lui a0, %hi(.LCPI11_2) -; LP64E-FPELIM-NEXT: ld a3, %lo(.LCPI11_2)(a0) +; LP64E-FPELIM-NEXT: lui t1, 2384 +; LP64E-FPELIM-NEXT: lui t2, %hi(.LCPI11_1) +; LP64E-FPELIM-NEXT: lui t3, %hi(.LCPI11_2) ; LP64E-FPELIM-NEXT: li a0, 1 ; LP64E-FPELIM-NEXT: li a1, 11 ; LP64E-FPELIM-NEXT: li a4, 12 +; LP64E-FPELIM-NEXT: sd a3, 32(sp) +; LP64E-FPELIM-NEXT: sd a2, 40(sp) ; LP64E-FPELIM-NEXT: li a5, 13 +; LP64E-FPELIM-NEXT: ld a7, %lo(.LCPI11_0)(a7) +; LP64E-FPELIM-NEXT: addiw t1, t1, 761 +; LP64E-FPELIM-NEXT: ld a2, %lo(.LCPI11_1)(t2) +; LP64E-FPELIM-NEXT: ld a3, %lo(.LCPI11_2)(t3) +; LP64E-FPELIM-NEXT: slli t1, t1, 11 ; LP64E-FPELIM-NEXT: sd t1, 0(sp) ; LP64E-FPELIM-NEXT: sd t0, 8(sp) ; LP64E-FPELIM-NEXT: sd a7, 16(sp) @@ -2602,25 +2602,25 @@ define void @va5_aligned_stack_caller() nounwind { ; LP64E-WITHFP-NEXT: sd ra, 56(sp) # 8-byte Folded Spill ; LP64E-WITHFP-NEXT: sd s0, 48(sp) # 8-byte Folded Spill ; LP64E-WITHFP-NEXT: addi s0, sp, 64 -; LP64E-WITHFP-NEXT: li a0, 17 -; LP64E-WITHFP-NEXT: li a1, 16 -; LP64E-WITHFP-NEXT: sd a1, 32(sp) -; LP64E-WITHFP-NEXT: sd a0, 40(sp) +; LP64E-WITHFP-NEXT: li a2, 17 +; LP64E-WITHFP-NEXT: li a3, 16 ; LP64E-WITHFP-NEXT: li a6, 15 -; LP64E-WITHFP-NEXT: lui a0, %hi(.LCPI11_0) -; LP64E-WITHFP-NEXT: ld a7, %lo(.LCPI11_0)(a0) +; LP64E-WITHFP-NEXT: lui a7, %hi(.LCPI11_0) ; LP64E-WITHFP-NEXT: li t0, 14 -; LP64E-WITHFP-NEXT: lui a0, 2384 -; LP64E-WITHFP-NEXT: addiw a0, a0, 761 -; LP64E-WITHFP-NEXT: slli t1, a0, 11 -; LP64E-WITHFP-NEXT: lui a0, %hi(.LCPI11_1) -; LP64E-WITHFP-NEXT: ld a2, %lo(.LCPI11_1)(a0) -; LP64E-WITHFP-NEXT: lui a0, %hi(.LCPI11_2) -; LP64E-WITHFP-NEXT: ld a3, %lo(.LCPI11_2)(a0) +; LP64E-WITHFP-NEXT: lui t1, 2384 +; LP64E-WITHFP-NEXT: lui t2, %hi(.LCPI11_1) +; LP64E-WITHFP-NEXT: lui t3, %hi(.LCPI11_2) ; LP64E-WITHFP-NEXT: li a0, 1 ; LP64E-WITHFP-NEXT: li a1, 11 ; LP64E-WITHFP-NEXT: li a4, 12 +; LP64E-WITHFP-NEXT: sd a3, 32(sp) +; LP64E-WITHFP-NEXT: sd a2, 40(sp) ; LP64E-WITHFP-NEXT: li a5, 13 +; LP64E-WITHFP-NEXT: ld a7, %lo(.LCPI11_0)(a7) +; LP64E-WITHFP-NEXT: addiw t1, t1, 761 +; LP64E-WITHFP-NEXT: ld a2, %lo(.LCPI11_1)(t2) +; LP64E-WITHFP-NEXT: ld a3, %lo(.LCPI11_2)(t3) +; LP64E-WITHFP-NEXT: slli t1, t1, 11 ; 
LP64E-WITHFP-NEXT: sd t1, 0(sp) ; LP64E-WITHFP-NEXT: sd t0, 8(sp) ; LP64E-WITHFP-NEXT: sd a7, 16(sp) diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll index cae59c79aaaa8..437b7e557718c 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -5,18 +5,18 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_4bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lb a0, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 16 ; RV64I-NEXT: srli a3, a0, 24 @@ -30,25 +30,25 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: lshr_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 1(a1) -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: lbu a3, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) ; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a0, a4, a0 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 @@ -69,18 +69,18 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_4bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lb a0, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: sllw a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 16 ; RV64I-NEXT: srli a3, a0, 24 @@ -94,25 +94,25 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: shl_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; 
RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 1(a1) -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: lbu a3, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) ; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a0, a4, a0 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 @@ -133,18 +133,18 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_4bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lb a0, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: sraw a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 16 ; RV64I-NEXT: srli a3, a0, 24 @@ -158,25 +158,25 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: ashr_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 1(a1) -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: lbu a3, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) ; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a0, a4, a0 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 @@ -198,95 +198,95 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; 
RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: lbu t2, 7(a1) +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a4 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a4, 4(a1) -; RV64I-NEXT: lbu a5, 5(a1) -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lbu a3, 6(a1) -; RV64I-NEXT: lbu a6, 7(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: slli a3, a3, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: lbu a5, 0(a1) -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: lbu a4, 2(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t1, 1(a1) +; RV64I-NEXT: or t0, t2, t0 +; RV64I-NEXT: lbu t2, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a4, t0, a6 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a3, a3, 35 -; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: slli a4, a4, 35 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a1, a4, a1 ; RV64I-NEXT: srl a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 48 ; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: srli a4, a0, 32 ; RV64I-NEXT: srli a5, a0, 40 +; RV64I-NEXT: srli a6, a0, 16 +; RV64I-NEXT: srli a7, a0, 24 +; RV64I-NEXT: srli t0, a0, 8 ; RV64I-NEXT: sb a4, 4(a2) ; RV64I-NEXT: sb a5, 5(a2) ; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: srli a3, a0, 24 -; RV64I-NEXT: srli a4, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: sb t0, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb a7, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_8bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: lbu a4, 4(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 7(a0) +; RV32I-NEXT: lbu a4, 6(a0) +; RV32I-NEXT: lbu a5, 7(a0) +; RV32I-NEXT: lbu a6, 4(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: lbu a6, 0(a1) +; 
RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: slli a5, a1, 3 -; RV32I-NEXT: addi a4, a5, -32 -; RV32I-NEXT: srl a1, a3, a5 -; RV32I-NEXT: bltz a4, .LBB3_2 +; RV32I-NEXT: or a5, a4, a3 +; RV32I-NEXT: or a4, a1, a6 +; RV32I-NEXT: slli a4, a4, 3 +; RV32I-NEXT: addi a3, a4, -32 +; RV32I-NEXT: srl a1, a5, a4 +; RV32I-NEXT: bltz a3, .LBB3_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: j .LBB3_3 @@ -297,32 +297,32 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a0, 3(a0) ; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli a5, a5, 1 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: not a7, a4 ; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: srl a0, a0, a5 -; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: not a5, a5 -; RV32I-NEXT: sll a3, a3, a5 -; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: srl a0, a0, a4 +; RV32I-NEXT: sll a4, a5, a7 +; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: .LBB3_3: -; RV32I-NEXT: srai a4, a4, 31 -; RV32I-NEXT: and a1, a4, a1 +; RV32I-NEXT: srai a3, a3, 31 +; RV32I-NEXT: srli a4, a0, 16 +; RV32I-NEXT: srli a5, a0, 24 +; RV32I-NEXT: and a1, a3, a1 ; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: srli a4, a1, 24 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: srli a6, a1, 24 +; RV32I-NEXT: srli a7, a1, 8 ; RV32I-NEXT: sb a1, 4(a2) -; RV32I-NEXT: sb a5, 5(a2) +; RV32I-NEXT: sb a7, 5(a2) ; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: sb a1, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb a5, 3(a2) ; RV32I-NEXT: ret %src = load i64, ptr %src.ptr, align 1 %byteOff = load i64, ptr %byteOff.ptr, align 1 @@ -334,95 +334,95 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: lbu t2, 7(a1) +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a4 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a4, 4(a1) -; RV64I-NEXT: lbu a5, 5(a1) -; RV64I-NEXT: 
or a0, a0, a3 -; RV64I-NEXT: lbu a3, 6(a1) -; RV64I-NEXT: lbu a6, 7(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: slli a3, a3, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: lbu a5, 0(a1) -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: lbu a4, 2(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t1, 1(a1) +; RV64I-NEXT: or t0, t2, t0 +; RV64I-NEXT: lbu t2, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a4, t0, a6 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a3, a3, 35 -; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: slli a4, a4, 35 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a1, a4, a1 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 48 ; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: srli a4, a0, 32 ; RV64I-NEXT: srli a5, a0, 40 +; RV64I-NEXT: srli a6, a0, 16 +; RV64I-NEXT: srli a7, a0, 24 +; RV64I-NEXT: srli t0, a0, 8 ; RV64I-NEXT: sb a4, 4(a2) ; RV64I-NEXT: sb a5, 5(a2) ; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: srli a3, a0, 24 -; RV64I-NEXT: srli a4, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: sb t0, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb a7, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_8bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a6, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: slli a5, a1, 3 -; RV32I-NEXT: addi a4, a5, -32 -; RV32I-NEXT: sll a1, a3, a5 -; RV32I-NEXT: bltz a4, .LBB4_2 +; RV32I-NEXT: or a5, a4, a3 +; RV32I-NEXT: or a4, a1, a6 +; RV32I-NEXT: slli a4, a4, 3 +; RV32I-NEXT: addi a3, a4, -32 +; RV32I-NEXT: sll a1, a5, a4 +; RV32I-NEXT: bltz a3, .LBB4_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: j .LBB4_3 @@ -433,32 +433,32 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a0, 7(a0) ; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: srli a5, a5, 1 ; RV32I-NEXT: slli t0, t0, 16 ; 
RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: not a7, a4 ; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: sll a0, a0, a5 -; RV32I-NEXT: srli a3, a3, 1 -; RV32I-NEXT: not a5, a5 -; RV32I-NEXT: srl a3, a3, a5 -; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: sll a0, a0, a4 +; RV32I-NEXT: srl a4, a5, a7 +; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: .LBB4_3: -; RV32I-NEXT: srai a4, a4, 31 -; RV32I-NEXT: and a1, a4, a1 +; RV32I-NEXT: srai a3, a3, 31 +; RV32I-NEXT: srli a4, a0, 16 +; RV32I-NEXT: srli a5, a0, 24 +; RV32I-NEXT: and a1, a3, a1 ; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: srli a4, a1, 24 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: srli a6, a1, 24 +; RV32I-NEXT: srli a7, a1, 8 ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: sb a7, 1(a2) ; RV32I-NEXT: sb a3, 2(a2) -; RV32I-NEXT: sb a4, 3(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb a6, 3(a2) +; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a1, 6(a2) -; RV32I-NEXT: sb a3, 7(a2) +; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: sb a4, 6(a2) +; RV32I-NEXT: sb a5, 7(a2) ; RV32I-NEXT: ret %src = load i64, ptr %src.ptr, align 1 %byteOff = load i64, ptr %byteOff.ptr, align 1 @@ -470,67 +470,67 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: lbu t2, 7(a1) +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a4 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a4, 4(a1) -; RV64I-NEXT: lbu a5, 5(a1) -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lbu a3, 6(a1) -; RV64I-NEXT: lbu a6, 7(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: slli a3, a3, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: lbu a5, 0(a1) -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: lbu a4, 2(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t1, 1(a1) +; RV64I-NEXT: or t0, t2, t0 +; RV64I-NEXT: lbu t2, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: 
or a1, a1, t2 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a4, t0, a6 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a3, a3, 35 -; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: slli a4, a4, 35 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a1, a4, a1 ; RV64I-NEXT: sra a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 48 ; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: srli a4, a0, 32 ; RV64I-NEXT: srli a5, a0, 40 +; RV64I-NEXT: srli a6, a0, 16 +; RV64I-NEXT: srli a7, a0, 24 +; RV64I-NEXT: srli t0, a0, 8 ; RV64I-NEXT: sb a4, 4(a2) ; RV64I-NEXT: sb a5, 5(a2) ; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: srli a3, a0, 24 -; RV64I-NEXT: srli a4, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: sb t0, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb a7, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_8bytes: @@ -540,61 +540,61 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a5, 6(a0) ; RV32I-NEXT: lbu a6, 7(a0) ; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: lbu a7, 0(a1) +; RV32I-NEXT: lbu t0, 1(a1) ; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a4, a6, 24 -; RV32I-NEXT: or a5, a4, a5 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a5, a1, a6 -; RV32I-NEXT: slli a5, a5, 3 -; RV32I-NEXT: addi a6, a5, -32 -; RV32I-NEXT: sra a1, a3, a5 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: slli a4, a5, 16 +; RV32I-NEXT: slli a5, a6, 24 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: or a4, a4, a3 +; RV32I-NEXT: or a3, a1, a7 +; RV32I-NEXT: slli a3, a3, 3 +; RV32I-NEXT: addi a6, a3, -32 +; RV32I-NEXT: sra a1, a4, a3 ; RV32I-NEXT: bltz a6, .LBB5_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srai a4, a4, 31 +; RV32I-NEXT: srai a5, a5, 31 ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: j .LBB5_3 ; RV32I-NEXT: .LBB5_2: -; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 1(a0) ; RV32I-NEXT: lbu a6, 0(a0) ; RV32I-NEXT: lbu a7, 2(a0) ; RV32I-NEXT: lbu a0, 3(a0) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a6 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a4, a4, 1 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a7 -; RV32I-NEXT: or a0, a0, a4 -; RV32I-NEXT: srl a0, a0, a5 -; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: not a4, a5 -; RV32I-NEXT: sll a3, a3, a4 +; RV32I-NEXT: not a6, a3 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: srl a0, a0, a3 +; RV32I-NEXT: sll a3, a4, a6 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: .LBB5_3: ; RV32I-NEXT: srli a3, a1, 16 ; RV32I-NEXT: srli a4, a1, 24 ; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: srli a6, a0, 16 +; RV32I-NEXT: srli a7, a0, 24 ; RV32I-NEXT: sb a1, 4(a2) ; RV32I-NEXT: sb a5, 5(a2) ; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: sb a4, 1(a2) -; 
RV32I-NEXT: sb a1, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: sb a6, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: ret %src = load i64, ptr %src.ptr, align 1 %byteOff = load i64, ptr %byteOff.ptr, align 1 @@ -607,232 +607,232 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 9(a0) -; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) ; RV64I-NEXT: lbu a5, 10(a0) ; RV64I-NEXT: lbu a6, 11(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 14(a0) -; RV64I-NEXT: lbu a7, 15(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 5(a1) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a1) -; RV64I-NEXT: lbu a7, 7(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 0(a1) -; RV64I-NEXT: lbu a7, 1(a1) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: lbu t3, 7(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t2, 1(a1) +; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: lbu t3, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: or a5, t0, a6 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a4, a4, 35 -; RV64I-NEXT: or a5, a4, a1 -; RV64I-NEXT: addi a4, a5, -64 -; RV64I-NEXT: srl a1, a3, a5 -; RV64I-NEXT: bltz a4, .LBB6_2 +; RV64I-NEXT: slli a6, a5, 35 +; RV64I-NEXT: or a5, a4, a3 +; RV64I-NEXT: or a4, a6, a1 +; RV64I-NEXT: addi a3, a4, -64 +; RV64I-NEXT: srl a1, a5, a4 +; RV64I-NEXT: bltz a3, .LBB6_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB6_3 ; RV64I-NEXT: .LBB6_2: ; RV64I-NEXT: lbu a6, 1(a0) -; RV64I-NEXT: lbu a7, 0(a0) -; RV64I-NEXT: lbu t0, 2(a0) -; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: lbu t1, 0(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a7, t1, t0 -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu t1, 5(a0) -; 
RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: lbu t1, 4(a0) +; RV64I-NEXT: lbu t2, 5(a0) +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: not a7, a4 +; RV64I-NEXT: slli a5, a5, 1 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: srl a0, a0, a5 -; RV64I-NEXT: not a5, a5 -; RV64I-NEXT: slli a3, a3, 1 -; RV64I-NEXT: sll a3, a3, a5 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srl a0, a0, a4 +; RV64I-NEXT: sll a4, a5, a7 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB6_3: -; RV64I-NEXT: srai a4, a4, 63 -; RV64I-NEXT: and a1, a4, a1 +; RV64I-NEXT: srai a3, a3, 63 +; RV64I-NEXT: srli a4, a0, 56 +; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a6, a0, 40 +; RV64I-NEXT: srli a7, a0, 32 +; RV64I-NEXT: srli t0, a0, 24 +; RV64I-NEXT: srli t1, a0, 16 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: sb a7, 4(a2) +; RV64I-NEXT: sb a6, 5(a2) +; RV64I-NEXT: sb a5, 6(a2) +; RV64I-NEXT: sb a4, 7(a2) ; RV64I-NEXT: srli a3, a1, 56 ; RV64I-NEXT: srli a4, a1, 48 ; RV64I-NEXT: srli a5, a1, 40 ; RV64I-NEXT: srli a6, a1, 32 +; RV64I-NEXT: srli a7, a1, 24 +; RV64I-NEXT: srli t2, a1, 16 ; RV64I-NEXT: sb a6, 12(a2) ; RV64I-NEXT: sb a5, 13(a2) ; RV64I-NEXT: sb a4, 14(a2) ; RV64I-NEXT: sb a3, 15(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: srli a3, a1, 8 ; RV64I-NEXT: sb a1, 8(a2) -; RV64I-NEXT: sb a5, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) -; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 4(a2) -; RV64I-NEXT: sb a4, 5(a2) -; RV64I-NEXT: sb a3, 6(a2) -; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb a3, 9(a2) +; RV64I-NEXT: sb t2, 10(a2) +; RV64I-NEXT: sb a7, 11(a2) +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a3, 2(a2) -; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: sb a1, 1(a2) +; RV64I-NEXT: sb t1, 2(a2) +; RV64I-NEXT: sb t0, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_16bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: lbu a5, 8(a0) +; RV32I-NEXT: lbu a6, 9(a0) +; RV32I-NEXT: lbu t3, 10(a0) +; RV32I-NEXT: lbu t4, 11(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: slli a6, a6, 8 +; 
RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) +; RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu t1, 13(a0) +; RV32I-NEXT: lbu t2, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu t0, 1(a1) -; RV32I-NEXT: or a0, a0, a7 -; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or a6, t1, a6 +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu t1, 0(a1) +; RV32I-NEXT: lbu t2, 1(a1) +; RV32I-NEXT: lbu t4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a7 -; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: sw zero, 16(sp) ; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: mv t2, sp +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, t0, a7 +; RV32I-NEXT: or a5, t3, a5 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: sw a3, 0(sp) ; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a0, 12(sp) -; RV32I-NEXT: andi a0, a1, 12 -; RV32I-NEXT: mv a3, sp -; RV32I-NEXT: add a0, a3, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: lw a5, 8(a0) -; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: srl a6, a4, a1 -; RV32I-NEXT: andi a7, a1, 24 -; RV32I-NEXT: xori a7, a7, 31 -; RV32I-NEXT: slli t0, a5, 1 -; RV32I-NEXT: sll t0, t0, a7 -; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: srl a3, a3, a1 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: sll a4, a4, a7 -; RV32I-NEXT: or a4, a3, a4 -; RV32I-NEXT: srl a5, a5, a1 -; RV32I-NEXT: slli t1, a0, 1 -; RV32I-NEXT: sll a7, t1, a7 -; RV32I-NEXT: or a7, a5, a7 -; RV32I-NEXT: srl a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: slli a0, a1, 3 +; RV32I-NEXT: andi a1, a1, 12 +; RV32I-NEXT: add a1, t2, a1 +; RV32I-NEXT: andi a3, a0, 24 +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: srl a7, a5, a0 +; RV32I-NEXT: slli t0, a6, 1 +; RV32I-NEXT: srl a4, a4, a0 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl a6, a6, a0 +; RV32I-NEXT: slli t1, a1, 1 +; RV32I-NEXT: srl a0, a1, a0 +; RV32I-NEXT: sll a1, t0, a3 +; RV32I-NEXT: sll a5, a5, a3 +; RV32I-NEXT: sll a3, t1, a3 +; RV32I-NEXT: srli t0, a0, 16 ; RV32I-NEXT: srli t1, a0, 24 ; RV32I-NEXT: srli t2, a0, 8 +; RV32I-NEXT: or a1, a7, a1 +; RV32I-NEXT: or a5, a4, a5 +; RV32I-NEXT: or a3, a6, a3 ; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb 
a1, 14(a2) +; RV32I-NEXT: sb t0, 14(a2) ; RV32I-NEXT: sb t1, 15(a2) -; RV32I-NEXT: srli a0, a7, 16 -; RV32I-NEXT: srli a1, a7, 24 -; RV32I-NEXT: srli a7, a7, 8 -; RV32I-NEXT: sb a5, 8(a2) -; RV32I-NEXT: sb a7, 9(a2) +; RV32I-NEXT: srli a0, a3, 16 +; RV32I-NEXT: srli t0, a3, 24 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: srli t1, a5, 16 +; RV32I-NEXT: srli t2, a5, 24 +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: srli t3, a1, 16 +; RV32I-NEXT: srli t4, a1, 24 +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a6, 8(a2) +; RV32I-NEXT: sb a3, 9(a2) ; RV32I-NEXT: sb a0, 10(a2) -; RV32I-NEXT: sb a1, 11(a2) -; RV32I-NEXT: srli a0, a4, 16 -; RV32I-NEXT: srli a1, a4, 24 -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a3, 0(a2) -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: sb a0, 2(a2) -; RV32I-NEXT: sb a1, 3(a2) -; RV32I-NEXT: srli a0, t0, 16 -; RV32I-NEXT: srli a1, t0, 24 -; RV32I-NEXT: srli a3, t0, 8 -; RV32I-NEXT: sb a6, 4(a2) -; RV32I-NEXT: sb a3, 5(a2) -; RV32I-NEXT: sb a0, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: sb t0, 11(a2) +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb t2, 3(a2) +; RV32I-NEXT: sb a7, 4(a2) +; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: sb t3, 6(a2) +; RV32I-NEXT: sb t4, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -846,208 +846,208 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_16bytes_wordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 9(a0) -; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) ; RV64I-NEXT: lbu a5, 10(a0) ; RV64I-NEXT: lbu a6, 11(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 14(a0) -; RV64I-NEXT: lbu a7, 15(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 5(a1) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a1) -; RV64I-NEXT: lbu a7, 7(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 0(a1) -; RV64I-NEXT: lbu a7, 1(a1) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: lbu t3, 7(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t2, 1(a1) +; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: lbu t3, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t3, t3, 16 ; 
RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: or a5, t0, a6 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 5 -; RV64I-NEXT: slli a4, a4, 37 -; RV64I-NEXT: or a5, a4, a1 -; RV64I-NEXT: addi a4, a5, -64 -; RV64I-NEXT: srl a1, a3, a5 -; RV64I-NEXT: bltz a4, .LBB7_2 +; RV64I-NEXT: slli a6, a5, 37 +; RV64I-NEXT: or a5, a4, a3 +; RV64I-NEXT: or a4, a6, a1 +; RV64I-NEXT: addi a3, a4, -64 +; RV64I-NEXT: srl a1, a5, a4 +; RV64I-NEXT: bltz a3, .LBB7_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB7_3 ; RV64I-NEXT: .LBB7_2: ; RV64I-NEXT: lbu a6, 1(a0) -; RV64I-NEXT: lbu a7, 0(a0) -; RV64I-NEXT: lbu t0, 2(a0) -; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: lbu t1, 0(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a7, t1, t0 -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu t1, 5(a0) -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: lbu t1, 4(a0) +; RV64I-NEXT: lbu t2, 5(a0) +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: not a7, a4 +; RV64I-NEXT: slli a5, a5, 1 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: srl a0, a0, a5 -; RV64I-NEXT: not a5, a5 -; RV64I-NEXT: slli a3, a3, 1 -; RV64I-NEXT: sll a3, a3, a5 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srl a0, a0, a4 +; RV64I-NEXT: sll a4, a5, a7 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB7_3: -; RV64I-NEXT: srai a4, a4, 63 -; RV64I-NEXT: and a1, a4, a1 +; RV64I-NEXT: srai a3, a3, 63 +; RV64I-NEXT: srli a4, a0, 56 +; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a6, a0, 40 +; RV64I-NEXT: srli a7, a0, 32 +; RV64I-NEXT: srli t0, a0, 24 +; RV64I-NEXT: srli t1, a0, 16 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: sb a7, 4(a2) +; RV64I-NEXT: sb a6, 5(a2) +; RV64I-NEXT: sb a5, 6(a2) +; RV64I-NEXT: sb a4, 7(a2) ; RV64I-NEXT: srli a3, a1, 56 ; RV64I-NEXT: srli a4, a1, 48 ; RV64I-NEXT: srli a5, a1, 40 ; RV64I-NEXT: srli a6, a1, 32 +; RV64I-NEXT: srli a7, a1, 24 +; RV64I-NEXT: srli t2, a1, 16 ; RV64I-NEXT: sb a6, 12(a2) ; RV64I-NEXT: sb a5, 13(a2) ; RV64I-NEXT: sb a4, 14(a2) ; RV64I-NEXT: sb a3, 15(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: srli a3, a1, 8 ; RV64I-NEXT: sb a1, 8(a2) -; RV64I-NEXT: sb a5, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) -; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 4(a2) -; RV64I-NEXT: sb a4, 5(a2) -; RV64I-NEXT: sb a3, 6(a2) -; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb a3, 9(a2) +; RV64I-NEXT: sb t2, 10(a2) +; RV64I-NEXT: sb a7, 11(a2) +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: 
sb a4, 1(a2) -; RV64I-NEXT: sb a3, 2(a2) -; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: sb a1, 1(a2) +; RV64I-NEXT: sb t1, 2(a2) +; RV64I-NEXT: sb t0, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_16bytes_wordOff: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) +; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu a7, 13(a0) +; RV32I-NEXT: lbu t0, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: or a0, a0, a7 ; RV32I-NEXT: lbu a1, 0(a1) ; RV32I-NEXT: sw zero, 16(sp) ; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: mv t2, sp +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or t4, t6, t5 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: andi a1, a1, 12 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, t1, a5 +; RV32I-NEXT: or a5, t4, t3 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: add a1, t2, a1 ; RV32I-NEXT: sw a3, 0(sp) ; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a0, 12(sp) -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: andi a1, a1, 12 -; RV32I-NEXT: mv a0, sp -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lw a1, 8(a0) -; RV32I-NEXT: lw a3, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: srli a5, a1, 16 -; RV32I-NEXT: srli a6, a1, 24 -; RV32I-NEXT: srli a7, a1, 8 -; RV32I-NEXT: sb a1, 8(a2) +; RV32I-NEXT: lw a0, 8(a1) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: srli a6, a0, 24 +; RV32I-NEXT: srli a7, a0, 8 +; RV32I-NEXT: srli t0, a1, 16 +; RV32I-NEXT: srli t1, a1, 24 +; RV32I-NEXT: srli t2, a1, 8 +; RV32I-NEXT: srli t3, a4, 16 +; RV32I-NEXT: srli t4, a4, 24 +; RV32I-NEXT: srli t5, a4, 
8 +; RV32I-NEXT: srli t6, a3, 16 +; RV32I-NEXT: sb a0, 8(a2) ; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb a5, 10(a2) ; RV32I-NEXT: sb a6, 11(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a5, a0, 24 -; RV32I-NEXT: srli a6, a0, 8 -; RV32I-NEXT: sb a0, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a1, 14(a2) -; RV32I-NEXT: sb a5, 15(a2) -; RV32I-NEXT: srli a0, a4, 16 -; RV32I-NEXT: srli a1, a4, 24 -; RV32I-NEXT: srli a5, a4, 8 +; RV32I-NEXT: srli a0, a3, 24 +; RV32I-NEXT: sb a1, 12(a2) +; RV32I-NEXT: sb t2, 13(a2) +; RV32I-NEXT: sb t0, 14(a2) +; RV32I-NEXT: sb t1, 15(a2) +; RV32I-NEXT: srli a1, a3, 8 ; RV32I-NEXT: sb a4, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a0, 2(a2) -; RV32I-NEXT: sb a1, 3(a2) -; RV32I-NEXT: srli a0, a3, 16 -; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: srli a4, a3, 8 +; RV32I-NEXT: sb t5, 1(a2) +; RV32I-NEXT: sb t3, 2(a2) +; RV32I-NEXT: sb t4, 3(a2) ; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a0, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: sb t6, 6(a2) +; RV32I-NEXT: sb a0, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -1061,232 +1061,232 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 5(a1) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a1) -; RV64I-NEXT: lbu a7, 7(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 0(a1) -; RV64I-NEXT: lbu a7, 1(a1) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: lbu t3, 7(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t2, 1(a1) +; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: lbu t3, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: or 
a5, t0, a6 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a4, a4, 35 -; RV64I-NEXT: or a5, a4, a1 -; RV64I-NEXT: addi a4, a5, -64 -; RV64I-NEXT: sll a1, a3, a5 -; RV64I-NEXT: bltz a4, .LBB8_2 +; RV64I-NEXT: slli a6, a5, 35 +; RV64I-NEXT: or a5, a4, a3 +; RV64I-NEXT: or a4, a6, a1 +; RV64I-NEXT: addi a3, a4, -64 +; RV64I-NEXT: sll a1, a5, a4 +; RV64I-NEXT: bltz a3, .LBB8_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB8_3 ; RV64I-NEXT: .LBB8_2: ; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: lbu a7, 8(a0) -; RV64I-NEXT: lbu t0, 10(a0) -; RV64I-NEXT: lbu t1, 11(a0) +; RV64I-NEXT: lbu a7, 10(a0) +; RV64I-NEXT: lbu t0, 11(a0) +; RV64I-NEXT: lbu t1, 8(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a7, t1, t0 -; RV64I-NEXT: lbu t0, 12(a0) -; RV64I-NEXT: lbu t1, 13(a0) -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 14(a0) -; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: lbu t1, 12(a0) +; RV64I-NEXT: lbu t2, 13(a0) +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: not a7, a4 +; RV64I-NEXT: srli a5, a5, 1 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: sll a0, a0, a5 -; RV64I-NEXT: not a5, a5 -; RV64I-NEXT: srli a3, a3, 1 -; RV64I-NEXT: srl a3, a3, a5 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: sll a0, a0, a4 +; RV64I-NEXT: srl a4, a5, a7 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB8_3: -; RV64I-NEXT: srai a4, a4, 63 -; RV64I-NEXT: and a1, a4, a1 +; RV64I-NEXT: srai a3, a3, 63 +; RV64I-NEXT: srli a4, a0, 56 +; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a6, a0, 40 +; RV64I-NEXT: srli a7, a0, 32 +; RV64I-NEXT: srli t0, a0, 24 +; RV64I-NEXT: srli t1, a0, 16 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a6, 13(a2) +; RV64I-NEXT: sb a5, 14(a2) +; RV64I-NEXT: sb a4, 15(a2) ; RV64I-NEXT: srli a3, a1, 56 ; RV64I-NEXT: srli a4, a1, 48 ; RV64I-NEXT: srli a5, a1, 40 ; RV64I-NEXT: srli a6, a1, 32 +; RV64I-NEXT: srli a7, a1, 24 +; RV64I-NEXT: srli t2, a1, 16 ; RV64I-NEXT: sb a6, 4(a2) ; RV64I-NEXT: sb a5, 5(a2) ; RV64I-NEXT: sb a4, 6(a2) ; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: srli a3, a1, 8 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: sb a4, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb a3, 1(a2) +; RV64I-NEXT: sb t2, 2(a2) +; RV64I-NEXT: sb a7, 3(a2) +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb t1, 10(a2) +; RV64I-NEXT: sb t0, 11(a2) ; RV64I-NEXT: ret ; ; 
RV32I-LABEL: shl_16bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: lbu a5, 8(a0) +; RV32I-NEXT: lbu a6, 9(a0) +; RV32I-NEXT: lbu t3, 10(a0) +; RV32I-NEXT: lbu t4, 11(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) +; RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu t1, 13(a0) +; RV32I-NEXT: lbu t2, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu t0, 1(a1) -; RV32I-NEXT: or a0, a0, a7 -; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or a6, t1, a6 +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu t1, 0(a1) +; RV32I-NEXT: lbu t2, 1(a1) +; RV32I-NEXT: lbu t4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a7 -; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: addi t2, sp, 16 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, t0, a7 +; RV32I-NEXT: or a5, t3, a5 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: sw a3, 16(sp) ; RV32I-NEXT: sw a4, 20(sp) ; RV32I-NEXT: sw a5, 24(sp) ; RV32I-NEXT: sw a0, 28(sp) -; RV32I-NEXT: andi a0, a1, 12 -; RV32I-NEXT: addi a3, sp, 16 -; RV32I-NEXT: sub a3, a3, a0 -; RV32I-NEXT: lw a0, 0(a3) -; RV32I-NEXT: lw a4, 4(a3) -; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: lw a5, 8(a3) -; RV32I-NEXT: lw a3, 12(a3) -; RV32I-NEXT: sll a6, a4, a1 -; RV32I-NEXT: andi a7, a1, 24 -; RV32I-NEXT: xori a7, a7, 31 -; RV32I-NEXT: srli t0, a0, 1 -; RV32I-NEXT: srl t0, t0, a7 -; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: sll a3, a3, a1 -; RV32I-NEXT: srli t1, a5, 1 -; RV32I-NEXT: srl t1, t1, a7 -; RV32I-NEXT: or t1, a3, t1 -; RV32I-NEXT: sll a5, a5, a1 
-; RV32I-NEXT: srli a4, a4, 1 -; RV32I-NEXT: srl a4, a4, a7 -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: sll a0, a0, a1 -; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: srli a1, a4, 16 -; RV32I-NEXT: srli a7, a4, 8 -; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a7, 9(a2) -; RV32I-NEXT: sb a1, 10(a2) -; RV32I-NEXT: sb a5, 11(a2) -; RV32I-NEXT: srli a3, a3, 24 -; RV32I-NEXT: srli a1, t1, 16 -; RV32I-NEXT: srli a4, t1, 8 -; RV32I-NEXT: sb t1, 12(a2) -; RV32I-NEXT: sb a4, 13(a2) -; RV32I-NEXT: sb a1, 14(a2) -; RV32I-NEXT: sb a3, 15(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: slli a0, a1, 3 +; RV32I-NEXT: andi a1, a1, 12 +; RV32I-NEXT: sub a1, t2, a1 +; RV32I-NEXT: andi a3, a0, 24 +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: sll a7, a5, a0 +; RV32I-NEXT: srli t0, a4, 1 +; RV32I-NEXT: sll a1, a1, a0 +; RV32I-NEXT: srli t1, a6, 1 +; RV32I-NEXT: sll a6, a6, a0 +; RV32I-NEXT: srli a5, a5, 1 +; RV32I-NEXT: sll a0, a4, a0 +; RV32I-NEXT: srl a4, t0, a3 +; RV32I-NEXT: srl t0, t1, a3 +; RV32I-NEXT: srl a3, a5, a3 +; RV32I-NEXT: srli a5, a6, 24 +; RV32I-NEXT: srli t1, a1, 24 +; RV32I-NEXT: srli t2, a0, 16 +; RV32I-NEXT: srli t3, a0, 24 +; RV32I-NEXT: srli t4, a0, 8 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: srli a7, a7, 24 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: or a3, a6, a3 ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: sb a1, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a0, a6, 24 -; RV32I-NEXT: srli a1, t0, 16 -; RV32I-NEXT: srli a3, t0, 8 -; RV32I-NEXT: sb t0, 4(a2) -; RV32I-NEXT: sb a3, 5(a2) -; RV32I-NEXT: sb a1, 6(a2) -; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb t2, 2(a2) +; RV32I-NEXT: sb t3, 3(a2) +; RV32I-NEXT: srli a0, a3, 16 +; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli t0, a1, 16 +; RV32I-NEXT: srli t2, a1, 8 +; RV32I-NEXT: srli t3, a4, 16 +; RV32I-NEXT: srli t4, a4, 8 +; RV32I-NEXT: sb a3, 8(a2) +; RV32I-NEXT: sb a6, 9(a2) +; RV32I-NEXT: sb a0, 10(a2) +; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: sb a1, 12(a2) +; RV32I-NEXT: sb t2, 13(a2) +; RV32I-NEXT: sb t0, 14(a2) +; RV32I-NEXT: sb t1, 15(a2) +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: sb t4, 5(a2) +; RV32I-NEXT: sb t3, 6(a2) +; RV32I-NEXT: sb a7, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -1300,208 +1300,208 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_16bytes_wordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: 
lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 5(a1) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a1) -; RV64I-NEXT: lbu a7, 7(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 0(a1) -; RV64I-NEXT: lbu a7, 1(a1) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: lbu t3, 7(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t2, 1(a1) +; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: lbu t3, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: or a5, t0, a6 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 5 -; RV64I-NEXT: slli a4, a4, 37 -; RV64I-NEXT: or a5, a4, a1 -; RV64I-NEXT: addi a4, a5, -64 -; RV64I-NEXT: sll a1, a3, a5 -; RV64I-NEXT: bltz a4, .LBB9_2 +; RV64I-NEXT: slli a6, a5, 37 +; RV64I-NEXT: or a5, a4, a3 +; RV64I-NEXT: or a4, a6, a1 +; RV64I-NEXT: addi a3, a4, -64 +; RV64I-NEXT: sll a1, a5, a4 +; RV64I-NEXT: bltz a3, .LBB9_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB9_3 ; RV64I-NEXT: .LBB9_2: ; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: lbu a7, 8(a0) -; RV64I-NEXT: lbu t0, 10(a0) -; RV64I-NEXT: lbu t1, 11(a0) +; RV64I-NEXT: lbu a7, 10(a0) +; RV64I-NEXT: lbu t0, 11(a0) +; RV64I-NEXT: lbu t1, 8(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a7, t1, t0 -; RV64I-NEXT: lbu t0, 12(a0) -; RV64I-NEXT: lbu t1, 13(a0) -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 14(a0) -; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: lbu t1, 12(a0) +; RV64I-NEXT: lbu t2, 13(a0) +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: not a7, a4 +; RV64I-NEXT: srli a5, a5, 1 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: sll a0, a0, a5 -; RV64I-NEXT: not a5, a5 -; RV64I-NEXT: srli a3, a3, 1 -; RV64I-NEXT: srl a3, a3, a5 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: sll a0, a0, a4 +; RV64I-NEXT: srl a4, a5, a7 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB9_3: -; RV64I-NEXT: srai a4, a4, 63 -; RV64I-NEXT: and a1, a4, a1 +; RV64I-NEXT: srai a3, a3, 63 +; RV64I-NEXT: srli a4, a0, 56 +; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a6, a0, 40 +; RV64I-NEXT: srli a7, a0, 32 +; RV64I-NEXT: srli t0, a0, 24 +; RV64I-NEXT: srli t1, a0, 16 +; RV64I-NEXT: and a1, a3, a1 +; 
RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a6, 13(a2) +; RV64I-NEXT: sb a5, 14(a2) +; RV64I-NEXT: sb a4, 15(a2) ; RV64I-NEXT: srli a3, a1, 56 ; RV64I-NEXT: srli a4, a1, 48 ; RV64I-NEXT: srli a5, a1, 40 ; RV64I-NEXT: srli a6, a1, 32 +; RV64I-NEXT: srli a7, a1, 24 +; RV64I-NEXT: srli t2, a1, 16 ; RV64I-NEXT: sb a6, 4(a2) ; RV64I-NEXT: sb a5, 5(a2) ; RV64I-NEXT: sb a4, 6(a2) ; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: srli a3, a1, 8 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: sb a4, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb a3, 1(a2) +; RV64I-NEXT: sb t2, 2(a2) +; RV64I-NEXT: sb a7, 3(a2) +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb t1, 10(a2) +; RV64I-NEXT: sb t0, 11(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_16bytes_wordOff: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) +; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu a7, 13(a0) +; RV32I-NEXT: lbu t0, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: or a0, a0, a7 ; RV32I-NEXT: lbu a1, 0(a1) ; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw a3, 16(sp) -; RV32I-NEXT: sw a4, 20(sp) -; RV32I-NEXT: sw a5, 24(sp) -; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: addi t2, sp, 16 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; 
RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or t4, t6, t5 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a0, a0, t0 ; RV32I-NEXT: andi a1, a1, 12 -; RV32I-NEXT: addi a0, sp, 16 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lw a1, 8(a0) -; RV32I-NEXT: lw a3, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: srli a5, a1, 16 -; RV32I-NEXT: srli a6, a1, 24 -; RV32I-NEXT: srli a7, a1, 8 -; RV32I-NEXT: sb a1, 8(a2) +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, t1, a5 +; RV32I-NEXT: or a5, t4, t3 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: sub a1, t2, a1 +; RV32I-NEXT: sw a3, 16(sp) +; RV32I-NEXT: sw a4, 20(sp) +; RV32I-NEXT: sw a5, 24(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: lw a0, 8(a1) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: srli a6, a0, 24 +; RV32I-NEXT: srli a7, a0, 8 +; RV32I-NEXT: srli t0, a1, 16 +; RV32I-NEXT: srli t1, a1, 24 +; RV32I-NEXT: srli t2, a1, 8 +; RV32I-NEXT: srli t3, a4, 16 +; RV32I-NEXT: srli t4, a4, 24 +; RV32I-NEXT: srli t5, a4, 8 +; RV32I-NEXT: srli t6, a3, 16 +; RV32I-NEXT: sb a0, 8(a2) ; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb a5, 10(a2) ; RV32I-NEXT: sb a6, 11(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a5, a0, 24 -; RV32I-NEXT: srli a6, a0, 8 -; RV32I-NEXT: sb a0, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a1, 14(a2) -; RV32I-NEXT: sb a5, 15(a2) -; RV32I-NEXT: srli a0, a4, 16 -; RV32I-NEXT: srli a1, a4, 24 -; RV32I-NEXT: srli a5, a4, 8 +; RV32I-NEXT: srli a0, a3, 24 +; RV32I-NEXT: sb a1, 12(a2) +; RV32I-NEXT: sb t2, 13(a2) +; RV32I-NEXT: sb t0, 14(a2) +; RV32I-NEXT: sb t1, 15(a2) +; RV32I-NEXT: srli a1, a3, 8 ; RV32I-NEXT: sb a4, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a0, 2(a2) -; RV32I-NEXT: sb a1, 3(a2) -; RV32I-NEXT: srli a0, a3, 16 -; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: srli a4, a3, 8 +; RV32I-NEXT: sb t5, 1(a2) +; RV32I-NEXT: sb t3, 2(a2) +; RV32I-NEXT: sb t4, 3(a2) ; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a0, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: sb t6, 6(a2) +; RV32I-NEXT: sb a0, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -1516,233 +1516,233 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 9(a0) -; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) ; RV64I-NEXT: lbu a5, 10(a0) ; RV64I-NEXT: lbu a6, 11(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 14(a0) -; RV64I-NEXT: lbu a7, 15(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 ; 
RV64I-NEXT: lbu a6, 4(a1) ; RV64I-NEXT: lbu a7, 5(a1) -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: lbu a5, 6(a1) -; RV64I-NEXT: lbu t0, 7(a1) +; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: lbu t3, 7(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 ; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t0, 1(a1) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 2(a1) +; RV64I-NEXT: lbu t2, 1(a1) +; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a5, t1, a5 +; RV64I-NEXT: or a4, t0, a6 ; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a6, a5, 32 ; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a5, a5, 35 -; RV64I-NEXT: or a5, a5, a1 -; RV64I-NEXT: addi a6, a5, -64 -; RV64I-NEXT: sra a1, a3, a5 +; RV64I-NEXT: slli a7, a4, 35 +; RV64I-NEXT: or a4, a6, a3 +; RV64I-NEXT: or a3, a7, a1 +; RV64I-NEXT: addi a6, a3, -64 +; RV64I-NEXT: sra a1, a4, a3 ; RV64I-NEXT: bltz a6, .LBB10_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sraiw a3, a4, 31 +; RV64I-NEXT: sraiw a3, a5, 31 ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: mv a1, a3 ; RV64I-NEXT: j .LBB10_3 ; RV64I-NEXT: .LBB10_2: -; RV64I-NEXT: lbu a4, 1(a0) -; RV64I-NEXT: lbu a6, 0(a0) -; RV64I-NEXT: lbu a7, 2(a0) -; RV64I-NEXT: lbu t0, 3(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a6 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, t0, a7 -; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: lbu t0, 5(a0) -; RV64I-NEXT: or a4, a6, a4 -; RV64I-NEXT: lbu a6, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu a5, 1(a0) +; RV64I-NEXT: lbu a6, 2(a0) +; RV64I-NEXT: lbu a7, 3(a0) +; RV64I-NEXT: lbu t0, 0(a0) +; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a5, t0 +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu t1, 5(a0) +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: not a6, a3 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a0, a0, a4 -; RV64I-NEXT: srl a0, a0, a5 -; RV64I-NEXT: not a4, a5 -; RV64I-NEXT: slli a3, a3, 1 -; RV64I-NEXT: sll a3, a3, a4 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: srl a0, a0, a3 +; RV64I-NEXT: sll a3, a4, a6 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: .LBB10_3: ; RV64I-NEXT: srli a3, a1, 56 ; RV64I-NEXT: srli a4, a1, 48 ; RV64I-NEXT: srli a5, a1, 40 ; RV64I-NEXT: srli a6, a1, 32 +; RV64I-NEXT: srli a7, a1, 24 +; RV64I-NEXT: srli t0, a1, 16 +; RV64I-NEXT: srli t1, a1, 8 +; RV64I-NEXT: srli t2, a0, 56 +; RV64I-NEXT: srli t3, a0, 48 +; RV64I-NEXT: srli t4, a0, 40 +; RV64I-NEXT: srli t5, a0, 32 ; RV64I-NEXT: sb a6, 12(a2) ; RV64I-NEXT: sb a5, 13(a2) ; RV64I-NEXT: sb a4, 14(a2) ; RV64I-NEXT: 
sb a3, 15(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: srli a3, a0, 24 ; RV64I-NEXT: sb a1, 8(a2) -; RV64I-NEXT: sb a5, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) -; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 4(a2) -; RV64I-NEXT: sb a4, 5(a2) -; RV64I-NEXT: sb a3, 6(a2) -; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 +; RV64I-NEXT: sb t1, 9(a2) +; RV64I-NEXT: sb t0, 10(a2) +; RV64I-NEXT: sb a7, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb t5, 4(a2) +; RV64I-NEXT: sb t4, 5(a2) +; RV64I-NEXT: sb t3, 6(a2) +; RV64I-NEXT: sb t2, 7(a2) ; RV64I-NEXT: srli a4, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a3, 2(a2) -; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: sb a3, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_16bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 8(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t4, 10(a0) +; RV32I-NEXT: lbu t5, 11(a0) ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: lbu t0, 12(a0) +; RV32I-NEXT: lbu t1, 13(a0) +; RV32I-NEXT: lbu t2, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a6, a0, a6 -; RV32I-NEXT: lbu t0, 0(a1) -; RV32I-NEXT: lbu t1, 1(a1) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 ; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or a4, t3, a4 +; RV32I-NEXT: or t3, t5, t4 +; RV32I-NEXT: lbu t4, 0(a1) +; RV32I-NEXT: lbu t5, 1(a1) ; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: lbu t1, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a7 -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: or a1, a1, t1 +; RV32I-NEXT: mv t1, sp +; 
RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or t2, a0, t2 ; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, t3, a4 +; RV32I-NEXT: or a6, t2, t0 +; RV32I-NEXT: or a1, a1, t4 ; RV32I-NEXT: sw a0, 16(sp) ; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: sw a0, 24(sp) ; RV32I-NEXT: sw a0, 28(sp) ; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) ; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: andi a0, a1, 12 -; RV32I-NEXT: mv a3, sp -; RV32I-NEXT: add a0, a3, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: lw a5, 8(a0) -; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: srl a6, a4, a1 -; RV32I-NEXT: andi a7, a1, 24 -; RV32I-NEXT: xori a7, a7, 31 -; RV32I-NEXT: slli t0, a5, 1 -; RV32I-NEXT: sll t0, t0, a7 -; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: srl a3, a3, a1 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: sll a4, a4, a7 -; RV32I-NEXT: or a4, a3, a4 -; RV32I-NEXT: srl a5, a5, a1 -; RV32I-NEXT: slli t1, a0, 1 -; RV32I-NEXT: sll a7, t1, a7 -; RV32I-NEXT: or a7, a5, a7 -; RV32I-NEXT: sra a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: slli a0, a1, 3 +; RV32I-NEXT: andi a1, a1, 12 +; RV32I-NEXT: add a1, t1, a1 +; RV32I-NEXT: andi a3, a0, 24 +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: srl a7, a5, a0 +; RV32I-NEXT: slli t0, a6, 1 +; RV32I-NEXT: srl a4, a4, a0 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl a6, a6, a0 +; RV32I-NEXT: slli t1, a1, 1 +; RV32I-NEXT: sra a0, a1, a0 +; RV32I-NEXT: sll a1, t0, a3 +; RV32I-NEXT: sll a5, a5, a3 +; RV32I-NEXT: sll a3, t1, a3 +; RV32I-NEXT: srli t0, a0, 16 ; RV32I-NEXT: srli t1, a0, 24 ; RV32I-NEXT: srli t2, a0, 8 +; RV32I-NEXT: or a1, a7, a1 +; RV32I-NEXT: or a5, a4, a5 +; RV32I-NEXT: or a3, a6, a3 ; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb a1, 14(a2) +; RV32I-NEXT: sb t0, 14(a2) ; RV32I-NEXT: sb t1, 15(a2) -; RV32I-NEXT: srli a0, a7, 16 -; RV32I-NEXT: srli a1, a7, 24 -; RV32I-NEXT: srli a7, a7, 8 -; RV32I-NEXT: sb a5, 8(a2) -; RV32I-NEXT: sb a7, 9(a2) +; RV32I-NEXT: srli a0, a3, 16 +; RV32I-NEXT: srli t0, a3, 24 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: srli t1, a5, 16 +; RV32I-NEXT: srli t2, a5, 24 +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: srli t3, a1, 16 +; RV32I-NEXT: srli t4, a1, 24 +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a6, 8(a2) +; RV32I-NEXT: sb a3, 9(a2) ; RV32I-NEXT: sb a0, 10(a2) -; RV32I-NEXT: sb a1, 11(a2) -; RV32I-NEXT: srli a0, a4, 16 -; RV32I-NEXT: srli a1, a4, 24 -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a3, 0(a2) -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: sb a0, 2(a2) -; RV32I-NEXT: sb a1, 3(a2) -; RV32I-NEXT: srli a0, t0, 16 -; RV32I-NEXT: srli a1, t0, 24 -; RV32I-NEXT: srli a3, t0, 8 -; RV32I-NEXT: sb a6, 4(a2) -; RV32I-NEXT: sb a3, 5(a2) -; RV32I-NEXT: sb a0, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: sb t0, 11(a2) +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: sb t1, 2(a2) +; RV32I-NEXT: sb t2, 3(a2) +; RV32I-NEXT: sb a7, 4(a2) +; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: sb t3, 6(a2) +; RV32I-NEXT: sb t4, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -1756,209 +1756,209 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void 
@ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_16bytes_wordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 9(a0) -; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) ; RV64I-NEXT: lbu a5, 10(a0) ; RV64I-NEXT: lbu a6, 11(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 14(a0) -; RV64I-NEXT: lbu a7, 15(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 ; RV64I-NEXT: lbu a6, 4(a1) ; RV64I-NEXT: lbu a7, 5(a1) -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: lbu a5, 6(a1) -; RV64I-NEXT: lbu t0, 7(a1) +; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: lbu t3, 7(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 ; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t0, 1(a1) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 2(a1) +; RV64I-NEXT: lbu t2, 1(a1) +; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a5, t1, a5 +; RV64I-NEXT: or a4, t0, a6 ; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a6, a5, 32 ; RV64I-NEXT: slli a1, a1, 5 -; RV64I-NEXT: slli a5, a5, 37 -; RV64I-NEXT: or a5, a5, a1 -; RV64I-NEXT: addi a6, a5, -64 -; RV64I-NEXT: sra a1, a3, a5 +; RV64I-NEXT: slli a7, a4, 37 +; RV64I-NEXT: or a4, a6, a3 +; RV64I-NEXT: or a3, a7, a1 +; RV64I-NEXT: addi a6, a3, -64 +; RV64I-NEXT: sra a1, a4, a3 ; RV64I-NEXT: bltz a6, .LBB11_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sraiw a3, a4, 31 +; RV64I-NEXT: sraiw a3, a5, 31 ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: mv a1, a3 ; RV64I-NEXT: j .LBB11_3 ; RV64I-NEXT: .LBB11_2: -; RV64I-NEXT: lbu a4, 1(a0) -; RV64I-NEXT: lbu a6, 0(a0) -; RV64I-NEXT: lbu a7, 2(a0) -; RV64I-NEXT: lbu t0, 3(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a6 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, t0, a7 -; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: lbu t0, 5(a0) -; RV64I-NEXT: or a4, a6, a4 -; RV64I-NEXT: lbu a6, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu a5, 1(a0) +; RV64I-NEXT: lbu a6, 2(a0) +; RV64I-NEXT: lbu a7, 3(a0) +; RV64I-NEXT: lbu t0, 0(a0) +; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a5, t0 +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu t1, 5(a0) +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 6(a0) +; RV64I-NEXT: lbu 
a0, 7(a0) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: not a6, a3 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a0, a0, a4 -; RV64I-NEXT: srl a0, a0, a5 -; RV64I-NEXT: not a4, a5 -; RV64I-NEXT: slli a3, a3, 1 -; RV64I-NEXT: sll a3, a3, a4 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: srl a0, a0, a3 +; RV64I-NEXT: sll a3, a4, a6 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: .LBB11_3: ; RV64I-NEXT: srli a3, a1, 56 ; RV64I-NEXT: srli a4, a1, 48 ; RV64I-NEXT: srli a5, a1, 40 ; RV64I-NEXT: srli a6, a1, 32 +; RV64I-NEXT: srli a7, a1, 24 +; RV64I-NEXT: srli t0, a1, 16 +; RV64I-NEXT: srli t1, a1, 8 +; RV64I-NEXT: srli t2, a0, 56 +; RV64I-NEXT: srli t3, a0, 48 +; RV64I-NEXT: srli t4, a0, 40 +; RV64I-NEXT: srli t5, a0, 32 ; RV64I-NEXT: sb a6, 12(a2) ; RV64I-NEXT: sb a5, 13(a2) ; RV64I-NEXT: sb a4, 14(a2) ; RV64I-NEXT: sb a3, 15(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: srli a3, a0, 24 ; RV64I-NEXT: sb a1, 8(a2) -; RV64I-NEXT: sb a5, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) -; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 4(a2) -; RV64I-NEXT: sb a4, 5(a2) -; RV64I-NEXT: sb a3, 6(a2) -; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 +; RV64I-NEXT: sb t1, 9(a2) +; RV64I-NEXT: sb t0, 10(a2) +; RV64I-NEXT: sb a7, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb t5, 4(a2) +; RV64I-NEXT: sb t4, 5(a2) +; RV64I-NEXT: sb t3, 6(a2) +; RV64I-NEXT: sb t2, 7(a2) ; RV64I-NEXT: srli a4, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a3, 2(a2) -; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: sb a3, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_16bytes_wordOff: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) +; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: lbu a6, 12(a0) +; 
RV32I-NEXT: lbu a7, 13(a0) +; RV32I-NEXT: lbu t0, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a6, a0, a6 -; RV32I-NEXT: or a6, a6, a7 ; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: mv t2, sp +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or t4, t6, t5 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a7, a0, t0 ; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: andi a1, a1, 12 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, t1, a5 +; RV32I-NEXT: or a5, t4, t3 +; RV32I-NEXT: or a6, a7, a6 ; RV32I-NEXT: sw a0, 16(sp) ; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: sw a0, 24(sp) ; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: add a1, t2, a1 ; RV32I-NEXT: sw a3, 0(sp) ; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: andi a1, a1, 12 -; RV32I-NEXT: mv a0, sp -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lw a1, 8(a0) -; RV32I-NEXT: lw a3, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: srli a5, a1, 16 -; RV32I-NEXT: srli a6, a1, 24 -; RV32I-NEXT: srli a7, a1, 8 -; RV32I-NEXT: sb a1, 8(a2) +; RV32I-NEXT: lw a0, 8(a1) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: srli a6, a0, 24 +; RV32I-NEXT: srli a7, a0, 8 +; RV32I-NEXT: srli t0, a1, 16 +; RV32I-NEXT: srli t1, a1, 24 +; RV32I-NEXT: srli t2, a1, 8 +; RV32I-NEXT: srli t3, a4, 16 +; RV32I-NEXT: srli t4, a4, 24 +; RV32I-NEXT: srli t5, a4, 8 +; RV32I-NEXT: srli t6, a3, 16 +; RV32I-NEXT: sb a0, 8(a2) ; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb a5, 10(a2) ; RV32I-NEXT: sb a6, 11(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a5, a0, 24 -; RV32I-NEXT: srli a6, a0, 8 -; RV32I-NEXT: sb a0, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a1, 14(a2) -; RV32I-NEXT: sb a5, 15(a2) -; RV32I-NEXT: srli a0, a4, 16 -; RV32I-NEXT: srli a1, a4, 24 -; RV32I-NEXT: srli a5, a4, 8 +; RV32I-NEXT: srli a0, a3, 24 +; RV32I-NEXT: sb a1, 12(a2) +; RV32I-NEXT: sb t2, 13(a2) +; RV32I-NEXT: sb t0, 14(a2) +; RV32I-NEXT: sb t1, 15(a2) +; RV32I-NEXT: srli a1, a3, 8 ; RV32I-NEXT: sb a4, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a0, 2(a2) -; RV32I-NEXT: sb a1, 3(a2) -; RV32I-NEXT: srli a0, a3, 16 -; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: srli a4, a3, 8 +; RV32I-NEXT: sb t5, 1(a2) +; RV32I-NEXT: sb t3, 2(a2) +; RV32I-NEXT: sb t4, 3(a2) ; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a0, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: sb t6, 6(a2) +; RV32I-NEXT: sb a0, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -1972,428 +1972,472 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill 
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; 
RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t0, 1(a1) -; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: lbu a6, 2(a1) -; RV64I-NEXT: lbu t1, 3(a1) ; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 4(a1) -; RV64I-NEXT: lbu t1, 5(a1) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 6(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t3, s5, s4 +; RV64I-NEXT: or t4, s7, s6 +; RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu s6, 31(a0) +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: lbu s6, 5(a1) +; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a7 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: slli s6, s6, 8 +; RV64I-NEXT: or s3, s6, s3 ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, s7 +; RV64I-NEXT: mv s6, sp +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t0, s0, t6 +; RV64I-NEXT: or t1, s5, s1 +; RV64I-NEXT: or t2, s4, s2 +; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, t1, t0 +; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: sd a3, 0(sp) ; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a5, 16(sp) -; RV64I-NEXT: sd a0, 24(sp) -; RV64I-NEXT: andi a0, 
a1, 24 -; RV64I-NEXT: mv a3, sp -; RV64I-NEXT: add a0, a3, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: ld a4, 8(a0) -; RV64I-NEXT: slli a5, a1, 3 -; RV64I-NEXT: ld a6, 16(a0) -; RV64I-NEXT: ld a7, 24(a0) -; RV64I-NEXT: srl a0, a4, a5 -; RV64I-NEXT: andi a1, a5, 56 -; RV64I-NEXT: xori t0, a1, 63 -; RV64I-NEXT: slli a1, a6, 1 -; RV64I-NEXT: sll a1, a1, t0 -; RV64I-NEXT: or a1, a0, a1 -; RV64I-NEXT: srl a3, a3, a5 -; RV64I-NEXT: slli a4, a4, 1 -; RV64I-NEXT: sll a4, a4, t0 -; RV64I-NEXT: or a4, a3, a4 -; RV64I-NEXT: srl a6, a6, a5 -; RV64I-NEXT: slli t1, a7, 1 -; RV64I-NEXT: sll t0, t1, t0 -; RV64I-NEXT: or t0, a6, t0 -; RV64I-NEXT: srl a5, a7, a5 -; RV64I-NEXT: srli a7, a5, 56 -; RV64I-NEXT: srli t1, a5, 48 -; RV64I-NEXT: srli t2, a5, 40 -; RV64I-NEXT: srli t3, a5, 32 +; RV64I-NEXT: sd a0, 16(sp) +; RV64I-NEXT: sd a5, 24(sp) +; RV64I-NEXT: slli a4, a1, 3 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: add a1, s6, a1 +; RV64I-NEXT: andi a0, a4, 56 +; RV64I-NEXT: ld a3, 0(a1) +; RV64I-NEXT: ld a5, 8(a1) +; RV64I-NEXT: ld a6, 16(a1) +; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld t0, 24(a1) +; RV64I-NEXT: srl a0, a5, a4 +; RV64I-NEXT: slli t1, a6, 1 +; RV64I-NEXT: srl a1, a3, a4 +; RV64I-NEXT: slli a5, a5, 1 +; RV64I-NEXT: srl a3, a6, a4 +; RV64I-NEXT: slli a6, t0, 1 +; RV64I-NEXT: srl t0, t0, a4 +; RV64I-NEXT: sll a4, t1, a7 +; RV64I-NEXT: sll a5, a5, a7 +; RV64I-NEXT: sll a6, a6, a7 +; RV64I-NEXT: srli a7, t0, 56 +; RV64I-NEXT: srli t1, t0, 48 +; RV64I-NEXT: srli t2, t0, 40 +; RV64I-NEXT: srli t3, t0, 32 +; RV64I-NEXT: srli t4, t0, 24 +; RV64I-NEXT: srli t5, t0, 16 +; RV64I-NEXT: srli t6, t0, 8 +; RV64I-NEXT: or a4, a0, a4 +; RV64I-NEXT: or a5, a1, a5 +; RV64I-NEXT: or a6, a3, a6 ; RV64I-NEXT: sb t3, 28(a2) ; RV64I-NEXT: sb t2, 29(a2) ; RV64I-NEXT: sb t1, 30(a2) ; RV64I-NEXT: sb a7, 31(a2) -; RV64I-NEXT: srli a7, a5, 24 -; RV64I-NEXT: srli t1, a5, 16 -; RV64I-NEXT: srli t2, a5, 8 -; RV64I-NEXT: sb a5, 24(a2) -; RV64I-NEXT: sb t2, 25(a2) -; RV64I-NEXT: sb t1, 26(a2) -; RV64I-NEXT: sb a7, 27(a2) -; RV64I-NEXT: srli a5, t0, 56 -; RV64I-NEXT: srli a7, t0, 48 -; RV64I-NEXT: srli t1, t0, 40 -; RV64I-NEXT: srli t2, t0, 32 +; RV64I-NEXT: sb t0, 24(a2) +; RV64I-NEXT: sb t6, 25(a2) +; RV64I-NEXT: sb t5, 26(a2) +; RV64I-NEXT: sb t4, 27(a2) +; RV64I-NEXT: srli a7, a6, 56 +; RV64I-NEXT: srli t0, a6, 48 +; RV64I-NEXT: srli t1, a6, 40 +; RV64I-NEXT: srli t2, a6, 32 +; RV64I-NEXT: srli t3, a6, 24 +; RV64I-NEXT: srli t4, a6, 16 +; RV64I-NEXT: srli a6, a6, 8 +; RV64I-NEXT: srli t5, a5, 56 +; RV64I-NEXT: srli t6, a5, 48 +; RV64I-NEXT: srli s0, a5, 40 +; RV64I-NEXT: srli s1, a5, 32 +; RV64I-NEXT: srli s2, a5, 24 +; RV64I-NEXT: srli s3, a5, 16 +; RV64I-NEXT: srli a5, a5, 8 +; RV64I-NEXT: srli s4, a4, 56 +; RV64I-NEXT: srli s5, a4, 48 +; RV64I-NEXT: srli s6, a4, 40 ; RV64I-NEXT: sb t2, 20(a2) ; RV64I-NEXT: sb t1, 21(a2) -; RV64I-NEXT: sb a7, 22(a2) -; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, t0, 24 -; RV64I-NEXT: srli a7, t0, 16 -; RV64I-NEXT: srli t0, t0, 8 -; RV64I-NEXT: sb a6, 16(a2) -; RV64I-NEXT: sb t0, 17(a2) -; RV64I-NEXT: sb a7, 18(a2) -; RV64I-NEXT: sb a5, 19(a2) -; RV64I-NEXT: srli a5, a4, 56 -; RV64I-NEXT: srli a6, a4, 48 -; RV64I-NEXT: srli a7, a4, 40 -; RV64I-NEXT: srli t0, a4, 32 -; RV64I-NEXT: sb t0, 4(a2) -; RV64I-NEXT: sb a7, 5(a2) -; RV64I-NEXT: sb a6, 6(a2) -; RV64I-NEXT: sb a5, 7(a2) -; RV64I-NEXT: srli a5, a4, 24 +; RV64I-NEXT: sb t0, 22(a2) +; RV64I-NEXT: sb a7, 23(a2) +; RV64I-NEXT: srli a7, a4, 32 +; RV64I-NEXT: sb a3, 16(a2) +; RV64I-NEXT: sb a6, 17(a2) +; RV64I-NEXT: sb t4, 
18(a2) +; RV64I-NEXT: sb t3, 19(a2) +; RV64I-NEXT: srli a3, a4, 24 +; RV64I-NEXT: sb s1, 4(a2) +; RV64I-NEXT: sb s0, 5(a2) +; RV64I-NEXT: sb t6, 6(a2) +; RV64I-NEXT: sb t5, 7(a2) ; RV64I-NEXT: srli a6, a4, 16 ; RV64I-NEXT: srli a4, a4, 8 -; RV64I-NEXT: sb a3, 0(a2) -; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a6, 2(a2) -; RV64I-NEXT: sb a5, 3(a2) -; RV64I-NEXT: srli a3, a1, 56 -; RV64I-NEXT: srli a4, a1, 48 -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: srli a6, a1, 32 -; RV64I-NEXT: sb a6, 12(a2) -; RV64I-NEXT: sb a5, 13(a2) -; RV64I-NEXT: sb a4, 14(a2) -; RV64I-NEXT: sb a3, 15(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: sb a5, 1(a2) +; RV64I-NEXT: sb s3, 2(a2) +; RV64I-NEXT: sb s2, 3(a2) +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb s6, 13(a2) +; RV64I-NEXT: sb s5, 14(a2) +; RV64I-NEXT: sb s4, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: sb a4, 9(a2) +; RV64I-NEXT: sb a6, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -80 -; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 64(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s1, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu t1, 4(a0) +; RV32I-NEXT: lbu t3, 5(a0) +; RV32I-NEXT: lbu t4, 6(a0) +; RV32I-NEXT: lbu s0, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu s3, 9(a0) +; RV32I-NEXT: lbu s6, 10(a0) +; RV32I-NEXT: lbu s8, 11(a0) +; RV32I-NEXT: lbu s9, 12(a0) +; RV32I-NEXT: lbu s10, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s7, 15(a0) +; RV32I-NEXT: lbu s5, 16(a0) +; RV32I-NEXT: lbu s11, 17(a0) +; RV32I-NEXT: lbu ra, 18(a0) +; RV32I-NEXT: lbu a3, 19(a0) +; RV32I-NEXT: lbu t5, 20(a0) +; RV32I-NEXT: lbu t6, 21(a0) +; 
RV32I-NEXT: lbu a7, 22(a0) +; RV32I-NEXT: lbu t0, 23(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 18(a0) -; RV32I-NEXT: lbu t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t2, a7 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or a7, a7, t0 -; RV32I-NEXT: lbu t0, 22(a0) -; RV32I-NEXT: lbu t3, 23(a0) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t0, t3, t0 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: lbu t1, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) ; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or t1, t4, t1 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: lbu t2, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: lbu t2, 0(a1) -; RV32I-NEXT: lbu t4, 1(a1) -; RV32I-NEXT: or a0, a0, t3 -; RV32I-NEXT: lbu t3, 2(a1) +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: or a4, a4, s1 +; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t3, t1 +; RV32I-NEXT: or a6, s0, t4 +; RV32I-NEXT: lbu t1, 24(a0) +; RV32I-NEXT: lbu s0, 25(a0) +; RV32I-NEXT: lbu s1, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s8, s8, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: or t2, s3, t2 +; RV32I-NEXT: or t3, s8, s6 +; RV32I-NEXT: or t4, s10, s9 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s8, 30(a0) +; RV32I-NEXT: lbu s9, 31(a0) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s7, s4 +; RV32I-NEXT: or s4, s11, s5 +; RV32I-NEXT: or s5, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s7, 1(a1) +; RV32I-NEXT: lbu s10, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t2, t4, t2 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: 
or a1, a1, t3 -; RV32I-NEXT: or a1, a1, t2 -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 64(sp) +; RV32I-NEXT: sw zero, 68(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw a0, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: andi a0, a1, 28 -; RV32I-NEXT: mv a3, sp -; RV32I-NEXT: add a6, a3, a0 -; RV32I-NEXT: lw a3, 0(a6) -; RV32I-NEXT: lw a4, 4(a6) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: addi t6, sp, 8 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s0, t1 +; RV32I-NEXT: or t1, s2, s1 +; RV32I-NEXT: or s0, s6, s3 +; RV32I-NEXT: or s1, s9, s8 +; RV32I-NEXT: or a3, s7, a3 +; RV32I-NEXT: or a1, a1, s10 +; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, s2 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t3, t2 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or a7, a7, t5 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: sw t2, 24(sp) +; RV32I-NEXT: sw a7, 28(sp) +; RV32I-NEXT: sw t0, 32(sp) +; RV32I-NEXT: sw s0, 36(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a6, 16(sp) +; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: slli t1, a1, 3 -; RV32I-NEXT: lw a7, 8(a6) -; RV32I-NEXT: lw t0, 12(a6) +; RV32I-NEXT: andi a1, a1, 28 +; RV32I-NEXT: add a1, t6, a1 +; RV32I-NEXT: andi a0, t1, 24 +; RV32I-NEXT: xori t0, a0, 31 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a4, 4(a1) +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: lw a6, 12(a1) +; RV32I-NEXT: lw a7, 16(a1) +; RV32I-NEXT: lw t2, 20(a1) +; RV32I-NEXT: lw t3, 24(a1) +; RV32I-NEXT: lw t4, 28(a1) ; RV32I-NEXT: srl a0, a4, t1 -; RV32I-NEXT: andi a1, t1, 24 -; RV32I-NEXT: xori t2, a1, 31 -; RV32I-NEXT: slli a1, a7, 1 -; RV32I-NEXT: sll a1, a1, t2 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: srl a3, a3, t1 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: sll a4, a4, t2 -; RV32I-NEXT: or a4, a3, a4 -; RV32I-NEXT: srl a5, t0, t1 -; RV32I-NEXT: lw t3, 16(a6) -; RV32I-NEXT: lw t4, 20(a6) -; RV32I-NEXT: lw t5, 24(a6) -; RV32I-NEXT: lw t6, 28(a6) -; RV32I-NEXT: slli a6, t3, 1 -; RV32I-NEXT: sll a6, a6, t2 -; RV32I-NEXT: or a6, a5, a6 -; RV32I-NEXT: srl a7, a7, t1 -; RV32I-NEXT: slli t0, t0, 1 -; RV32I-NEXT: sll t0, t0, t2 -; RV32I-NEXT: or t0, a7, t0 -; RV32I-NEXT: srl s0, t4, t1 -; RV32I-NEXT: slli s1, t5, 1 -; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: or s1, s0, s1 -; RV32I-NEXT: srl t3, t3, t1 -; RV32I-NEXT: slli t4, t4, 1 -; RV32I-NEXT: sll t4, t4, t2 -; RV32I-NEXT: or t4, t3, t4 -; RV32I-NEXT: srl t5, t5, t1 -; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t2, s2, t2 -; RV32I-NEXT: or t2, t5, t2 -; RV32I-NEXT: srl t1, t6, t1 -; RV32I-NEXT: srli t6, t1, 24 -; RV32I-NEXT: srli s2, t1, 16 -; RV32I-NEXT: srli s3, t1, 8 +; RV32I-NEXT: slli t5, 
a5, 1 +; RV32I-NEXT: srl a1, a3, t1 +; RV32I-NEXT: slli t6, a4, 1 +; RV32I-NEXT: srl a3, a6, t1 +; RV32I-NEXT: slli s0, a7, 1 +; RV32I-NEXT: srl a4, a5, t1 +; RV32I-NEXT: slli s1, a6, 1 +; RV32I-NEXT: srl a5, t2, t1 +; RV32I-NEXT: slli s2, t3, 1 +; RV32I-NEXT: srl a6, a7, t1 +; RV32I-NEXT: slli t2, t2, 1 +; RV32I-NEXT: srl a7, t3, t1 +; RV32I-NEXT: slli t3, t4, 1 +; RV32I-NEXT: srl t1, t4, t1 +; RV32I-NEXT: sll t4, t5, t0 +; RV32I-NEXT: sll t5, t6, t0 +; RV32I-NEXT: sll t6, s0, t0 +; RV32I-NEXT: sll s0, s1, t0 +; RV32I-NEXT: sll s1, s2, t0 +; RV32I-NEXT: sll t2, t2, t0 +; RV32I-NEXT: sll t3, t3, t0 +; RV32I-NEXT: srli s2, t1, 24 +; RV32I-NEXT: srli s3, t1, 16 +; RV32I-NEXT: srli s4, t1, 8 +; RV32I-NEXT: or t0, a0, t4 +; RV32I-NEXT: or t4, a1, t5 +; RV32I-NEXT: or t5, a3, t6 +; RV32I-NEXT: or s0, a4, s0 +; RV32I-NEXT: or s1, a5, s1 +; RV32I-NEXT: or t2, a6, t2 +; RV32I-NEXT: or t3, a7, t3 ; RV32I-NEXT: sb t1, 28(a2) -; RV32I-NEXT: sb s3, 29(a2) -; RV32I-NEXT: sb s2, 30(a2) -; RV32I-NEXT: sb t6, 31(a2) -; RV32I-NEXT: srli t1, t2, 24 -; RV32I-NEXT: srli t6, t2, 16 +; RV32I-NEXT: sb s4, 29(a2) +; RV32I-NEXT: sb s3, 30(a2) +; RV32I-NEXT: sb s2, 31(a2) +; RV32I-NEXT: srli t1, t3, 24 +; RV32I-NEXT: srli t6, t3, 16 +; RV32I-NEXT: srli t3, t3, 8 +; RV32I-NEXT: srli s2, t2, 24 +; RV32I-NEXT: srli s3, t2, 16 ; RV32I-NEXT: srli t2, t2, 8 -; RV32I-NEXT: sb t5, 24(a2) -; RV32I-NEXT: sb t2, 25(a2) +; RV32I-NEXT: srli s4, s1, 24 +; RV32I-NEXT: srli s5, s1, 16 +; RV32I-NEXT: srli s1, s1, 8 +; RV32I-NEXT: srli s6, s0, 24 +; RV32I-NEXT: srli s7, s0, 16 +; RV32I-NEXT: srli s0, s0, 8 +; RV32I-NEXT: srli s8, t5, 24 +; RV32I-NEXT: srli s9, t5, 16 +; RV32I-NEXT: srli t5, t5, 8 +; RV32I-NEXT: srli s10, t4, 24 +; RV32I-NEXT: srli s11, t4, 16 +; RV32I-NEXT: srli t4, t4, 8 +; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: sb t3, 25(a2) ; RV32I-NEXT: sb t6, 26(a2) ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, t4, 24 -; RV32I-NEXT: srli t2, t4, 16 -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb t3, 16(a2) -; RV32I-NEXT: sb t4, 17(a2) -; RV32I-NEXT: sb t2, 18(a2) -; RV32I-NEXT: sb t1, 19(a2) -; RV32I-NEXT: srli t1, s1, 24 -; RV32I-NEXT: srli t2, s1, 16 -; RV32I-NEXT: srli s1, s1, 8 -; RV32I-NEXT: sb s0, 20(a2) -; RV32I-NEXT: sb s1, 21(a2) -; RV32I-NEXT: sb t2, 22(a2) -; RV32I-NEXT: sb t1, 23(a2) -; RV32I-NEXT: srli t1, t0, 24 -; RV32I-NEXT: srli t2, t0, 16 +; RV32I-NEXT: srli a7, t0, 24 +; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: sb t2, 17(a2) +; RV32I-NEXT: sb s3, 18(a2) +; RV32I-NEXT: sb s2, 19(a2) +; RV32I-NEXT: srli a6, t0, 16 ; RV32I-NEXT: srli t0, t0, 8 -; RV32I-NEXT: sb a7, 8(a2) -; RV32I-NEXT: sb t0, 9(a2) -; RV32I-NEXT: sb t2, 10(a2) -; RV32I-NEXT: sb t1, 11(a2) -; RV32I-NEXT: srli a7, a6, 24 -; RV32I-NEXT: srli t0, a6, 16 -; RV32I-NEXT: srli a6, a6, 8 -; RV32I-NEXT: sb a5, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb t0, 14(a2) -; RV32I-NEXT: sb a7, 15(a2) -; RV32I-NEXT: srli a5, a4, 24 -; RV32I-NEXT: srli a6, a4, 16 -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a3, 0(a2) -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: sb a6, 2(a2) -; RV32I-NEXT: sb a5, 3(a2) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a5, 20(a2) +; RV32I-NEXT: sb s1, 21(a2) +; RV32I-NEXT: sb s5, 22(a2) +; RV32I-NEXT: sb s4, 23(a2) +; RV32I-NEXT: sb a4, 8(a2) +; RV32I-NEXT: sb s0, 9(a2) +; RV32I-NEXT: sb s7, 10(a2) +; RV32I-NEXT: sb s6, 11(a2) +; RV32I-NEXT: sb a3, 12(a2) +; RV32I-NEXT: sb t5, 13(a2) +; RV32I-NEXT: sb s9, 14(a2) +; RV32I-NEXT: sb s8, 15(a2) +; RV32I-NEXT: 
sb a1, 0(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb s11, 2(a2) +; RV32I-NEXT: sb s10, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) -; RV32I-NEXT: sb a4, 6(a2) -; RV32I-NEXT: sb a3, 7(a2) -; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 80 +; RV32I-NEXT: sb t0, 5(a2) +; RV32I-NEXT: sb a6, 6(a2) +; RV32I-NEXT: sb a7, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -2406,381 +2450,431 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_32bytes_wordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 
-; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t0, 1(a1) -; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: lbu a6, 2(a1) -; RV64I-NEXT: lbu t1, 3(a1) ; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 4(a1) -; RV64I-NEXT: lbu t1, 5(a1) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 6(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t3, s5, s4 +; RV64I-NEXT: or t4, s7, s6 +; RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu s6, 31(a0) +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, 
s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: lbu s6, 5(a1) +; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a7 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: slli s6, s6, 8 +; RV64I-NEXT: or s3, s6, s3 ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, s7 +; RV64I-NEXT: mv s6, sp +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t0, s0, t6 +; RV64I-NEXT: or t1, s5, s1 +; RV64I-NEXT: or t2, s4, s2 +; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, t1, t0 +; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: sd a3, 0(sp) ; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a5, 16(sp) -; RV64I-NEXT: sd a0, 24(sp) -; RV64I-NEXT: slli a0, a1, 2 -; RV64I-NEXT: andi a0, a0, 24 -; RV64I-NEXT: mv a3, sp -; RV64I-NEXT: add a0, a3, a0 -; RV64I-NEXT: ld a4, 0(a0) -; RV64I-NEXT: ld a5, 8(a0) -; RV64I-NEXT: slli a6, a1, 5 -; RV64I-NEXT: ld a7, 16(a0) -; RV64I-NEXT: ld t0, 24(a0) -; RV64I-NEXT: srl a3, a5, a6 -; RV64I-NEXT: andi a0, a6, 32 -; RV64I-NEXT: xori t1, a0, 63 -; RV64I-NEXT: slli a0, a7, 1 -; RV64I-NEXT: sll a0, a0, t1 -; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: srl t2, a4, a6 +; RV64I-NEXT: sd a0, 16(sp) +; RV64I-NEXT: sd a5, 24(sp) +; RV64I-NEXT: slli a3, a1, 5 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: andi a0, a3, 32 +; RV64I-NEXT: add a1, s6, a1 +; RV64I-NEXT: ld a4, 0(a1) +; RV64I-NEXT: ld a5, 8(a1) +; RV64I-NEXT: ld a6, 16(a1) +; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld t0, 24(a1) +; RV64I-NEXT: srl a0, a5, a3 +; RV64I-NEXT: slli t1, a6, 1 +; RV64I-NEXT: srl a1, a4, a3 ; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: sll a1, a5, t1 -; RV64I-NEXT: or a1, t2, a1 -; RV64I-NEXT: srl a5, a7, a6 -; RV64I-NEXT: slli a4, t0, 1 -; RV64I-NEXT: sll a4, a4, t1 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: srl a6, t0, a6 -; RV64I-NEXT: srli a7, a5, 24 -; RV64I-NEXT: srli t0, a5, 16 -; RV64I-NEXT: srli t1, a5, 8 -; RV64I-NEXT: sb a5, 16(a2) -; RV64I-NEXT: sb t1, 17(a2) -; RV64I-NEXT: sb t0, 18(a2) +; RV64I-NEXT: srl a4, a6, a3 +; RV64I-NEXT: slli a6, t0, 1 +; RV64I-NEXT: srl a3, t0, a3 +; RV64I-NEXT: sll t0, t1, a7 +; RV64I-NEXT: sll a5, a5, a7 +; RV64I-NEXT: sll a6, a6, a7 +; RV64I-NEXT: srli a7, a4, 24 +; RV64I-NEXT: srli t1, a4, 16 +; RV64I-NEXT: srli t2, a4, 8 +; RV64I-NEXT: srli t3, a3, 56 +; RV64I-NEXT: srli t4, a3, 48 +; RV64I-NEXT: srli t5, a3, 40 +; RV64I-NEXT: srli t6, a3, 32 +; RV64I-NEXT: srli 
s0, a3, 24 +; RV64I-NEXT: srli s1, a3, 16 +; RV64I-NEXT: srli s2, a3, 8 +; RV64I-NEXT: srli s3, a1, 24 +; RV64I-NEXT: srli s4, a1, 16 +; RV64I-NEXT: srli s5, a1, 8 +; RV64I-NEXT: srli s6, a0, 24 +; RV64I-NEXT: or a6, a4, a6 +; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: sb t2, 17(a2) +; RV64I-NEXT: sb t1, 18(a2) ; RV64I-NEXT: sb a7, 19(a2) -; RV64I-NEXT: srli a5, a6, 56 -; RV64I-NEXT: srli a7, a6, 48 -; RV64I-NEXT: srli t0, a6, 40 -; RV64I-NEXT: srli t1, a6, 32 -; RV64I-NEXT: sb t1, 28(a2) -; RV64I-NEXT: sb t0, 29(a2) -; RV64I-NEXT: sb a7, 30(a2) -; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a6, 24 -; RV64I-NEXT: srli a7, a6, 16 -; RV64I-NEXT: srli t0, a6, 8 -; RV64I-NEXT: sb a6, 24(a2) -; RV64I-NEXT: sb t0, 25(a2) -; RV64I-NEXT: sb a7, 26(a2) -; RV64I-NEXT: sb a5, 27(a2) -; RV64I-NEXT: srli a5, t2, 24 -; RV64I-NEXT: srli a6, t2, 16 -; RV64I-NEXT: srli a7, t2, 8 -; RV64I-NEXT: sb t2, 0(a2) -; RV64I-NEXT: sb a7, 1(a2) -; RV64I-NEXT: sb a6, 2(a2) -; RV64I-NEXT: sb a5, 3(a2) -; RV64I-NEXT: srli a5, a3, 24 -; RV64I-NEXT: srli a6, a3, 16 -; RV64I-NEXT: srli a7, a3, 8 -; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: srli a4, a0, 16 +; RV64I-NEXT: sb t6, 28(a2) +; RV64I-NEXT: sb t5, 29(a2) +; RV64I-NEXT: sb t4, 30(a2) +; RV64I-NEXT: sb t3, 31(a2) +; RV64I-NEXT: srli a7, a0, 8 +; RV64I-NEXT: or t0, a0, t0 +; RV64I-NEXT: or a5, a1, a5 +; RV64I-NEXT: sb a3, 24(a2) +; RV64I-NEXT: sb s2, 25(a2) +; RV64I-NEXT: sb s1, 26(a2) +; RV64I-NEXT: sb s0, 27(a2) +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: sb s5, 1(a2) +; RV64I-NEXT: sb s4, 2(a2) +; RV64I-NEXT: sb s3, 3(a2) +; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a7, 9(a2) -; RV64I-NEXT: sb a6, 10(a2) -; RV64I-NEXT: sb a5, 11(a2) -; RV64I-NEXT: srli a3, a4, 56 -; RV64I-NEXT: srli a5, a4, 48 -; RV64I-NEXT: srli a6, a4, 40 -; RV64I-NEXT: srli a4, a4, 32 +; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: sb s6, 11(a2) +; RV64I-NEXT: srli a0, a6, 56 +; RV64I-NEXT: srli a1, a6, 48 +; RV64I-NEXT: srli a3, a6, 40 +; RV64I-NEXT: srli a4, a6, 32 +; RV64I-NEXT: srli a6, a5, 56 +; RV64I-NEXT: srli a7, a5, 48 +; RV64I-NEXT: srli t1, a5, 40 +; RV64I-NEXT: srli a5, a5, 32 +; RV64I-NEXT: srli t2, t0, 56 +; RV64I-NEXT: srli t3, t0, 48 +; RV64I-NEXT: srli t4, t0, 40 +; RV64I-NEXT: srli t0, t0, 32 ; RV64I-NEXT: sb a4, 20(a2) -; RV64I-NEXT: sb a6, 21(a2) -; RV64I-NEXT: sb a5, 22(a2) -; RV64I-NEXT: sb a3, 23(a2) -; RV64I-NEXT: srli a3, a1, 56 -; RV64I-NEXT: srli a4, a1, 48 -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: srli a1, a1, 32 -; RV64I-NEXT: sb a1, 4(a2) -; RV64I-NEXT: sb a5, 5(a2) -; RV64I-NEXT: sb a4, 6(a2) -; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a0, a0, 32 -; RV64I-NEXT: sb a0, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: sb a3, 21(a2) +; RV64I-NEXT: sb a1, 22(a2) +; RV64I-NEXT: sb a0, 23(a2) +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb t1, 5(a2) +; RV64I-NEXT: sb a7, 6(a2) +; RV64I-NEXT: sb a6, 7(a2) +; RV64I-NEXT: sb t0, 12(a2) +; RV64I-NEXT: sb t4, 13(a2) +; RV64I-NEXT: sb t3, 14(a2) +; RV64I-NEXT: sb t2, 15(a2) +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; 
RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_32bytes_wordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a7, 0(a0) +; RV32I-NEXT: lbu t0, 1(a0) +; RV32I-NEXT: lbu t1, 2(a0) +; RV32I-NEXT: lbu s1, 3(a0) +; RV32I-NEXT: lbu s7, 4(a0) +; RV32I-NEXT: lbu s8, 5(a0) +; RV32I-NEXT: lbu s4, 6(a0) +; RV32I-NEXT: lbu s6, 7(a0) +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s10, 9(a0) +; RV32I-NEXT: lbu s11, 10(a0) +; RV32I-NEXT: lbu ra, 11(a0) +; RV32I-NEXT: lbu t4, 12(a0) +; RV32I-NEXT: lbu t6, 13(a0) +; RV32I-NEXT: lbu a5, 14(a0) +; RV32I-NEXT: lbu a6, 15(a0) +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu t2, 17(a0) +; RV32I-NEXT: lbu t3, 18(a0) +; RV32I-NEXT: lbu t5, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu s0, 21(a0) +; RV32I-NEXT: lbu s2, 22(a0) +; RV32I-NEXT: lbu s3, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 18(a0) -; RV32I-NEXT: lbu t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t2, a7 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or a7, a7, t0 -; RV32I-NEXT: lbu t0, 22(a0) -; RV32I-NEXT: lbu t3, 23(a0) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: 
or t0, t3, t0 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: lbu t1, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 ; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or t1, t4, t1 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: lbu t2, 30(a0) +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s1, t1 +; RV32I-NEXT: or t1, s8, s7 +; RV32I-NEXT: lbu s1, 24(a0) +; RV32I-NEXT: lbu s7, 25(a0) +; RV32I-NEXT: lbu s8, 26(a0) +; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s11, s11, 16 +; RV32I-NEXT: slli ra, ra, 24 +; RV32I-NEXT: or s4, s6, s4 +; RV32I-NEXT: or s5, s10, s5 +; RV32I-NEXT: or s6, ra, s11 +; RV32I-NEXT: lbu s10, 28(a0) +; RV32I-NEXT: lbu s11, 29(a0) +; RV32I-NEXT: lbu ra, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: or a0, a0, t3 ; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 64(sp) +; RV32I-NEXT: sw zero, 68(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw a0, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t4, t6, t4 +; RV32I-NEXT: addi t6, sp, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a3, t2, a3 +; RV32I-NEXT: or a6, t5, t3 +; RV32I-NEXT: or a4, s0, a4 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: or t3, s7, s1 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or s0, s11, s10 +; RV32I-NEXT: or a0, a0, ra ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: mv a0, sp -; RV32I-NEXT: add a4, a0, a1 -; RV32I-NEXT: lw a5, 16(a4) -; RV32I-NEXT: lw a6, 20(a4) -; RV32I-NEXT: lw a7, 24(a4) -; RV32I-NEXT: lw a1, 0(a4) -; RV32I-NEXT: lw a0, 4(a4) -; RV32I-NEXT: lw t0, 8(a4) -; RV32I-NEXT: lw a3, 12(a4) -; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s4, t1 +; RV32I-NEXT: or t1, s6, s5 +; RV32I-NEXT: or a5, a5, t4 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, t2, a4 +; RV32I-NEXT: or a6, t5, t3 +; RV32I-NEXT: or a0, a0, s0 +; RV32I-NEXT: add t6, t6, a1 +; RV32I-NEXT: sw a3, 24(sp) +; RV32I-NEXT: sw a4, 28(sp) +; RV32I-NEXT: sw a6, 32(sp) +; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a7, 8(sp) +; RV32I-NEXT: sw t0, 12(sp) +; RV32I-NEXT: sw t1, 16(sp) +; RV32I-NEXT: sw a5, 20(sp) +; RV32I-NEXT: lw a6, 16(t6) +; RV32I-NEXT: lw a5, 20(t6) +; 
RV32I-NEXT: lw a7, 24(t6) +; RV32I-NEXT: lw a1, 0(t6) +; RV32I-NEXT: lw a0, 4(t6) +; RV32I-NEXT: lw a4, 8(t6) +; RV32I-NEXT: lw a3, 12(t6) +; RV32I-NEXT: lw t0, 28(t6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 +; RV32I-NEXT: srli t4, t0, 24 +; RV32I-NEXT: srli t5, t0, 16 +; RV32I-NEXT: srli t6, t0, 8 +; RV32I-NEXT: srli s0, a6, 24 +; RV32I-NEXT: srli s1, a6, 16 +; RV32I-NEXT: srli s2, a6, 8 +; RV32I-NEXT: srli s3, a5, 24 +; RV32I-NEXT: srli s4, a5, 16 +; RV32I-NEXT: srli s5, a5, 8 +; RV32I-NEXT: srli s6, a4, 24 +; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 16 +; RV32I-NEXT: srli s11, a3, 8 +; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) ; RV32I-NEXT: sb t3, 25(a2) ; RV32I-NEXT: sb t2, 26(a2) ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a4, 24 -; RV32I-NEXT: srli t1, a4, 16 -; RV32I-NEXT: srli t2, a4, 8 -; RV32I-NEXT: sb a4, 28(a2) -; RV32I-NEXT: sb t2, 29(a2) -; RV32I-NEXT: sb t1, 30(a2) -; RV32I-NEXT: sb a7, 31(a2) -; RV32I-NEXT: srli a4, a5, 24 -; RV32I-NEXT: srli a7, a5, 16 -; RV32I-NEXT: srli t1, a5, 8 -; RV32I-NEXT: sb a5, 16(a2) -; RV32I-NEXT: sb t1, 17(a2) -; RV32I-NEXT: sb a7, 18(a2) -; RV32I-NEXT: sb a4, 19(a2) -; RV32I-NEXT: srli a4, a6, 24 -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: srli a7, a6, 8 -; RV32I-NEXT: sb a6, 20(a2) -; RV32I-NEXT: sb a7, 21(a2) -; RV32I-NEXT: sb a5, 22(a2) -; RV32I-NEXT: sb a4, 23(a2) -; RV32I-NEXT: srli a4, t0, 24 -; RV32I-NEXT: srli a5, t0, 16 -; RV32I-NEXT: srli a6, t0, 8 -; RV32I-NEXT: sb t0, 8(a2) -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: sb a4, 11(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: sb t5, 30(a2) +; RV32I-NEXT: sb t4, 31(a2) +; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: sb s2, 17(a2) +; RV32I-NEXT: sb s1, 18(a2) +; RV32I-NEXT: sb s0, 19(a2) +; RV32I-NEXT: srli a6, a0, 24 +; RV32I-NEXT: sb a5, 20(a2) +; RV32I-NEXT: sb s5, 21(a2) +; RV32I-NEXT: sb s4, 22(a2) +; RV32I-NEXT: sb s3, 23(a2) +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: sb a4, 8(a2) +; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb s7, 10(a2) +; RV32I-NEXT: sb s6, 11(a2) +; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: sb a4, 15(a2) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a4, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb a7, 2(a2) +; RV32I-NEXT: sb ra, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; 
RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -2793,344 +2887,394 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_32bytes_dwordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) -; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a5, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) +; RV64I-NEXT: lbu t2, 2(a0) +; RV64I-NEXT: lbu s3, 3(a0) +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu s8, 5(a0) +; RV64I-NEXT: lbu s9, 6(a0) +; RV64I-NEXT: lbu s10, 7(a0) +; RV64I-NEXT: lbu s2, 8(a0) +; RV64I-NEXT: lbu s4, 9(a0) +; RV64I-NEXT: lbu s5, 10(a0) +; RV64I-NEXT: lbu s6, 11(a0) +; RV64I-NEXT: lbu s7, 12(a0) +; RV64I-NEXT: lbu s11, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t3, 15(a0) +; RV64I-NEXT: lbu a3, 16(a0) +; RV64I-NEXT: lbu a6, 17(a0) +; RV64I-NEXT: lbu t4, 18(a0) +; RV64I-NEXT: lbu t5, 19(a0) +; RV64I-NEXT: lbu a4, 20(a0) +; RV64I-NEXT: lbu t6, 21(a0) +; RV64I-NEXT: lbu s0, 22(a0) +; RV64I-NEXT: lbu s1, 23(a0) ; RV64I-NEXT: slli a7, a7, 8 
-; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, s3, t2 +; RV64I-NEXT: or t0, s8, t0 +; RV64I-NEXT: or t2, s10, s9 +; RV64I-NEXT: lbu s3, 24(a0) +; RV64I-NEXT: lbu s8, 25(a0) +; RV64I-NEXT: lbu s9, 26(a0) +; RV64I-NEXT: lbu s10, 27(a0) +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: or s2, s4, s2 +; RV64I-NEXT: or s4, s6, s5 +; RV64I-NEXT: or s5, s11, s7 +; RV64I-NEXT: lbu s6, 28(a0) +; RV64I-NEXT: lbu s7, 29(a0) +; RV64I-NEXT: lbu s11, 30(a0) ; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) -; RV64I-NEXT: sd a3, 0(sp) -; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a5, 16(sp) -; RV64I-NEXT: sd a0, 24(sp) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t3, t1 +; RV64I-NEXT: mv t3, sp +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: slli t5, t5, 24 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: slli s1, s1, 24 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: slli s11, s11, 16 +; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a6, t5, t4 +; RV64I-NEXT: or a4, t6, a4 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or t4, s8, s3 +; RV64I-NEXT: or t5, s10, s9 +; RV64I-NEXT: or t6, s7, s6 +; RV64I-NEXT: or a0, a0, s11 ; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: mv a0, sp -; RV64I-NEXT: add a3, a0, a1 -; RV64I-NEXT: ld a4, 16(a3) -; RV64I-NEXT: ld a0, 8(a3) -; RV64I-NEXT: ld a1, 0(a3) -; RV64I-NEXT: ld a3, 24(a3) +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: or t0, s4, s2 +; RV64I-NEXT: or t1, t1, s5 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a4, s0, a4 +; RV64I-NEXT: or a6, t5, t4 +; RV64I-NEXT: or a0, a0, t6 +; RV64I-NEXT: add t3, t3, a1 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a1, a7, a5 +; RV64I-NEXT: or a5, t1, t0 +; RV64I-NEXT: or a3, a4, a3 +; 
RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: sd a1, 0(sp) +; RV64I-NEXT: sd a5, 8(sp) +; RV64I-NEXT: sd a3, 16(sp) +; RV64I-NEXT: sd a0, 24(sp) +; RV64I-NEXT: ld a4, 16(t3) +; RV64I-NEXT: ld a0, 8(t3) +; RV64I-NEXT: ld a1, 0(t3) +; RV64I-NEXT: ld a3, 24(t3) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 ; RV64I-NEXT: srli t0, a4, 32 +; RV64I-NEXT: srli t1, a4, 24 +; RV64I-NEXT: srli t2, a4, 16 +; RV64I-NEXT: srli t3, a4, 8 +; RV64I-NEXT: srli t4, a3, 56 +; RV64I-NEXT: srli t5, a3, 48 +; RV64I-NEXT: srli t6, a3, 40 +; RV64I-NEXT: srli s0, a3, 32 +; RV64I-NEXT: srli s1, a3, 24 +; RV64I-NEXT: srli s2, a3, 16 +; RV64I-NEXT: srli s3, a3, 8 +; RV64I-NEXT: srli s4, a1, 56 +; RV64I-NEXT: srli s5, a1, 48 +; RV64I-NEXT: srli s6, a1, 40 +; RV64I-NEXT: srli s7, a1, 32 +; RV64I-NEXT: srli s8, a1, 24 +; RV64I-NEXT: srli s9, a1, 16 +; RV64I-NEXT: srli s10, a1, 8 +; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) ; RV64I-NEXT: sb a7, 21(a2) ; RV64I-NEXT: sb a6, 22(a2) ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: srli a6, a4, 16 -; RV64I-NEXT: srli a7, a4, 8 +; RV64I-NEXT: srli a5, a0, 48 ; RV64I-NEXT: sb a4, 16(a2) -; RV64I-NEXT: sb a7, 17(a2) -; RV64I-NEXT: sb a6, 18(a2) -; RV64I-NEXT: sb a5, 19(a2) -; RV64I-NEXT: srli a4, a3, 56 -; RV64I-NEXT: srli a5, a3, 48 -; RV64I-NEXT: srli a6, a3, 40 -; RV64I-NEXT: srli a7, a3, 32 -; RV64I-NEXT: sb a7, 28(a2) -; RV64I-NEXT: sb a6, 29(a2) -; RV64I-NEXT: sb a5, 30(a2) -; RV64I-NEXT: sb a4, 31(a2) -; RV64I-NEXT: srli a4, a3, 24 -; RV64I-NEXT: srli a5, a3, 16 -; RV64I-NEXT: srli a6, a3, 8 +; RV64I-NEXT: sb t3, 17(a2) +; RV64I-NEXT: sb t2, 18(a2) +; RV64I-NEXT: sb t1, 19(a2) +; RV64I-NEXT: srli a4, a0, 40 +; RV64I-NEXT: sb s0, 28(a2) +; RV64I-NEXT: sb t6, 29(a2) +; RV64I-NEXT: sb t5, 30(a2) +; RV64I-NEXT: sb t4, 31(a2) +; RV64I-NEXT: srli a6, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) -; RV64I-NEXT: sb a6, 25(a2) -; RV64I-NEXT: sb a5, 26(a2) -; RV64I-NEXT: sb a4, 27(a2) -; RV64I-NEXT: srli a3, a1, 56 -; RV64I-NEXT: srli a4, a1, 48 -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: srli a6, a1, 32 -; RV64I-NEXT: sb a6, 4(a2) -; RV64I-NEXT: sb a5, 5(a2) -; RV64I-NEXT: sb a4, 6(a2) -; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: sb s3, 25(a2) +; RV64I-NEXT: sb s2, 26(a2) +; RV64I-NEXT: sb s1, 27(a2) +; RV64I-NEXT: srli a3, a0, 24 +; RV64I-NEXT: sb s7, 4(a2) +; RV64I-NEXT: sb s6, 5(a2) +; RV64I-NEXT: sb s5, 6(a2) +; RV64I-NEXT: sb s4, 7(a2) +; RV64I-NEXT: srli a7, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: sb a4, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 12(a2) +; RV64I-NEXT: sb s10, 1(a2) +; RV64I-NEXT: sb s9, 2(a2) +; RV64I-NEXT: sb s8, 3(a2) +; RV64I-NEXT: srli a1, a0, 8 +; RV64I-NEXT: sb a6, 12(a2) ; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb a5, 14(a2) +; RV64I-NEXT: sb s11, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a1, 11(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb a7, 10(a2) +; RV64I-NEXT: sb a3, 11(a2) +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte 
Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a7, 0(a0) +; RV32I-NEXT: lbu t0, 1(a0) +; RV32I-NEXT: lbu t1, 2(a0) +; RV32I-NEXT: lbu s1, 3(a0) +; RV32I-NEXT: lbu s7, 4(a0) +; RV32I-NEXT: lbu s8, 5(a0) +; RV32I-NEXT: lbu s4, 6(a0) +; RV32I-NEXT: lbu s6, 7(a0) +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s10, 9(a0) +; RV32I-NEXT: lbu s11, 10(a0) +; RV32I-NEXT: lbu ra, 11(a0) +; RV32I-NEXT: lbu t4, 12(a0) +; RV32I-NEXT: lbu t6, 13(a0) +; RV32I-NEXT: lbu a5, 14(a0) +; RV32I-NEXT: lbu a6, 15(a0) +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu t2, 17(a0) +; RV32I-NEXT: lbu t3, 18(a0) +; RV32I-NEXT: lbu t5, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu s0, 21(a0) +; RV32I-NEXT: lbu s2, 22(a0) +; RV32I-NEXT: lbu s3, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 18(a0) -; RV32I-NEXT: lbu t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t2, a7 
-; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or a7, a7, t0 -; RV32I-NEXT: lbu t0, 22(a0) -; RV32I-NEXT: lbu t3, 23(a0) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t0, t3, t0 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: lbu t1, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 ; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or t1, t4, t1 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: lbu t2, 30(a0) +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s1, t1 +; RV32I-NEXT: or t1, s8, s7 +; RV32I-NEXT: lbu s1, 24(a0) +; RV32I-NEXT: lbu s7, 25(a0) +; RV32I-NEXT: lbu s8, 26(a0) +; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s11, s11, 16 +; RV32I-NEXT: slli ra, ra, 24 +; RV32I-NEXT: or s4, s6, s4 +; RV32I-NEXT: or s5, s10, s5 +; RV32I-NEXT: or s6, ra, s11 +; RV32I-NEXT: lbu s10, 28(a0) +; RV32I-NEXT: lbu s11, 29(a0) +; RV32I-NEXT: lbu ra, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: or a0, a0, t3 ; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 64(sp) +; RV32I-NEXT: sw zero, 68(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw a0, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t4, t6, t4 +; RV32I-NEXT: addi t6, sp, 8 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a3, t2, a3 +; RV32I-NEXT: or a6, t5, t3 +; RV32I-NEXT: or a4, s0, a4 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: or t3, s7, s1 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or s0, s11, s10 +; RV32I-NEXT: or a0, a0, ra ; RV32I-NEXT: andi a1, a1, 24 -; RV32I-NEXT: mv a0, sp -; RV32I-NEXT: add a4, a0, a1 -; RV32I-NEXT: lw a5, 16(a4) -; RV32I-NEXT: lw a6, 20(a4) -; RV32I-NEXT: lw a7, 24(a4) -; RV32I-NEXT: lw a1, 0(a4) -; RV32I-NEXT: lw a0, 4(a4) -; RV32I-NEXT: lw t0, 8(a4) -; RV32I-NEXT: lw a3, 12(a4) -; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s4, t1 +; RV32I-NEXT: or t1, s6, s5 +; RV32I-NEXT: or a5, a5, t4 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, t2, a4 +; RV32I-NEXT: or a6, t5, t3 +; RV32I-NEXT: or a0, a0, s0 +; RV32I-NEXT: add t6, t6, a1 +; 
RV32I-NEXT: sw a3, 24(sp) +; RV32I-NEXT: sw a4, 28(sp) +; RV32I-NEXT: sw a6, 32(sp) +; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a7, 8(sp) +; RV32I-NEXT: sw t0, 12(sp) +; RV32I-NEXT: sw t1, 16(sp) +; RV32I-NEXT: sw a5, 20(sp) +; RV32I-NEXT: lw a6, 16(t6) +; RV32I-NEXT: lw a5, 20(t6) +; RV32I-NEXT: lw a7, 24(t6) +; RV32I-NEXT: lw a1, 0(t6) +; RV32I-NEXT: lw a0, 4(t6) +; RV32I-NEXT: lw a4, 8(t6) +; RV32I-NEXT: lw a3, 12(t6) +; RV32I-NEXT: lw t0, 28(t6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 +; RV32I-NEXT: srli t4, t0, 24 +; RV32I-NEXT: srli t5, t0, 16 +; RV32I-NEXT: srli t6, t0, 8 +; RV32I-NEXT: srli s0, a6, 24 +; RV32I-NEXT: srli s1, a6, 16 +; RV32I-NEXT: srli s2, a6, 8 +; RV32I-NEXT: srli s3, a5, 24 +; RV32I-NEXT: srli s4, a5, 16 +; RV32I-NEXT: srli s5, a5, 8 +; RV32I-NEXT: srli s6, a4, 24 +; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 16 +; RV32I-NEXT: srli s11, a3, 8 +; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) ; RV32I-NEXT: sb t3, 25(a2) ; RV32I-NEXT: sb t2, 26(a2) ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a4, 24 -; RV32I-NEXT: srli t1, a4, 16 -; RV32I-NEXT: srli t2, a4, 8 -; RV32I-NEXT: sb a4, 28(a2) -; RV32I-NEXT: sb t2, 29(a2) -; RV32I-NEXT: sb t1, 30(a2) -; RV32I-NEXT: sb a7, 31(a2) -; RV32I-NEXT: srli a4, a5, 24 -; RV32I-NEXT: srli a7, a5, 16 -; RV32I-NEXT: srli t1, a5, 8 -; RV32I-NEXT: sb a5, 16(a2) -; RV32I-NEXT: sb t1, 17(a2) -; RV32I-NEXT: sb a7, 18(a2) -; RV32I-NEXT: sb a4, 19(a2) -; RV32I-NEXT: srli a4, a6, 24 -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: srli a7, a6, 8 -; RV32I-NEXT: sb a6, 20(a2) -; RV32I-NEXT: sb a7, 21(a2) -; RV32I-NEXT: sb a5, 22(a2) -; RV32I-NEXT: sb a4, 23(a2) -; RV32I-NEXT: srli a4, t0, 24 -; RV32I-NEXT: srli a5, t0, 16 -; RV32I-NEXT: srli a6, t0, 8 -; RV32I-NEXT: sb t0, 8(a2) -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: sb a4, 11(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: sb t5, 30(a2) +; RV32I-NEXT: sb t4, 31(a2) +; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: sb s2, 17(a2) +; RV32I-NEXT: sb s1, 18(a2) +; RV32I-NEXT: sb s0, 19(a2) +; RV32I-NEXT: srli a6, a0, 24 +; RV32I-NEXT: sb a5, 20(a2) +; RV32I-NEXT: sb s5, 21(a2) +; RV32I-NEXT: sb s4, 22(a2) +; RV32I-NEXT: sb s3, 23(a2) +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: sb a4, 8(a2) +; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb s7, 10(a2) +; RV32I-NEXT: sb s6, 11(a2) +; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: sb a4, 15(a2) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a4, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb a7, 2(a2) +; RV32I-NEXT: sb ra, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 
4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 @@ -3143,428 +3287,472 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; 
RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t0, 1(a1) -; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: lbu a6, 2(a1) -; RV64I-NEXT: lbu t1, 3(a1) ; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 4(a1) -; RV64I-NEXT: lbu t1, 5(a1) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 6(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t3, s5, s4 +; RV64I-NEXT: or t4, s7, s6 +; RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu s6, 31(a0) +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: lbu s6, 5(a1) +; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; 
RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a7 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: slli s6, s6, 8 +; RV64I-NEXT: or s3, s6, s3 ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, s7 +; RV64I-NEXT: addi s6, sp, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t0, s0, t6 +; RV64I-NEXT: or t1, s5, s1 +; RV64I-NEXT: or t2, s4, s2 +; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, t1, t0 +; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: sd a3, 32(sp) ; RV64I-NEXT: sd a4, 40(sp) -; RV64I-NEXT: sd a5, 48(sp) -; RV64I-NEXT: sd a0, 56(sp) -; RV64I-NEXT: andi a0, a1, 24 -; RV64I-NEXT: addi a3, sp, 32 -; RV64I-NEXT: sub a3, a3, a0 -; RV64I-NEXT: ld a5, 0(a3) -; RV64I-NEXT: ld a6, 8(a3) -; RV64I-NEXT: slli a7, a1, 3 -; RV64I-NEXT: ld t0, 16(a3) -; RV64I-NEXT: ld a1, 24(a3) -; RV64I-NEXT: sll a4, a6, a7 -; RV64I-NEXT: andi a0, a7, 56 -; RV64I-NEXT: xori a3, a0, 63 -; RV64I-NEXT: srli a0, a5, 1 -; RV64I-NEXT: srl a0, a0, a3 -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: sll t1, a1, a7 -; RV64I-NEXT: srli a1, t0, 1 -; RV64I-NEXT: srl a1, a1, a3 -; RV64I-NEXT: or a1, t1, a1 -; RV64I-NEXT: sll t0, t0, a7 -; RV64I-NEXT: srli a6, a6, 1 -; RV64I-NEXT: srl a3, a6, a3 -; RV64I-NEXT: or a3, t0, a3 -; RV64I-NEXT: sll a5, a5, a7 -; RV64I-NEXT: srli a6, t0, 56 -; RV64I-NEXT: srli a7, a3, 48 -; RV64I-NEXT: srli t0, a3, 40 -; RV64I-NEXT: srli t2, a3, 32 -; RV64I-NEXT: sb t2, 20(a2) -; RV64I-NEXT: sb t0, 21(a2) -; RV64I-NEXT: sb a7, 22(a2) -; RV64I-NEXT: sb a6, 23(a2) -; RV64I-NEXT: srli a6, t1, 56 -; RV64I-NEXT: srli a7, a1, 48 -; RV64I-NEXT: srli t0, a1, 40 -; RV64I-NEXT: srli t1, a1, 32 -; RV64I-NEXT: sb t1, 28(a2) -; RV64I-NEXT: sb t0, 29(a2) -; RV64I-NEXT: sb a7, 30(a2) -; RV64I-NEXT: sb a6, 31(a2) -; RV64I-NEXT: srli a6, a5, 56 +; RV64I-NEXT: sd a0, 48(sp) +; RV64I-NEXT: sd a5, 56(sp) +; RV64I-NEXT: slli a0, a1, 3 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: sub a1, s6, a1 +; RV64I-NEXT: andi a3, a0, 56 +; RV64I-NEXT: ld a4, 0(a1) +; RV64I-NEXT: ld a5, 8(a1) +; RV64I-NEXT: ld a6, 16(a1) +; RV64I-NEXT: ld a1, 24(a1) +; RV64I-NEXT: xori a3, a3, 63 +; RV64I-NEXT: sll a7, a5, a0 +; RV64I-NEXT: srli t0, a4, 1 +; RV64I-NEXT: sll t1, a1, a0 +; RV64I-NEXT: srli a1, a6, 1 +; RV64I-NEXT: sll t2, a6, a0 +; RV64I-NEXT: srli a5, a5, 1 +; RV64I-NEXT: sll t3, a4, a0 +; RV64I-NEXT: srl a0, t0, a3 +; RV64I-NEXT: srl a4, a1, a3 +; RV64I-NEXT: srl a5, a5, a3 +; RV64I-NEXT: srli a3, t2, 56 +; RV64I-NEXT: srli a1, t1, 56 +; RV64I-NEXT: srli t0, t3, 56 +; RV64I-NEXT: srli t4, t3, 48 +; RV64I-NEXT: srli t5, t3, 40 +; RV64I-NEXT: srli t6, t3, 32 +; RV64I-NEXT: srli s0, t3, 24 +; RV64I-NEXT: srli s1, t3, 16 +; RV64I-NEXT: srli s2, t3, 8 +; RV64I-NEXT: srli a6, a7, 56 +; RV64I-NEXT: or a0, a7, a0 +; RV64I-NEXT: or a4, t1, a4 +; RV64I-NEXT: or a5, t2, a5 +; RV64I-NEXT: sb t6, 4(a2) +; RV64I-NEXT: sb t5, 5(a2) +; RV64I-NEXT: sb t4, 6(a2) +; RV64I-NEXT: sb t0, 7(a2) +; RV64I-NEXT: sb t3, 0(a2) +; RV64I-NEXT: sb 
s2, 1(a2) +; RV64I-NEXT: sb s1, 2(a2) +; RV64I-NEXT: sb s0, 3(a2) ; RV64I-NEXT: srli a7, a5, 48 ; RV64I-NEXT: srli t0, a5, 40 ; RV64I-NEXT: srli t1, a5, 32 -; RV64I-NEXT: sb t1, 4(a2) -; RV64I-NEXT: sb t0, 5(a2) -; RV64I-NEXT: sb a7, 6(a2) -; RV64I-NEXT: sb a6, 7(a2) -; RV64I-NEXT: srli a6, a5, 24 -; RV64I-NEXT: srli a7, a5, 16 -; RV64I-NEXT: srli t0, a5, 8 -; RV64I-NEXT: sb a5, 0(a2) -; RV64I-NEXT: sb t0, 1(a2) -; RV64I-NEXT: sb a7, 2(a2) -; RV64I-NEXT: sb a6, 3(a2) -; RV64I-NEXT: srli a4, a4, 56 -; RV64I-NEXT: srli a5, a0, 48 -; RV64I-NEXT: srli a6, a0, 40 -; RV64I-NEXT: srli a7, a0, 32 -; RV64I-NEXT: sb a7, 12(a2) -; RV64I-NEXT: sb a6, 13(a2) -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: sb a4, 15(a2) -; RV64I-NEXT: srli a4, a3, 24 -; RV64I-NEXT: srli a5, a3, 16 -; RV64I-NEXT: srli a6, a3, 8 -; RV64I-NEXT: sb a3, 16(a2) -; RV64I-NEXT: sb a6, 17(a2) -; RV64I-NEXT: sb a5, 18(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 -; RV64I-NEXT: sb a1, 24(a2) -; RV64I-NEXT: sb a5, 25(a2) -; RV64I-NEXT: sb a4, 26(a2) -; RV64I-NEXT: sb a3, 27(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: srli t2, a5, 24 +; RV64I-NEXT: srli t3, a5, 16 +; RV64I-NEXT: srli t4, a5, 8 +; RV64I-NEXT: srli t5, a4, 48 +; RV64I-NEXT: srli t6, a4, 40 +; RV64I-NEXT: srli s0, a4, 32 +; RV64I-NEXT: srli s1, a4, 24 +; RV64I-NEXT: srli s2, a4, 16 +; RV64I-NEXT: srli s3, a4, 8 +; RV64I-NEXT: srli s4, a0, 48 +; RV64I-NEXT: srli s5, a0, 40 +; RV64I-NEXT: srli s6, a0, 32 +; RV64I-NEXT: sb t1, 20(a2) +; RV64I-NEXT: sb t0, 21(a2) +; RV64I-NEXT: sb a7, 22(a2) +; RV64I-NEXT: sb a3, 23(a2) +; RV64I-NEXT: srli a3, a0, 24 +; RV64I-NEXT: sb s0, 28(a2) +; RV64I-NEXT: sb t6, 29(a2) +; RV64I-NEXT: sb t5, 30(a2) +; RV64I-NEXT: sb a1, 31(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb s6, 12(a2) +; RV64I-NEXT: sb s5, 13(a2) +; RV64I-NEXT: sb s4, 14(a2) +; RV64I-NEXT: sb a6, 15(a2) +; RV64I-NEXT: srli a6, a0, 8 +; RV64I-NEXT: sb a5, 16(a2) +; RV64I-NEXT: sb t4, 17(a2) +; RV64I-NEXT: sb t3, 18(a2) +; RV64I-NEXT: sb t2, 19(a2) +; RV64I-NEXT: sb a4, 24(a2) +; RV64I-NEXT: sb s3, 25(a2) +; RV64I-NEXT: sb s2, 26(a2) +; RV64I-NEXT: sb s1, 27(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a1, 11(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: sb a6, 9(a2) +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: sb a3, 11(a2) +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -80 -; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 64(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: addi sp, sp, -128 +; 
RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s1, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu t1, 4(a0) +; RV32I-NEXT: lbu t3, 5(a0) +; RV32I-NEXT: lbu t4, 6(a0) +; RV32I-NEXT: lbu s0, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu s3, 9(a0) +; RV32I-NEXT: lbu s6, 10(a0) +; RV32I-NEXT: lbu s8, 11(a0) +; RV32I-NEXT: lbu s9, 12(a0) +; RV32I-NEXT: lbu s10, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s7, 15(a0) +; RV32I-NEXT: lbu s5, 16(a0) +; RV32I-NEXT: lbu s11, 17(a0) +; RV32I-NEXT: lbu ra, 18(a0) +; RV32I-NEXT: lbu a3, 19(a0) +; RV32I-NEXT: lbu t5, 20(a0) +; RV32I-NEXT: lbu t6, 21(a0) +; RV32I-NEXT: lbu a7, 22(a0) +; RV32I-NEXT: lbu t0, 23(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 18(a0) -; RV32I-NEXT: lbu t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t2, a7 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or a7, a7, t0 -; RV32I-NEXT: lbu t0, 22(a0) -; RV32I-NEXT: lbu t3, 23(a0) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t0, t3, t0 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: lbu t1, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) ; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or t1, t4, t1 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: lbu t2, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; 
RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: lbu t2, 0(a1) -; RV32I-NEXT: lbu t4, 1(a1) -; RV32I-NEXT: or a0, a0, t3 -; RV32I-NEXT: lbu t3, 2(a1) +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: or a4, a4, s1 +; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t3, t1 +; RV32I-NEXT: or a6, s0, t4 +; RV32I-NEXT: lbu t1, 24(a0) +; RV32I-NEXT: lbu s0, 25(a0) +; RV32I-NEXT: lbu s1, 26(a0) +; RV32I-NEXT: lbu s2, 27(a0) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s8, s8, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: or t2, s3, t2 +; RV32I-NEXT: or t3, s8, s6 +; RV32I-NEXT: or t4, s10, s9 +; RV32I-NEXT: lbu s3, 28(a0) +; RV32I-NEXT: lbu s6, 29(a0) +; RV32I-NEXT: lbu s8, 30(a0) +; RV32I-NEXT: lbu s9, 31(a0) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s7, s4 +; RV32I-NEXT: or s4, s11, s5 +; RV32I-NEXT: or s5, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s7, 1(a1) +; RV32I-NEXT: lbu s10, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t2, t4, t2 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t3 -; RV32I-NEXT: or a1, a1, t2 -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 0(sp) -; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw a7, 48(sp) -; RV32I-NEXT: sw t0, 52(sp) -; RV32I-NEXT: sw t1, 56(sp) -; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a3, 32(sp) -; RV32I-NEXT: sw a4, 36(sp) -; RV32I-NEXT: sw a5, 40(sp) -; RV32I-NEXT: sw a6, 44(sp) -; RV32I-NEXT: andi a0, a1, 28 -; RV32I-NEXT: addi a3, sp, 32 -; RV32I-NEXT: sub a7, a3, a0 -; RV32I-NEXT: lw t3, 0(a7) -; RV32I-NEXT: lw a6, 4(a7) -; RV32I-NEXT: slli t4, a1, 3 -; RV32I-NEXT: lw a5, 8(a7) -; RV32I-NEXT: lw t2, 12(a7) -; RV32I-NEXT: sll a0, a6, t4 -; RV32I-NEXT: andi a1, t4, 24 -; RV32I-NEXT: xori t5, a1, 31 -; RV32I-NEXT: srli a1, t3, 1 -; RV32I-NEXT: srl a1, a1, t5 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: sll a4, t2, t4 -; RV32I-NEXT: srli a3, a5, 1 -; RV32I-NEXT: srl a3, a3, t5 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: sll a5, a5, t4 -; RV32I-NEXT: srli a6, a6, 1 -; RV32I-NEXT: srl a6, a6, t5 -; RV32I-NEXT: lw t6, 16(a7) -; RV32I-NEXT: lw s0, 20(a7) -; RV32I-NEXT: or a6, a5, a6 -; RV32I-NEXT: lw s1, 24(a7) -; RV32I-NEXT: lw a7, 28(a7) -; RV32I-NEXT: sll t1, s0, t4 -; RV32I-NEXT: srli t0, t6, 1 -; RV32I-NEXT: srl t0, t0, t5 +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: addi t6, sp, 40 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s2, s2, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s0, t1 +; RV32I-NEXT: or t1, s2, s1 +; RV32I-NEXT: or s0, s6, s3 +; RV32I-NEXT: or s1, s9, s8 +; RV32I-NEXT: or a3, s7, a3 +; RV32I-NEXT: or a1, a1, s10 +; RV32I-NEXT: lw s2, 4(sp) # 4-byte 
Folded Reload +; RV32I-NEXT: or a4, a4, s2 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t3, t2 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or a7, a7, t5 ; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: sll t6, t6, t4 -; RV32I-NEXT: srli t2, t2, 1 -; RV32I-NEXT: srl t2, t2, t5 -; RV32I-NEXT: or t2, t6, t2 -; RV32I-NEXT: sll s2, a7, t4 -; RV32I-NEXT: srli a7, s1, 1 -; RV32I-NEXT: srl a7, a7, t5 -; RV32I-NEXT: or s3, s2, a7 -; RV32I-NEXT: sll s1, s1, t4 -; RV32I-NEXT: srli s0, s0, 1 -; RV32I-NEXT: srl a7, s0, t5 -; RV32I-NEXT: or t5, s1, a7 -; RV32I-NEXT: sll a7, t3, t4 -; RV32I-NEXT: srli s1, s1, 24 -; RV32I-NEXT: srli t3, t5, 16 -; RV32I-NEXT: srli t4, t5, 8 -; RV32I-NEXT: sb t5, 24(a2) -; RV32I-NEXT: sb t4, 25(a2) -; RV32I-NEXT: sb t3, 26(a2) -; RV32I-NEXT: sb s1, 27(a2) -; RV32I-NEXT: srli t3, s2, 24 -; RV32I-NEXT: srli t4, s3, 16 -; RV32I-NEXT: srli t5, s3, 8 -; RV32I-NEXT: sb s3, 28(a2) -; RV32I-NEXT: sb t5, 29(a2) -; RV32I-NEXT: sb t4, 30(a2) -; RV32I-NEXT: sb t3, 31(a2) -; RV32I-NEXT: srli t3, t6, 24 -; RV32I-NEXT: srli t4, t2, 16 -; RV32I-NEXT: srli t5, t2, 8 -; RV32I-NEXT: sb t2, 16(a2) -; RV32I-NEXT: sb t5, 17(a2) -; RV32I-NEXT: sb t4, 18(a2) -; RV32I-NEXT: sb t3, 19(a2) -; RV32I-NEXT: srli t1, t1, 24 -; RV32I-NEXT: srli t2, t0, 16 -; RV32I-NEXT: srli t3, t0, 8 -; RV32I-NEXT: sb t0, 20(a2) -; RV32I-NEXT: sb t3, 21(a2) -; RV32I-NEXT: sb t2, 22(a2) -; RV32I-NEXT: sb t1, 23(a2) -; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: srli t0, a6, 16 -; RV32I-NEXT: srli t1, a6, 8 -; RV32I-NEXT: sb a6, 8(a2) -; RV32I-NEXT: sb t1, 9(a2) -; RV32I-NEXT: sb t0, 10(a2) -; RV32I-NEXT: sb a5, 11(a2) -; RV32I-NEXT: srli a4, a4, 24 -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: srli a6, a3, 8 -; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: sb a4, 15(a2) -; RV32I-NEXT: srli a3, a7, 24 -; RV32I-NEXT: srli a4, a7, 16 -; RV32I-NEXT: srli a5, a7, 8 -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a4, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: srli a4, a1, 8 -; RV32I-NEXT: sb a1, 4(a2) -; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a0, 7(a2) -; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 80 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: sw t2, 56(sp) +; RV32I-NEXT: sw a7, 60(sp) +; RV32I-NEXT: sw t0, 64(sp) +; RV32I-NEXT: sw s0, 68(sp) +; RV32I-NEXT: sw a4, 40(sp) +; RV32I-NEXT: sw a5, 44(sp) +; RV32I-NEXT: sw a6, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: slli a3, a1, 3 +; RV32I-NEXT: andi a1, a1, 28 +; RV32I-NEXT: sub a1, t6, a1 +; RV32I-NEXT: andi a0, a3, 24 +; RV32I-NEXT: xori a0, a0, 31 +; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: lw a7, 12(a1) +; RV32I-NEXT: lw t0, 16(a1) +; RV32I-NEXT: lw t1, 20(a1) +; RV32I-NEXT: lw t2, 24(a1) +; RV32I-NEXT: lw a1, 28(a1) +; RV32I-NEXT: sll t3, a5, a3 +; RV32I-NEXT: srli t4, a4, 1 +; RV32I-NEXT: sll t5, a7, a3 +; RV32I-NEXT: srli t6, a6, 1 +; RV32I-NEXT: sll s0, a6, a3 +; RV32I-NEXT: srli a5, a5, 1 +; RV32I-NEXT: sll s1, t1, a3 +; RV32I-NEXT: srli a6, t0, 1 +; RV32I-NEXT: sll s2, t0, a3 +; RV32I-NEXT: srli a7, a7, 1 +; RV32I-NEXT: sll s3, a1, a3 +; RV32I-NEXT: srli a1, t2, 1 +; RV32I-NEXT: sll s4, t2, 
a3 +; RV32I-NEXT: srli t0, t1, 1 +; RV32I-NEXT: sll s5, a4, a3 +; RV32I-NEXT: srl t2, t4, a0 +; RV32I-NEXT: srl t4, t6, a0 +; RV32I-NEXT: srl t6, a5, a0 +; RV32I-NEXT: srl s6, a6, a0 +; RV32I-NEXT: srl s7, a7, a0 +; RV32I-NEXT: srl s8, a1, a0 +; RV32I-NEXT: srl s9, t0, a0 +; RV32I-NEXT: srli t1, s4, 24 +; RV32I-NEXT: srli a7, s3, 24 +; RV32I-NEXT: srli a5, s2, 24 +; RV32I-NEXT: srli a3, s1, 24 +; RV32I-NEXT: srli a1, s0, 24 +; RV32I-NEXT: srli a0, t5, 24 +; RV32I-NEXT: srli s10, s5, 24 +; RV32I-NEXT: srli s11, s5, 16 +; RV32I-NEXT: srli ra, s5, 8 +; RV32I-NEXT: srli a4, t3, 24 +; RV32I-NEXT: or a6, t3, t2 +; RV32I-NEXT: or t0, t5, t4 +; RV32I-NEXT: or t2, s0, t6 +; RV32I-NEXT: or t3, s1, s6 +; RV32I-NEXT: or t4, s2, s7 +; RV32I-NEXT: or t5, s3, s8 +; RV32I-NEXT: or t6, s4, s9 +; RV32I-NEXT: sb s5, 0(a2) +; RV32I-NEXT: sb ra, 1(a2) +; RV32I-NEXT: sb s11, 2(a2) +; RV32I-NEXT: sb s10, 3(a2) +; RV32I-NEXT: srli s0, t6, 16 +; RV32I-NEXT: srli s1, t6, 8 +; RV32I-NEXT: srli s2, t5, 16 +; RV32I-NEXT: srli s3, t5, 8 +; RV32I-NEXT: srli s4, t4, 16 +; RV32I-NEXT: srli s5, t4, 8 +; RV32I-NEXT: srli s6, t3, 16 +; RV32I-NEXT: srli s7, t3, 8 +; RV32I-NEXT: srli s8, t2, 16 +; RV32I-NEXT: srli s9, t2, 8 +; RV32I-NEXT: srli s10, t0, 16 +; RV32I-NEXT: srli s11, t0, 8 +; RV32I-NEXT: sb t6, 24(a2) +; RV32I-NEXT: sb s1, 25(a2) +; RV32I-NEXT: sb s0, 26(a2) +; RV32I-NEXT: sb t1, 27(a2) +; RV32I-NEXT: srli t1, a6, 16 +; RV32I-NEXT: sb t5, 28(a2) +; RV32I-NEXT: sb s3, 29(a2) +; RV32I-NEXT: sb s2, 30(a2) +; RV32I-NEXT: sb a7, 31(a2) +; RV32I-NEXT: srli a7, a6, 8 +; RV32I-NEXT: sb t4, 16(a2) +; RV32I-NEXT: sb s5, 17(a2) +; RV32I-NEXT: sb s4, 18(a2) +; RV32I-NEXT: sb a5, 19(a2) +; RV32I-NEXT: sb t3, 20(a2) +; RV32I-NEXT: sb s7, 21(a2) +; RV32I-NEXT: sb s6, 22(a2) +; RV32I-NEXT: sb a3, 23(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb s9, 9(a2) +; RV32I-NEXT: sb s8, 10(a2) +; RV32I-NEXT: sb a1, 11(a2) +; RV32I-NEXT: sb t0, 12(a2) +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb a0, 15(a2) +; RV32I-NEXT: sb a6, 4(a2) +; RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb t1, 6(a2) +; RV32I-NEXT: sb a4, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -3577,381 +3765,431 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_32bytes_wordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd 
s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t0, 1(a1) -; 
RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: lbu a6, 2(a1) -; RV64I-NEXT: lbu t1, 3(a1) +; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 4(a1) -; RV64I-NEXT: lbu t1, 5(a1) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 6(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t3, s5, s4 +; RV64I-NEXT: or t4, s7, s6 +; RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu s6, 31(a0) +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: lbu s6, 5(a1) +; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a7 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: slli s6, s6, 8 +; RV64I-NEXT: or s3, s6, s3 ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, s7 +; RV64I-NEXT: addi s6, sp, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t0, s0, t6 +; RV64I-NEXT: or t1, s5, s1 +; RV64I-NEXT: or t2, s4, s2 +; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, t1, t0 +; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: sd a3, 32(sp) ; RV64I-NEXT: sd a4, 40(sp) -; RV64I-NEXT: sd a5, 48(sp) -; RV64I-NEXT: sd a0, 56(sp) -; RV64I-NEXT: slli a0, a1, 2 -; RV64I-NEXT: andi a0, a0, 24 -; RV64I-NEXT: addi a3, sp, 32 -; 
RV64I-NEXT: sub a3, a3, a0 -; RV64I-NEXT: ld a5, 0(a3) -; RV64I-NEXT: ld a4, 8(a3) -; RV64I-NEXT: slli a6, a1, 5 -; RV64I-NEXT: ld a7, 16(a3) -; RV64I-NEXT: ld a1, 24(a3) -; RV64I-NEXT: sll a3, a4, a6 -; RV64I-NEXT: andi a0, a6, 32 -; RV64I-NEXT: xori t0, a0, 63 -; RV64I-NEXT: srli a0, a5, 1 -; RV64I-NEXT: srl a0, a0, t0 -; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: sll t1, a1, a6 -; RV64I-NEXT: srli a1, a7, 1 -; RV64I-NEXT: srl a1, a1, t0 -; RV64I-NEXT: or a1, t1, a1 -; RV64I-NEXT: sll a7, a7, a6 -; RV64I-NEXT: srli a4, a4, 1 -; RV64I-NEXT: srl a4, a4, t0 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: sll a5, a5, a6 -; RV64I-NEXT: srli a6, a7, 56 -; RV64I-NEXT: srli t0, a7, 48 -; RV64I-NEXT: srli t2, a7, 40 -; RV64I-NEXT: srli a7, a7, 32 -; RV64I-NEXT: sb a7, 20(a2) +; RV64I-NEXT: sd a0, 48(sp) +; RV64I-NEXT: sd a5, 56(sp) +; RV64I-NEXT: slli a3, a1, 5 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: andi a0, a3, 32 +; RV64I-NEXT: sub a1, s6, a1 +; RV64I-NEXT: ld a4, 0(a1) +; RV64I-NEXT: ld a5, 8(a1) +; RV64I-NEXT: ld a6, 16(a1) +; RV64I-NEXT: ld a1, 24(a1) +; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: sll a0, a5, a3 +; RV64I-NEXT: srli t0, a4, 1 +; RV64I-NEXT: sll a1, a1, a3 +; RV64I-NEXT: srli t1, a6, 1 +; RV64I-NEXT: sll a6, a6, a3 +; RV64I-NEXT: srli a5, a5, 1 +; RV64I-NEXT: sll a3, a4, a3 +; RV64I-NEXT: srl a4, t0, a7 +; RV64I-NEXT: srl t0, t1, a7 +; RV64I-NEXT: srl a5, a5, a7 +; RV64I-NEXT: srli a7, a6, 56 +; RV64I-NEXT: srli t1, a6, 48 +; RV64I-NEXT: srli t2, a6, 40 +; RV64I-NEXT: srli t3, a6, 32 +; RV64I-NEXT: srli t4, a1, 56 +; RV64I-NEXT: srli t5, a1, 48 +; RV64I-NEXT: srli t6, a1, 40 +; RV64I-NEXT: srli s0, a1, 32 +; RV64I-NEXT: srli s1, a3, 56 +; RV64I-NEXT: srli s2, a3, 48 +; RV64I-NEXT: srli s3, a3, 40 +; RV64I-NEXT: srli s4, a3, 32 +; RV64I-NEXT: srli s5, a3, 24 +; RV64I-NEXT: srli s6, a3, 16 +; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: srli t0, a3, 8 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: srli a6, a0, 56 +; RV64I-NEXT: sb t3, 20(a2) ; RV64I-NEXT: sb t2, 21(a2) -; RV64I-NEXT: sb t0, 22(a2) -; RV64I-NEXT: sb a6, 23(a2) -; RV64I-NEXT: srli a6, t1, 56 -; RV64I-NEXT: srli a7, t1, 48 -; RV64I-NEXT: srli t0, t1, 40 -; RV64I-NEXT: srli t1, t1, 32 -; RV64I-NEXT: sb t1, 28(a2) -; RV64I-NEXT: sb t0, 29(a2) -; RV64I-NEXT: sb a7, 30(a2) -; RV64I-NEXT: sb a6, 31(a2) -; RV64I-NEXT: srli a6, a5, 56 -; RV64I-NEXT: srli a7, a5, 48 -; RV64I-NEXT: srli t0, a5, 40 -; RV64I-NEXT: srli t1, a5, 32 -; RV64I-NEXT: sb t1, 4(a2) -; RV64I-NEXT: sb t0, 5(a2) -; RV64I-NEXT: sb a7, 6(a2) -; RV64I-NEXT: sb a6, 7(a2) -; RV64I-NEXT: srli a6, a5, 24 -; RV64I-NEXT: srli a7, a5, 16 -; RV64I-NEXT: srli t0, a5, 8 -; RV64I-NEXT: sb a5, 0(a2) +; RV64I-NEXT: sb t1, 22(a2) +; RV64I-NEXT: sb a7, 23(a2) +; RV64I-NEXT: srli a7, a0, 48 +; RV64I-NEXT: sb s0, 28(a2) +; RV64I-NEXT: sb t6, 29(a2) +; RV64I-NEXT: sb t5, 30(a2) +; RV64I-NEXT: sb t4, 31(a2) +; RV64I-NEXT: srli t1, a0, 40 +; RV64I-NEXT: or a4, a0, a4 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: sb s4, 4(a2) +; RV64I-NEXT: sb s3, 5(a2) +; RV64I-NEXT: sb s2, 6(a2) +; RV64I-NEXT: sb s1, 7(a2) +; RV64I-NEXT: sb a3, 0(a2) ; RV64I-NEXT: sb t0, 1(a2) -; RV64I-NEXT: sb a7, 2(a2) -; RV64I-NEXT: sb a6, 3(a2) -; RV64I-NEXT: srli a5, a3, 56 -; RV64I-NEXT: srli a6, a3, 48 -; RV64I-NEXT: srli a7, a3, 40 -; RV64I-NEXT: srli a3, a3, 32 -; RV64I-NEXT: sb a3, 12(a2) -; RV64I-NEXT: sb a7, 13(a2) -; RV64I-NEXT: sb a6, 14(a2) -; RV64I-NEXT: sb a5, 15(a2) -; RV64I-NEXT: srli a3, a4, 24 -; RV64I-NEXT: srli a5, a4, 16 -; RV64I-NEXT: srli a6, a4, 8 -; 
RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: sb s6, 2(a2) +; RV64I-NEXT: sb s5, 3(a2) +; RV64I-NEXT: sb a0, 12(a2) +; RV64I-NEXT: sb t1, 13(a2) +; RV64I-NEXT: sb a7, 14(a2) +; RV64I-NEXT: sb a6, 15(a2) +; RV64I-NEXT: srli a0, a5, 24 +; RV64I-NEXT: srli a3, a5, 16 +; RV64I-NEXT: srli a6, a5, 8 +; RV64I-NEXT: srli a7, a1, 24 +; RV64I-NEXT: srli t0, a1, 16 +; RV64I-NEXT: srli t1, a1, 8 +; RV64I-NEXT: srli t2, a4, 24 +; RV64I-NEXT: srli t3, a4, 16 +; RV64I-NEXT: srli t4, a4, 8 +; RV64I-NEXT: sb a5, 16(a2) ; RV64I-NEXT: sb a6, 17(a2) -; RV64I-NEXT: sb a5, 18(a2) -; RV64I-NEXT: sb a3, 19(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: sb a3, 18(a2) +; RV64I-NEXT: sb a0, 19(a2) ; RV64I-NEXT: sb a1, 24(a2) -; RV64I-NEXT: sb a5, 25(a2) -; RV64I-NEXT: sb a4, 26(a2) -; RV64I-NEXT: sb a3, 27(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 -; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a1, 11(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: sb t1, 25(a2) +; RV64I-NEXT: sb t0, 26(a2) +; RV64I-NEXT: sb a7, 27(a2) +; RV64I-NEXT: sb a4, 8(a2) +; RV64I-NEXT: sb t4, 9(a2) +; RV64I-NEXT: sb t3, 10(a2) +; RV64I-NEXT: sb t2, 11(a2) +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_32bytes_wordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 
4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a7, 0(a0) +; RV32I-NEXT: lbu t0, 1(a0) +; RV32I-NEXT: lbu t1, 2(a0) +; RV32I-NEXT: lbu s1, 3(a0) +; RV32I-NEXT: lbu s7, 4(a0) +; RV32I-NEXT: lbu s8, 5(a0) +; RV32I-NEXT: lbu s4, 6(a0) +; RV32I-NEXT: lbu s6, 7(a0) +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s10, 9(a0) +; RV32I-NEXT: lbu s11, 10(a0) +; RV32I-NEXT: lbu ra, 11(a0) +; RV32I-NEXT: lbu t4, 12(a0) +; RV32I-NEXT: lbu t6, 13(a0) +; RV32I-NEXT: lbu a5, 14(a0) +; RV32I-NEXT: lbu a6, 15(a0) +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu t2, 17(a0) +; RV32I-NEXT: lbu t3, 18(a0) +; RV32I-NEXT: lbu t5, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu s0, 21(a0) +; RV32I-NEXT: lbu s2, 22(a0) +; RV32I-NEXT: lbu s3, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 18(a0) -; RV32I-NEXT: lbu t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t2, a7 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or a7, a7, t0 -; RV32I-NEXT: lbu t0, 22(a0) -; RV32I-NEXT: lbu t3, 23(a0) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t0, t3, t0 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: lbu t1, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 ; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or t1, t4, t1 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: lbu t2, 30(a0) +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s1, t1 +; RV32I-NEXT: or t1, s8, s7 +; RV32I-NEXT: lbu s1, 24(a0) +; RV32I-NEXT: lbu s7, 25(a0) +; RV32I-NEXT: lbu s8, 26(a0) +; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s11, s11, 16 +; RV32I-NEXT: slli ra, ra, 24 +; RV32I-NEXT: or s4, s6, s4 +; RV32I-NEXT: or s5, s10, s5 +; RV32I-NEXT: or s6, ra, s11 +; RV32I-NEXT: lbu s10, 28(a0) +; RV32I-NEXT: lbu s11, 29(a0) +; RV32I-NEXT: lbu ra, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: or a0, a0, t3 ; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 0(sp) -; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw a7, 48(sp) -; RV32I-NEXT: sw t0, 52(sp) -; RV32I-NEXT: sw t1, 56(sp) -; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a3, 32(sp) -; RV32I-NEXT: sw a4, 36(sp) -; RV32I-NEXT: sw a5, 40(sp) -; RV32I-NEXT: sw a6, 44(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) +; 
RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t4, t6, t4 +; RV32I-NEXT: addi t6, sp, 40 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a3, t2, a3 +; RV32I-NEXT: or a6, t5, t3 +; RV32I-NEXT: or a4, s0, a4 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: or t3, s7, s1 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or s0, s11, s10 +; RV32I-NEXT: or a0, a0, ra ; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: addi a0, sp, 32 -; RV32I-NEXT: sub a4, a0, a1 -; RV32I-NEXT: lw a5, 16(a4) -; RV32I-NEXT: lw a6, 20(a4) -; RV32I-NEXT: lw a7, 24(a4) -; RV32I-NEXT: lw a1, 0(a4) -; RV32I-NEXT: lw a0, 4(a4) -; RV32I-NEXT: lw t0, 8(a4) -; RV32I-NEXT: lw a3, 12(a4) -; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s4, t1 +; RV32I-NEXT: or t1, s6, s5 +; RV32I-NEXT: or a5, a5, t4 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, t2, a4 +; RV32I-NEXT: or a6, t5, t3 +; RV32I-NEXT: or a0, a0, s0 +; RV32I-NEXT: sub t2, t6, a1 +; RV32I-NEXT: sw a3, 56(sp) +; RV32I-NEXT: sw a4, 60(sp) +; RV32I-NEXT: sw a6, 64(sp) +; RV32I-NEXT: sw a0, 68(sp) +; RV32I-NEXT: sw a7, 40(sp) +; RV32I-NEXT: sw t0, 44(sp) +; RV32I-NEXT: sw t1, 48(sp) +; RV32I-NEXT: sw a5, 52(sp) +; RV32I-NEXT: lw a6, 16(t2) +; RV32I-NEXT: lw a5, 20(t2) +; RV32I-NEXT: lw a7, 24(t2) +; RV32I-NEXT: lw a1, 0(t2) +; RV32I-NEXT: lw a0, 4(t2) +; RV32I-NEXT: lw a4, 8(t2) +; RV32I-NEXT: lw a3, 12(t2) +; RV32I-NEXT: lw t0, 28(t2) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 +; RV32I-NEXT: srli t4, t0, 24 +; RV32I-NEXT: srli t5, t0, 16 +; RV32I-NEXT: srli t6, t0, 8 +; RV32I-NEXT: srli s0, a6, 24 +; RV32I-NEXT: srli s1, a6, 16 +; RV32I-NEXT: srli s2, a6, 8 +; RV32I-NEXT: srli s3, a5, 24 +; RV32I-NEXT: srli s4, a5, 16 +; RV32I-NEXT: srli s5, a5, 8 +; RV32I-NEXT: srli s6, a4, 24 +; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 16 +; RV32I-NEXT: srli s11, a3, 8 +; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) ; RV32I-NEXT: sb t3, 25(a2) ; RV32I-NEXT: sb t2, 26(a2) ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a4, 24 -; RV32I-NEXT: srli t1, a4, 16 -; RV32I-NEXT: srli t2, a4, 8 -; RV32I-NEXT: sb a4, 28(a2) -; RV32I-NEXT: sb t2, 29(a2) -; RV32I-NEXT: sb t1, 30(a2) -; RV32I-NEXT: sb a7, 31(a2) -; RV32I-NEXT: srli a4, a5, 24 -; RV32I-NEXT: srli a7, a5, 16 -; RV32I-NEXT: srli t1, a5, 8 -; RV32I-NEXT: sb a5, 16(a2) -; RV32I-NEXT: sb t1, 17(a2) -; RV32I-NEXT: sb a7, 18(a2) -; RV32I-NEXT: sb a4, 19(a2) -; RV32I-NEXT: srli a4, a6, 24 -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: srli a7, a6, 8 -; RV32I-NEXT: sb a6, 20(a2) -; RV32I-NEXT: sb a7, 21(a2) -; RV32I-NEXT: sb a5, 22(a2) -; RV32I-NEXT: sb a4, 23(a2) -; RV32I-NEXT: srli a4, t0, 24 -; RV32I-NEXT: srli a5, t0, 16 -; RV32I-NEXT: srli a6, t0, 8 -; RV32I-NEXT: sb t0, 8(a2) -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: sb a4, 11(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: sb t6, 
29(a2) +; RV32I-NEXT: sb t5, 30(a2) +; RV32I-NEXT: sb t4, 31(a2) +; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: sb s2, 17(a2) +; RV32I-NEXT: sb s1, 18(a2) +; RV32I-NEXT: sb s0, 19(a2) +; RV32I-NEXT: srli a6, a0, 24 +; RV32I-NEXT: sb a5, 20(a2) +; RV32I-NEXT: sb s5, 21(a2) +; RV32I-NEXT: sb s4, 22(a2) +; RV32I-NEXT: sb s3, 23(a2) +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: sb a4, 8(a2) +; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb s7, 10(a2) +; RV32I-NEXT: sb s6, 11(a2) +; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: sb a4, 15(a2) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a4, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb a7, 2(a2) +; RV32I-NEXT: sb ra, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -3964,344 +4202,394 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_32bytes_dwordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) -; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; 
RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a5, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) +; RV64I-NEXT: lbu t2, 2(a0) +; RV64I-NEXT: lbu s3, 3(a0) +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu s8, 5(a0) +; RV64I-NEXT: lbu s9, 6(a0) +; RV64I-NEXT: lbu s10, 7(a0) +; RV64I-NEXT: lbu s2, 8(a0) +; RV64I-NEXT: lbu s4, 9(a0) +; RV64I-NEXT: lbu s5, 10(a0) +; RV64I-NEXT: lbu s6, 11(a0) +; RV64I-NEXT: lbu s7, 12(a0) +; RV64I-NEXT: lbu s11, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t3, 15(a0) +; RV64I-NEXT: lbu a3, 16(a0) +; RV64I-NEXT: lbu a6, 17(a0) +; RV64I-NEXT: lbu t4, 18(a0) +; RV64I-NEXT: lbu t5, 19(a0) +; RV64I-NEXT: lbu a4, 20(a0) +; RV64I-NEXT: lbu t6, 21(a0) +; RV64I-NEXT: lbu s0, 22(a0) +; RV64I-NEXT: lbu s1, 23(a0) ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, s3, t2 +; RV64I-NEXT: or t0, s8, t0 +; RV64I-NEXT: or t2, s10, s9 +; RV64I-NEXT: lbu s3, 24(a0) +; RV64I-NEXT: lbu s8, 25(a0) +; RV64I-NEXT: lbu s9, 26(a0) +; RV64I-NEXT: lbu s10, 27(a0) +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: or s2, s4, s2 +; RV64I-NEXT: or s4, s6, s5 +; RV64I-NEXT: or s5, s11, s7 +; RV64I-NEXT: lbu s6, 28(a0) +; RV64I-NEXT: lbu s7, 29(a0) +; RV64I-NEXT: lbu s11, 30(a0) ; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a0, a0, a6 
; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) -; RV64I-NEXT: sd a3, 32(sp) -; RV64I-NEXT: sd a4, 40(sp) -; RV64I-NEXT: sd a5, 48(sp) -; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t3, t1 +; RV64I-NEXT: addi t3, sp, 32 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: slli t5, t5, 24 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: slli s1, s1, 24 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: slli s11, s11, 16 +; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a6, t5, t4 +; RV64I-NEXT: or a4, t6, a4 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or t4, s8, s3 +; RV64I-NEXT: or t5, s10, s9 +; RV64I-NEXT: or t6, s7, s6 +; RV64I-NEXT: or a0, a0, s11 ; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: addi a0, sp, 32 -; RV64I-NEXT: sub a3, a0, a1 -; RV64I-NEXT: ld a4, 16(a3) -; RV64I-NEXT: ld a0, 8(a3) -; RV64I-NEXT: ld a1, 0(a3) -; RV64I-NEXT: ld a3, 24(a3) +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: or t0, s4, s2 +; RV64I-NEXT: or t1, t1, s5 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a4, s0, a4 +; RV64I-NEXT: or a6, t5, t4 +; RV64I-NEXT: or a0, a0, t6 +; RV64I-NEXT: sub t2, t3, a1 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a1, a7, a5 +; RV64I-NEXT: or a5, t1, t0 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: sd a1, 32(sp) +; RV64I-NEXT: sd a5, 40(sp) +; RV64I-NEXT: sd a3, 48(sp) +; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: ld a4, 16(t2) +; RV64I-NEXT: ld a0, 8(t2) +; RV64I-NEXT: ld a1, 0(t2) +; RV64I-NEXT: ld a3, 24(t2) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 ; RV64I-NEXT: srli t0, a4, 32 +; RV64I-NEXT: srli t1, a4, 24 +; RV64I-NEXT: srli t2, a4, 16 +; RV64I-NEXT: srli t3, a4, 8 +; RV64I-NEXT: srli t4, a3, 56 +; RV64I-NEXT: srli t5, a3, 48 +; RV64I-NEXT: srli t6, a3, 40 +; RV64I-NEXT: srli s0, a3, 32 +; RV64I-NEXT: srli s1, a3, 24 +; RV64I-NEXT: srli s2, a3, 16 +; RV64I-NEXT: srli s3, a3, 8 +; RV64I-NEXT: srli s4, a1, 56 +; RV64I-NEXT: srli s5, a1, 48 +; RV64I-NEXT: srli s6, a1, 40 +; RV64I-NEXT: srli s7, a1, 32 +; RV64I-NEXT: srli s8, a1, 24 +; RV64I-NEXT: srli s9, a1, 16 +; RV64I-NEXT: srli s10, a1, 8 +; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) ; RV64I-NEXT: sb a7, 21(a2) ; RV64I-NEXT: sb a6, 22(a2) ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: srli a6, a4, 16 -; RV64I-NEXT: srli a7, a4, 8 +; RV64I-NEXT: srli a5, a0, 48 ; RV64I-NEXT: sb a4, 16(a2) -; RV64I-NEXT: sb a7, 17(a2) -; RV64I-NEXT: sb a6, 18(a2) -; RV64I-NEXT: sb a5, 19(a2) -; RV64I-NEXT: srli a4, a3, 56 -; RV64I-NEXT: srli a5, a3, 48 -; RV64I-NEXT: srli a6, a3, 40 -; RV64I-NEXT: srli a7, a3, 32 -; RV64I-NEXT: sb a7, 28(a2) -; RV64I-NEXT: sb a6, 29(a2) -; RV64I-NEXT: sb a5, 30(a2) -; RV64I-NEXT: sb a4, 31(a2) -; RV64I-NEXT: srli a4, a3, 24 -; RV64I-NEXT: srli a5, a3, 16 -; RV64I-NEXT: srli a6, a3, 8 +; RV64I-NEXT: sb t3, 17(a2) +; RV64I-NEXT: sb t2, 18(a2) +; RV64I-NEXT: sb t1, 19(a2) +; RV64I-NEXT: srli a4, a0, 40 +; RV64I-NEXT: sb s0, 28(a2) +; RV64I-NEXT: sb t6, 29(a2) +; RV64I-NEXT: sb t5, 30(a2) +; RV64I-NEXT: sb t4, 31(a2) +; 
RV64I-NEXT: srli a6, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) -; RV64I-NEXT: sb a6, 25(a2) -; RV64I-NEXT: sb a5, 26(a2) -; RV64I-NEXT: sb a4, 27(a2) -; RV64I-NEXT: srli a3, a1, 56 -; RV64I-NEXT: srli a4, a1, 48 -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: srli a6, a1, 32 -; RV64I-NEXT: sb a6, 4(a2) -; RV64I-NEXT: sb a5, 5(a2) -; RV64I-NEXT: sb a4, 6(a2) -; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: sb s3, 25(a2) +; RV64I-NEXT: sb s2, 26(a2) +; RV64I-NEXT: sb s1, 27(a2) +; RV64I-NEXT: srli a3, a0, 24 +; RV64I-NEXT: sb s7, 4(a2) +; RV64I-NEXT: sb s6, 5(a2) +; RV64I-NEXT: sb s5, 6(a2) +; RV64I-NEXT: sb s4, 7(a2) +; RV64I-NEXT: srli a7, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: sb a4, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 12(a2) +; RV64I-NEXT: sb s10, 1(a2) +; RV64I-NEXT: sb s9, 2(a2) +; RV64I-NEXT: sb s8, 3(a2) +; RV64I-NEXT: srli a1, a0, 8 +; RV64I-NEXT: sb a6, 12(a2) ; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb a5, 14(a2) +; RV64I-NEXT: sb s11, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a1, 11(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb a7, 10(a2) +; RV64I-NEXT: sb a3, 11(a2) +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 
4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a7, 0(a0) +; RV32I-NEXT: lbu t0, 1(a0) +; RV32I-NEXT: lbu t1, 2(a0) +; RV32I-NEXT: lbu s1, 3(a0) +; RV32I-NEXT: lbu s7, 4(a0) +; RV32I-NEXT: lbu s8, 5(a0) +; RV32I-NEXT: lbu s4, 6(a0) +; RV32I-NEXT: lbu s6, 7(a0) +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s10, 9(a0) +; RV32I-NEXT: lbu s11, 10(a0) +; RV32I-NEXT: lbu ra, 11(a0) +; RV32I-NEXT: lbu t4, 12(a0) +; RV32I-NEXT: lbu t6, 13(a0) +; RV32I-NEXT: lbu a5, 14(a0) +; RV32I-NEXT: lbu a6, 15(a0) +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu t2, 17(a0) +; RV32I-NEXT: lbu t3, 18(a0) +; RV32I-NEXT: lbu t5, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu s0, 21(a0) +; RV32I-NEXT: lbu s2, 22(a0) +; RV32I-NEXT: lbu s3, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 18(a0) -; RV32I-NEXT: lbu t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t2, a7 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or a7, a7, t0 -; RV32I-NEXT: lbu t0, 22(a0) -; RV32I-NEXT: lbu t3, 23(a0) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t0, t3, t0 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: lbu t1, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 ; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or t1, t4, t1 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: lbu t2, 30(a0) +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s1, t1 +; RV32I-NEXT: or t1, s8, s7 +; RV32I-NEXT: lbu s1, 24(a0) +; RV32I-NEXT: lbu s7, 25(a0) +; RV32I-NEXT: lbu s8, 26(a0) +; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s11, s11, 16 +; RV32I-NEXT: slli ra, ra, 24 +; RV32I-NEXT: or s4, s6, s4 +; RV32I-NEXT: or s5, s10, s5 +; RV32I-NEXT: or s6, ra, s11 +; RV32I-NEXT: lbu s10, 28(a0) +; RV32I-NEXT: lbu s11, 29(a0) +; RV32I-NEXT: lbu ra, 30(a0) ; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: or a0, a0, t3 ; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 0(sp) -; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 
12(sp) -; RV32I-NEXT: sw a7, 48(sp) -; RV32I-NEXT: sw t0, 52(sp) -; RV32I-NEXT: sw t1, 56(sp) -; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a3, 32(sp) -; RV32I-NEXT: sw a4, 36(sp) -; RV32I-NEXT: sw a5, 40(sp) -; RV32I-NEXT: sw a6, 44(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t4, t6, t4 +; RV32I-NEXT: addi t6, sp, 40 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a3, t2, a3 +; RV32I-NEXT: or a6, t5, t3 +; RV32I-NEXT: or a4, s0, a4 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: or t3, s7, s1 +; RV32I-NEXT: or t5, s9, s8 +; RV32I-NEXT: or s0, s11, s10 +; RV32I-NEXT: or a0, a0, ra ; RV32I-NEXT: andi a1, a1, 24 -; RV32I-NEXT: addi a0, sp, 32 -; RV32I-NEXT: sub a4, a0, a1 -; RV32I-NEXT: lw a5, 16(a4) -; RV32I-NEXT: lw a6, 20(a4) -; RV32I-NEXT: lw a7, 24(a4) -; RV32I-NEXT: lw a1, 0(a4) -; RV32I-NEXT: lw a0, 4(a4) -; RV32I-NEXT: lw t0, 8(a4) -; RV32I-NEXT: lw a3, 12(a4) -; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, s4, t1 +; RV32I-NEXT: or t1, s6, s5 +; RV32I-NEXT: or a5, a5, t4 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, t2, a4 +; RV32I-NEXT: or a6, t5, t3 +; RV32I-NEXT: or a0, a0, s0 +; RV32I-NEXT: sub t2, t6, a1 +; RV32I-NEXT: sw a3, 56(sp) +; RV32I-NEXT: sw a4, 60(sp) +; RV32I-NEXT: sw a6, 64(sp) +; RV32I-NEXT: sw a0, 68(sp) +; RV32I-NEXT: sw a7, 40(sp) +; RV32I-NEXT: sw t0, 44(sp) +; RV32I-NEXT: sw t1, 48(sp) +; RV32I-NEXT: sw a5, 52(sp) +; RV32I-NEXT: lw a6, 16(t2) +; RV32I-NEXT: lw a5, 20(t2) +; RV32I-NEXT: lw a7, 24(t2) +; RV32I-NEXT: lw a1, 0(t2) +; RV32I-NEXT: lw a0, 4(t2) +; RV32I-NEXT: lw a4, 8(t2) +; RV32I-NEXT: lw a3, 12(t2) +; RV32I-NEXT: lw t0, 28(t2) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 +; RV32I-NEXT: srli t4, t0, 24 +; RV32I-NEXT: srli t5, t0, 16 +; RV32I-NEXT: srli t6, t0, 8 +; RV32I-NEXT: srli s0, a6, 24 +; RV32I-NEXT: srli s1, a6, 16 +; RV32I-NEXT: srli s2, a6, 8 +; RV32I-NEXT: srli s3, a5, 24 +; RV32I-NEXT: srli s4, a5, 16 +; RV32I-NEXT: srli s5, a5, 8 +; RV32I-NEXT: srli s6, a4, 24 +; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 16 +; RV32I-NEXT: srli s11, a3, 8 +; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) ; RV32I-NEXT: sb t3, 25(a2) ; RV32I-NEXT: sb t2, 26(a2) ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a4, 24 -; RV32I-NEXT: srli t1, a4, 16 -; RV32I-NEXT: srli t2, a4, 8 -; RV32I-NEXT: sb a4, 28(a2) -; RV32I-NEXT: sb t2, 29(a2) -; RV32I-NEXT: sb t1, 30(a2) -; RV32I-NEXT: sb a7, 31(a2) -; RV32I-NEXT: srli a4, a5, 24 -; RV32I-NEXT: srli a7, a5, 16 -; RV32I-NEXT: srli t1, a5, 8 -; RV32I-NEXT: sb a5, 16(a2) -; RV32I-NEXT: sb t1, 17(a2) -; RV32I-NEXT: sb a7, 18(a2) -; RV32I-NEXT: sb a4, 19(a2) -; RV32I-NEXT: srli a4, a6, 24 -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: srli a7, a6, 8 -; RV32I-NEXT: sb a6, 20(a2) -; RV32I-NEXT: sb a7, 21(a2) -; RV32I-NEXT: sb a5, 22(a2) -; RV32I-NEXT: sb a4, 23(a2) -; RV32I-NEXT: srli a4, t0, 24 -; RV32I-NEXT: srli a5, t0, 16 -; RV32I-NEXT: 
srli a6, t0, 8 -; RV32I-NEXT: sb t0, 8(a2) -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: sb a4, 11(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: sb t5, 30(a2) +; RV32I-NEXT: sb t4, 31(a2) +; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: sb s2, 17(a2) +; RV32I-NEXT: sb s1, 18(a2) +; RV32I-NEXT: sb s0, 19(a2) +; RV32I-NEXT: srli a6, a0, 24 +; RV32I-NEXT: sb a5, 20(a2) +; RV32I-NEXT: sb s5, 21(a2) +; RV32I-NEXT: sb s4, 22(a2) +; RV32I-NEXT: sb s3, 23(a2) +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: sb a4, 8(a2) +; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb s7, 10(a2) +; RV32I-NEXT: sb s6, 11(a2) +; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: sb a4, 15(a2) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a4, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb a7, 2(a2) +; RV32I-NEXT: sb ra, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 @@ -4314,430 +4602,474 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli 
a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) ; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a7, a0, 32 -; RV64I-NEXT: lbu t0, 0(a1) -; RV64I-NEXT: lbu t1, 1(a1) -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 2(a1) -; RV64I-NEXT: lbu t2, 3(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or a7, t2, a7 -; RV64I-NEXT: lbu t1, 4(a1) -; RV64I-NEXT: lbu t2, 5(a1) -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; 
RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t3, s5, s4 +; RV64I-NEXT: or t4, s7, s6 +; RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu s6, 31(a0) +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: lbu s6, 5(a1) +; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli s6, s6, 8 +; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, s7 +; RV64I-NEXT: mv s6, sp +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t0, s0, t6 +; RV64I-NEXT: or t1, s5, s1 +; RV64I-NEXT: or t2, s4, s2 +; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t3, t1, 32 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a7 -; RV64I-NEXT: sraiw a0, a0, 31 -; RV64I-NEXT: sd a0, 32(sp) -; RV64I-NEXT: sd a0, 40(sp) -; RV64I-NEXT: sd a0, 48(sp) -; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: sraiw t1, t1, 31 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, t3, t0 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: sd t1, 32(sp) +; RV64I-NEXT: sd t1, 40(sp) +; RV64I-NEXT: sd t1, 48(sp) +; RV64I-NEXT: sd t1, 56(sp) ; RV64I-NEXT: sd a3, 0(sp) ; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a5, 16(sp) -; RV64I-NEXT: sd a6, 24(sp) -; RV64I-NEXT: andi a0, a1, 24 -; RV64I-NEXT: mv a3, sp -; RV64I-NEXT: add a0, a3, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: ld a4, 8(a0) -; RV64I-NEXT: slli a5, a1, 3 -; RV64I-NEXT: ld a6, 16(a0) -; RV64I-NEXT: ld a7, 24(a0) -; RV64I-NEXT: srl a0, a4, a5 -; RV64I-NEXT: andi a1, a5, 56 -; RV64I-NEXT: xori t0, a1, 63 -; RV64I-NEXT: slli a1, a6, 1 -; RV64I-NEXT: sll a1, a1, t0 -; RV64I-NEXT: or a1, a0, a1 -; RV64I-NEXT: srl a3, a3, a5 -; RV64I-NEXT: slli a4, a4, 1 -; RV64I-NEXT: sll a4, a4, t0 -; RV64I-NEXT: or a4, a3, a4 -; RV64I-NEXT: srl a6, a6, a5 -; RV64I-NEXT: slli t1, a7, 1 
-; RV64I-NEXT: sll t0, t1, t0 -; RV64I-NEXT: or t0, a6, t0 -; RV64I-NEXT: sra a5, a7, a5 -; RV64I-NEXT: srli a7, a5, 56 -; RV64I-NEXT: srli t1, a5, 48 -; RV64I-NEXT: srli t2, a5, 40 -; RV64I-NEXT: srli t3, a5, 32 +; RV64I-NEXT: sd a0, 16(sp) +; RV64I-NEXT: sd a5, 24(sp) +; RV64I-NEXT: slli a4, a1, 3 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: add a1, s6, a1 +; RV64I-NEXT: andi a0, a4, 56 +; RV64I-NEXT: ld a3, 0(a1) +; RV64I-NEXT: ld a5, 8(a1) +; RV64I-NEXT: ld a6, 16(a1) +; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld t0, 24(a1) +; RV64I-NEXT: srl a0, a5, a4 +; RV64I-NEXT: slli t1, a6, 1 +; RV64I-NEXT: srl a1, a3, a4 +; RV64I-NEXT: slli a5, a5, 1 +; RV64I-NEXT: srl a3, a6, a4 +; RV64I-NEXT: slli a6, t0, 1 +; RV64I-NEXT: sra t0, t0, a4 +; RV64I-NEXT: sll a4, t1, a7 +; RV64I-NEXT: sll a5, a5, a7 +; RV64I-NEXT: sll a6, a6, a7 +; RV64I-NEXT: srli a7, t0, 56 +; RV64I-NEXT: srli t1, t0, 48 +; RV64I-NEXT: srli t2, t0, 40 +; RV64I-NEXT: srli t3, t0, 32 +; RV64I-NEXT: srli t4, t0, 24 +; RV64I-NEXT: srli t5, t0, 16 +; RV64I-NEXT: srli t6, t0, 8 +; RV64I-NEXT: or a4, a0, a4 +; RV64I-NEXT: or a5, a1, a5 +; RV64I-NEXT: or a6, a3, a6 ; RV64I-NEXT: sb t3, 28(a2) ; RV64I-NEXT: sb t2, 29(a2) ; RV64I-NEXT: sb t1, 30(a2) ; RV64I-NEXT: sb a7, 31(a2) -; RV64I-NEXT: srli a7, a5, 24 -; RV64I-NEXT: srli t1, a5, 16 -; RV64I-NEXT: srli t2, a5, 8 -; RV64I-NEXT: sb a5, 24(a2) -; RV64I-NEXT: sb t2, 25(a2) -; RV64I-NEXT: sb t1, 26(a2) -; RV64I-NEXT: sb a7, 27(a2) -; RV64I-NEXT: srli a5, t0, 56 -; RV64I-NEXT: srli a7, t0, 48 -; RV64I-NEXT: srli t1, t0, 40 -; RV64I-NEXT: srli t2, t0, 32 +; RV64I-NEXT: sb t0, 24(a2) +; RV64I-NEXT: sb t6, 25(a2) +; RV64I-NEXT: sb t5, 26(a2) +; RV64I-NEXT: sb t4, 27(a2) +; RV64I-NEXT: srli a7, a6, 56 +; RV64I-NEXT: srli t0, a6, 48 +; RV64I-NEXT: srli t1, a6, 40 +; RV64I-NEXT: srli t2, a6, 32 +; RV64I-NEXT: srli t3, a6, 24 +; RV64I-NEXT: srli t4, a6, 16 +; RV64I-NEXT: srli a6, a6, 8 +; RV64I-NEXT: srli t5, a5, 56 +; RV64I-NEXT: srli t6, a5, 48 +; RV64I-NEXT: srli s0, a5, 40 +; RV64I-NEXT: srli s1, a5, 32 +; RV64I-NEXT: srli s2, a5, 24 +; RV64I-NEXT: srli s3, a5, 16 +; RV64I-NEXT: srli a5, a5, 8 +; RV64I-NEXT: srli s4, a4, 56 +; RV64I-NEXT: srli s5, a4, 48 +; RV64I-NEXT: srli s6, a4, 40 ; RV64I-NEXT: sb t2, 20(a2) ; RV64I-NEXT: sb t1, 21(a2) -; RV64I-NEXT: sb a7, 22(a2) -; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, t0, 24 -; RV64I-NEXT: srli a7, t0, 16 -; RV64I-NEXT: srli t0, t0, 8 -; RV64I-NEXT: sb a6, 16(a2) -; RV64I-NEXT: sb t0, 17(a2) -; RV64I-NEXT: sb a7, 18(a2) -; RV64I-NEXT: sb a5, 19(a2) -; RV64I-NEXT: srli a5, a4, 56 -; RV64I-NEXT: srli a6, a4, 48 -; RV64I-NEXT: srli a7, a4, 40 -; RV64I-NEXT: srli t0, a4, 32 -; RV64I-NEXT: sb t0, 4(a2) -; RV64I-NEXT: sb a7, 5(a2) -; RV64I-NEXT: sb a6, 6(a2) -; RV64I-NEXT: sb a5, 7(a2) -; RV64I-NEXT: srli a5, a4, 24 +; RV64I-NEXT: sb t0, 22(a2) +; RV64I-NEXT: sb a7, 23(a2) +; RV64I-NEXT: srli a7, a4, 32 +; RV64I-NEXT: sb a3, 16(a2) +; RV64I-NEXT: sb a6, 17(a2) +; RV64I-NEXT: sb t4, 18(a2) +; RV64I-NEXT: sb t3, 19(a2) +; RV64I-NEXT: srli a3, a4, 24 +; RV64I-NEXT: sb s1, 4(a2) +; RV64I-NEXT: sb s0, 5(a2) +; RV64I-NEXT: sb t6, 6(a2) +; RV64I-NEXT: sb t5, 7(a2) ; RV64I-NEXT: srli a6, a4, 16 ; RV64I-NEXT: srli a4, a4, 8 -; RV64I-NEXT: sb a3, 0(a2) -; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a6, 2(a2) -; RV64I-NEXT: sb a5, 3(a2) -; RV64I-NEXT: srli a3, a1, 56 -; RV64I-NEXT: srli a4, a1, 48 -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: srli a6, a1, 32 -; RV64I-NEXT: sb a6, 12(a2) -; RV64I-NEXT: sb a5, 13(a2) -; RV64I-NEXT: sb a4, 14(a2) -; 
RV64I-NEXT: sb a3, 15(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: sb a5, 1(a2) +; RV64I-NEXT: sb s3, 2(a2) +; RV64I-NEXT: sb s2, 3(a2) +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb s6, 13(a2) +; RV64I-NEXT: sb s5, 14(a2) +; RV64I-NEXT: sb s4, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: sb a4, 9(a2) +; RV64I-NEXT: sb a6, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -80 -; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 64(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu t6, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu t1, 4(a0) +; RV32I-NEXT: lbu t3, 5(a0) +; RV32I-NEXT: lbu t4, 6(a0) +; RV32I-NEXT: lbu t5, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu s1, 9(a0) +; RV32I-NEXT: lbu s7, 10(a0) +; RV32I-NEXT: lbu s8, 11(a0) +; RV32I-NEXT: lbu s9, 12(a0) +; RV32I-NEXT: lbu s10, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s6, 15(a0) +; RV32I-NEXT: lbu s5, 16(a0) +; RV32I-NEXT: lbu s11, 17(a0) +; RV32I-NEXT: lbu ra, 18(a0) +; RV32I-NEXT: lbu a3, 19(a0) +; RV32I-NEXT: lbu s2, 20(a0) +; RV32I-NEXT: lbu s3, 21(a0) +; RV32I-NEXT: lbu a7, 22(a0) +; RV32I-NEXT: lbu t0, 23(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; 
RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 18(a0) -; RV32I-NEXT: lbu t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t2, a7 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or a7, a7, t0 -; RV32I-NEXT: lbu t0, 22(a0) -; RV32I-NEXT: lbu t3, 23(a0) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t0, t3, t0 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: lbu t1, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) ; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or t1, t4, t1 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: lbu t2, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or t2, a0, t2 -; RV32I-NEXT: lbu t4, 0(a1) -; RV32I-NEXT: lbu t5, 1(a1) -; RV32I-NEXT: or t2, t2, t3 -; RV32I-NEXT: lbu t3, 2(a1) +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: or a4, a4, t6 +; RV32I-NEXT: sw a4, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t3, t1 +; RV32I-NEXT: or a6, t5, t4 +; RV32I-NEXT: lbu t1, 24(a0) +; RV32I-NEXT: lbu t5, 25(a0) +; RV32I-NEXT: lbu t6, 26(a0) +; RV32I-NEXT: lbu s0, 27(a0) +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s7, s7, 16 +; RV32I-NEXT: slli s8, s8, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: or t2, s1, t2 +; RV32I-NEXT: or t3, s8, s7 +; RV32I-NEXT: or t4, s10, s9 +; RV32I-NEXT: lbu s1, 28(a0) +; RV32I-NEXT: lbu s7, 29(a0) +; RV32I-NEXT: lbu s8, 30(a0) +; RV32I-NEXT: lbu s9, 31(a0) +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s6, s4 +; RV32I-NEXT: or s4, s11, s5 +; RV32I-NEXT: or s5, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s6, 1(a1) +; RV32I-NEXT: lbu s10, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s2, s3, s2 +; RV32I-NEXT: addi s3, sp, 8 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 ; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s10, s10, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t3 -; RV32I-NEXT: or a1, a1, t4 -; RV32I-NEXT: srai a0, a0, 31 -; RV32I-NEXT: sw a0, 48(sp) -; RV32I-NEXT: sw a0, 52(sp) -; RV32I-NEXT: sw a0, 56(sp) -; RV32I-NEXT: sw a0, 60(sp) -; 
RV32I-NEXT: sw a0, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) -; RV32I-NEXT: sw a0, 40(sp) -; RV32I-NEXT: sw a0, 44(sp) -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw t2, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: andi a0, a1, 28 -; RV32I-NEXT: mv a3, sp -; RV32I-NEXT: add a6, a3, a0 -; RV32I-NEXT: lw a3, 0(a6) -; RV32I-NEXT: lw a4, 4(a6) +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, t5, t1 +; RV32I-NEXT: or t1, s0, t6 +; RV32I-NEXT: or t5, s7, s1 +; RV32I-NEXT: or t6, s9, s8 +; RV32I-NEXT: or a3, s6, a3 +; RV32I-NEXT: or a1, a1, s10 +; RV32I-NEXT: srai s0, s9, 31 +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, s1 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t3, t2 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or a7, a7, s2 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or t1, t6, t5 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: sw s0, 56(sp) +; RV32I-NEXT: sw s0, 60(sp) +; RV32I-NEXT: sw s0, 64(sp) +; RV32I-NEXT: sw s0, 68(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s0, 44(sp) +; RV32I-NEXT: sw s0, 48(sp) +; RV32I-NEXT: sw s0, 52(sp) +; RV32I-NEXT: sw t2, 24(sp) +; RV32I-NEXT: sw a7, 28(sp) +; RV32I-NEXT: sw t0, 32(sp) +; RV32I-NEXT: sw t1, 36(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a6, 16(sp) +; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: slli t1, a1, 3 -; RV32I-NEXT: lw a7, 8(a6) -; RV32I-NEXT: lw t0, 12(a6) +; RV32I-NEXT: andi a1, a1, 28 +; RV32I-NEXT: add a1, s3, a1 +; RV32I-NEXT: andi a0, t1, 24 +; RV32I-NEXT: xori t0, a0, 31 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a4, 4(a1) +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: lw a6, 12(a1) +; RV32I-NEXT: lw a7, 16(a1) +; RV32I-NEXT: lw t2, 20(a1) +; RV32I-NEXT: lw t3, 24(a1) +; RV32I-NEXT: lw t4, 28(a1) ; RV32I-NEXT: srl a0, a4, t1 -; RV32I-NEXT: andi a1, t1, 24 -; RV32I-NEXT: xori t2, a1, 31 -; RV32I-NEXT: slli a1, a7, 1 -; RV32I-NEXT: sll a1, a1, t2 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: srl a3, a3, t1 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: sll a4, a4, t2 -; RV32I-NEXT: or a4, a3, a4 -; RV32I-NEXT: srl a5, t0, t1 -; RV32I-NEXT: lw t3, 16(a6) -; RV32I-NEXT: lw t4, 20(a6) -; RV32I-NEXT: lw t5, 24(a6) -; RV32I-NEXT: lw t6, 28(a6) -; RV32I-NEXT: slli a6, t3, 1 -; RV32I-NEXT: sll a6, a6, t2 -; RV32I-NEXT: or a6, a5, a6 -; RV32I-NEXT: srl a7, a7, t1 -; RV32I-NEXT: slli t0, t0, 1 -; RV32I-NEXT: sll t0, t0, t2 -; RV32I-NEXT: or t0, a7, t0 -; RV32I-NEXT: srl s0, t4, t1 -; RV32I-NEXT: slli s1, t5, 1 -; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: or s1, s0, s1 -; RV32I-NEXT: srl t3, t3, t1 -; RV32I-NEXT: slli t4, t4, 1 -; RV32I-NEXT: sll t4, t4, t2 -; RV32I-NEXT: or t4, t3, t4 -; RV32I-NEXT: srl t5, t5, t1 -; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t2, s2, t2 -; RV32I-NEXT: or t2, t5, t2 -; RV32I-NEXT: sra t1, t6, t1 -; RV32I-NEXT: srli t6, t1, 24 -; RV32I-NEXT: srli s2, t1, 16 -; RV32I-NEXT: srli s3, t1, 8 +; RV32I-NEXT: slli t5, a5, 1 +; RV32I-NEXT: srl a1, a3, t1 +; RV32I-NEXT: slli t6, a4, 1 +; RV32I-NEXT: srl a3, a6, t1 +; RV32I-NEXT: slli s0, a7, 1 +; RV32I-NEXT: srl a4, a5, t1 +; RV32I-NEXT: slli s1, a6, 1 +; RV32I-NEXT: srl a5, t2, t1 +; RV32I-NEXT: slli s2, t3, 1 +; RV32I-NEXT: srl a6, a7, t1 +; RV32I-NEXT: slli t2, t2, 1 +; RV32I-NEXT: srl a7, t3, t1 +; RV32I-NEXT: slli t3, t4, 1 +; RV32I-NEXT: sra t1, t4, t1 +; RV32I-NEXT: sll t4, t5, t0 +; RV32I-NEXT: sll t5, t6, t0 +; 
RV32I-NEXT: sll t6, s0, t0 +; RV32I-NEXT: sll s0, s1, t0 +; RV32I-NEXT: sll s1, s2, t0 +; RV32I-NEXT: sll t2, t2, t0 +; RV32I-NEXT: sll t3, t3, t0 +; RV32I-NEXT: srli s2, t1, 24 +; RV32I-NEXT: srli s3, t1, 16 +; RV32I-NEXT: srli s4, t1, 8 +; RV32I-NEXT: or t0, a0, t4 +; RV32I-NEXT: or t4, a1, t5 +; RV32I-NEXT: or t5, a3, t6 +; RV32I-NEXT: or s0, a4, s0 +; RV32I-NEXT: or s1, a5, s1 +; RV32I-NEXT: or t2, a6, t2 +; RV32I-NEXT: or t3, a7, t3 ; RV32I-NEXT: sb t1, 28(a2) -; RV32I-NEXT: sb s3, 29(a2) -; RV32I-NEXT: sb s2, 30(a2) -; RV32I-NEXT: sb t6, 31(a2) -; RV32I-NEXT: srli t1, t2, 24 -; RV32I-NEXT: srli t6, t2, 16 +; RV32I-NEXT: sb s4, 29(a2) +; RV32I-NEXT: sb s3, 30(a2) +; RV32I-NEXT: sb s2, 31(a2) +; RV32I-NEXT: srli t1, t3, 24 +; RV32I-NEXT: srli t6, t3, 16 +; RV32I-NEXT: srli t3, t3, 8 +; RV32I-NEXT: srli s2, t2, 24 +; RV32I-NEXT: srli s3, t2, 16 ; RV32I-NEXT: srli t2, t2, 8 -; RV32I-NEXT: sb t5, 24(a2) -; RV32I-NEXT: sb t2, 25(a2) +; RV32I-NEXT: srli s4, s1, 24 +; RV32I-NEXT: srli s5, s1, 16 +; RV32I-NEXT: srli s1, s1, 8 +; RV32I-NEXT: srli s6, s0, 24 +; RV32I-NEXT: srli s7, s0, 16 +; RV32I-NEXT: srli s0, s0, 8 +; RV32I-NEXT: srli s8, t5, 24 +; RV32I-NEXT: srli s9, t5, 16 +; RV32I-NEXT: srli t5, t5, 8 +; RV32I-NEXT: srli s10, t4, 24 +; RV32I-NEXT: srli s11, t4, 16 +; RV32I-NEXT: srli t4, t4, 8 +; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: sb t3, 25(a2) ; RV32I-NEXT: sb t6, 26(a2) ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli t1, t4, 24 -; RV32I-NEXT: srli t2, t4, 16 -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb t3, 16(a2) -; RV32I-NEXT: sb t4, 17(a2) -; RV32I-NEXT: sb t2, 18(a2) -; RV32I-NEXT: sb t1, 19(a2) -; RV32I-NEXT: srli t1, s1, 24 -; RV32I-NEXT: srli t2, s1, 16 -; RV32I-NEXT: srli s1, s1, 8 -; RV32I-NEXT: sb s0, 20(a2) -; RV32I-NEXT: sb s1, 21(a2) -; RV32I-NEXT: sb t2, 22(a2) -; RV32I-NEXT: sb t1, 23(a2) -; RV32I-NEXT: srli t1, t0, 24 -; RV32I-NEXT: srli t2, t0, 16 +; RV32I-NEXT: srli a7, t0, 24 +; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: sb t2, 17(a2) +; RV32I-NEXT: sb s3, 18(a2) +; RV32I-NEXT: sb s2, 19(a2) +; RV32I-NEXT: srli a6, t0, 16 ; RV32I-NEXT: srli t0, t0, 8 -; RV32I-NEXT: sb a7, 8(a2) -; RV32I-NEXT: sb t0, 9(a2) -; RV32I-NEXT: sb t2, 10(a2) -; RV32I-NEXT: sb t1, 11(a2) -; RV32I-NEXT: srli a7, a6, 24 -; RV32I-NEXT: srli t0, a6, 16 -; RV32I-NEXT: srli a6, a6, 8 -; RV32I-NEXT: sb a5, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb t0, 14(a2) -; RV32I-NEXT: sb a7, 15(a2) -; RV32I-NEXT: srli a5, a4, 24 -; RV32I-NEXT: srli a6, a4, 16 -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a3, 0(a2) -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: sb a6, 2(a2) -; RV32I-NEXT: sb a5, 3(a2) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a5, 20(a2) +; RV32I-NEXT: sb s1, 21(a2) +; RV32I-NEXT: sb s5, 22(a2) +; RV32I-NEXT: sb s4, 23(a2) +; RV32I-NEXT: sb a4, 8(a2) +; RV32I-NEXT: sb s0, 9(a2) +; RV32I-NEXT: sb s7, 10(a2) +; RV32I-NEXT: sb s6, 11(a2) +; RV32I-NEXT: sb a3, 12(a2) +; RV32I-NEXT: sb t5, 13(a2) +; RV32I-NEXT: sb s9, 14(a2) +; RV32I-NEXT: sb s8, 15(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb t4, 1(a2) +; RV32I-NEXT: sb s11, 2(a2) +; RV32I-NEXT: sb s10, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) -; RV32I-NEXT: sb a4, 6(a2) -; RV32I-NEXT: sb a3, 7(a2) -; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 64(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 80 +; 
RV32I-NEXT: sb t0, 5(a2) +; RV32I-NEXT: sb a6, 6(a2) +; RV32I-NEXT: sb a7, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -4750,383 +5082,433 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_32bytes_wordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; 
RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) ; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a7, a0, 32 -; RV64I-NEXT: lbu t0, 0(a1) -; RV64I-NEXT: lbu t1, 1(a1) -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 2(a1) -; RV64I-NEXT: lbu t2, 3(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or a7, t2, a7 -; RV64I-NEXT: lbu t1, 4(a1) -; RV64I-NEXT: lbu t2, 5(a1) -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t3, s5, s4 +; RV64I-NEXT: or t4, s7, s6 +; RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu s6, 31(a0) +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: lbu s6, 5(a1) +; 
RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli s6, s6, 8 +; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, s7 +; RV64I-NEXT: mv s6, sp +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t0, s0, t6 +; RV64I-NEXT: or t1, s5, s1 +; RV64I-NEXT: or t2, s4, s2 +; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t3, t1, 32 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a7 -; RV64I-NEXT: sraiw a0, a0, 31 -; RV64I-NEXT: sd a0, 32(sp) -; RV64I-NEXT: sd a0, 40(sp) -; RV64I-NEXT: sd a0, 48(sp) -; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: sraiw t1, t1, 31 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, t3, t0 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: sd t1, 32(sp) +; RV64I-NEXT: sd t1, 40(sp) +; RV64I-NEXT: sd t1, 48(sp) +; RV64I-NEXT: sd t1, 56(sp) ; RV64I-NEXT: sd a3, 0(sp) ; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a5, 16(sp) -; RV64I-NEXT: sd a6, 24(sp) -; RV64I-NEXT: slli a0, a1, 2 -; RV64I-NEXT: andi a0, a0, 24 -; RV64I-NEXT: mv a3, sp -; RV64I-NEXT: add a0, a3, a0 -; RV64I-NEXT: ld a4, 0(a0) -; RV64I-NEXT: ld a5, 8(a0) -; RV64I-NEXT: slli a6, a1, 5 -; RV64I-NEXT: ld a7, 16(a0) -; RV64I-NEXT: ld t0, 24(a0) -; RV64I-NEXT: srl a3, a5, a6 -; RV64I-NEXT: andi a0, a6, 32 -; RV64I-NEXT: xori t1, a0, 63 -; RV64I-NEXT: slli a0, a7, 1 -; RV64I-NEXT: sll a0, a0, t1 -; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: srl t2, a4, a6 +; RV64I-NEXT: sd a0, 16(sp) +; RV64I-NEXT: sd a5, 24(sp) +; RV64I-NEXT: slli a3, a1, 5 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: andi a0, a3, 32 +; RV64I-NEXT: add a1, s6, a1 +; RV64I-NEXT: ld a4, 0(a1) +; RV64I-NEXT: ld a5, 8(a1) +; RV64I-NEXT: ld a6, 16(a1) +; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld t0, 24(a1) +; RV64I-NEXT: srl a0, a5, a3 +; RV64I-NEXT: slli t1, a6, 1 +; RV64I-NEXT: srl a1, a4, a3 ; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: sll a1, a5, t1 -; RV64I-NEXT: or a1, t2, a1 -; RV64I-NEXT: srl a5, a7, a6 -; RV64I-NEXT: slli a4, t0, 1 -; RV64I-NEXT: sll a4, a4, t1 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: sra a6, t0, a6 -; RV64I-NEXT: srli a7, a5, 24 -; RV64I-NEXT: srli t0, a5, 16 -; RV64I-NEXT: srli t1, a5, 8 -; RV64I-NEXT: sb a5, 16(a2) -; RV64I-NEXT: sb t1, 17(a2) -; RV64I-NEXT: sb t0, 18(a2) +; RV64I-NEXT: srl a4, a6, a3 +; RV64I-NEXT: slli a6, t0, 1 +; RV64I-NEXT: sra a3, t0, a3 +; RV64I-NEXT: sll t0, t1, a7 +; RV64I-NEXT: sll a5, a5, a7 +; RV64I-NEXT: sll a6, a6, a7 +; RV64I-NEXT: srli a7, a4, 24 +; RV64I-NEXT: srli t1, a4, 16 +; RV64I-NEXT: srli t2, a4, 8 +; RV64I-NEXT: srli t3, a3, 56 +; RV64I-NEXT: srli t4, a3, 48 +; RV64I-NEXT: srli t5, a3, 40 +; RV64I-NEXT: srli t6, a3, 32 +; RV64I-NEXT: srli s0, a3, 24 +; RV64I-NEXT: srli s1, a3, 16 +; RV64I-NEXT: srli s2, a3, 8 +; RV64I-NEXT: srli s3, a1, 24 +; RV64I-NEXT: srli s4, a1, 16 +; RV64I-NEXT: srli s5, a1, 8 +; RV64I-NEXT: srli s6, a0, 24 +; RV64I-NEXT: or a6, a4, a6 +; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: sb t2, 17(a2) +; RV64I-NEXT: sb t1, 18(a2) ; RV64I-NEXT: sb a7, 19(a2) -; RV64I-NEXT: srli a5, a6, 56 -; 
RV64I-NEXT: srli a7, a6, 48 -; RV64I-NEXT: srli t0, a6, 40 -; RV64I-NEXT: srli t1, a6, 32 -; RV64I-NEXT: sb t1, 28(a2) -; RV64I-NEXT: sb t0, 29(a2) -; RV64I-NEXT: sb a7, 30(a2) -; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a6, 24 -; RV64I-NEXT: srli a7, a6, 16 -; RV64I-NEXT: srli t0, a6, 8 -; RV64I-NEXT: sb a6, 24(a2) -; RV64I-NEXT: sb t0, 25(a2) -; RV64I-NEXT: sb a7, 26(a2) -; RV64I-NEXT: sb a5, 27(a2) -; RV64I-NEXT: srli a5, t2, 24 -; RV64I-NEXT: srli a6, t2, 16 -; RV64I-NEXT: srli a7, t2, 8 -; RV64I-NEXT: sb t2, 0(a2) -; RV64I-NEXT: sb a7, 1(a2) -; RV64I-NEXT: sb a6, 2(a2) -; RV64I-NEXT: sb a5, 3(a2) -; RV64I-NEXT: srli a5, a3, 24 -; RV64I-NEXT: srli a6, a3, 16 -; RV64I-NEXT: srli a7, a3, 8 -; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: srli a4, a0, 16 +; RV64I-NEXT: sb t6, 28(a2) +; RV64I-NEXT: sb t5, 29(a2) +; RV64I-NEXT: sb t4, 30(a2) +; RV64I-NEXT: sb t3, 31(a2) +; RV64I-NEXT: srli a7, a0, 8 +; RV64I-NEXT: or t0, a0, t0 +; RV64I-NEXT: or a5, a1, a5 +; RV64I-NEXT: sb a3, 24(a2) +; RV64I-NEXT: sb s2, 25(a2) +; RV64I-NEXT: sb s1, 26(a2) +; RV64I-NEXT: sb s0, 27(a2) +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: sb s5, 1(a2) +; RV64I-NEXT: sb s4, 2(a2) +; RV64I-NEXT: sb s3, 3(a2) +; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a7, 9(a2) -; RV64I-NEXT: sb a6, 10(a2) -; RV64I-NEXT: sb a5, 11(a2) -; RV64I-NEXT: srli a3, a4, 56 -; RV64I-NEXT: srli a5, a4, 48 -; RV64I-NEXT: srli a6, a4, 40 -; RV64I-NEXT: srli a4, a4, 32 +; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: sb s6, 11(a2) +; RV64I-NEXT: srli a0, a6, 56 +; RV64I-NEXT: srli a1, a6, 48 +; RV64I-NEXT: srli a3, a6, 40 +; RV64I-NEXT: srli a4, a6, 32 +; RV64I-NEXT: srli a6, a5, 56 +; RV64I-NEXT: srli a7, a5, 48 +; RV64I-NEXT: srli t1, a5, 40 +; RV64I-NEXT: srli a5, a5, 32 +; RV64I-NEXT: srli t2, t0, 56 +; RV64I-NEXT: srli t3, t0, 48 +; RV64I-NEXT: srli t4, t0, 40 +; RV64I-NEXT: srli t0, t0, 32 ; RV64I-NEXT: sb a4, 20(a2) -; RV64I-NEXT: sb a6, 21(a2) -; RV64I-NEXT: sb a5, 22(a2) -; RV64I-NEXT: sb a3, 23(a2) -; RV64I-NEXT: srli a3, a1, 56 -; RV64I-NEXT: srli a4, a1, 48 -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: srli a1, a1, 32 -; RV64I-NEXT: sb a1, 4(a2) -; RV64I-NEXT: sb a5, 5(a2) -; RV64I-NEXT: sb a4, 6(a2) -; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a0, a0, 32 -; RV64I-NEXT: sb a0, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: sb a3, 21(a2) +; RV64I-NEXT: sb a1, 22(a2) +; RV64I-NEXT: sb a0, 23(a2) +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb t1, 5(a2) +; RV64I-NEXT: sb a7, 6(a2) +; RV64I-NEXT: sb a6, 7(a2) +; RV64I-NEXT: sb t0, 12(a2) +; RV64I-NEXT: sb t4, 13(a2) +; RV64I-NEXT: sb t3, 14(a2) +; RV64I-NEXT: sb t2, 15(a2) +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_32bytes_wordOff: ; RV32I: # %bb.0: -; 
RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a6, 0(a0) +; RV32I-NEXT: lbu t0, 1(a0) +; RV32I-NEXT: lbu t1, 2(a0) +; RV32I-NEXT: lbu t6, 3(a0) +; RV32I-NEXT: lbu s7, 4(a0) +; RV32I-NEXT: lbu s8, 5(a0) +; RV32I-NEXT: lbu s3, 6(a0) +; RV32I-NEXT: lbu s5, 7(a0) +; RV32I-NEXT: lbu s4, 8(a0) +; RV32I-NEXT: lbu s9, 9(a0) +; RV32I-NEXT: lbu s10, 10(a0) +; RV32I-NEXT: lbu s11, 11(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: lbu s6, 13(a0) +; RV32I-NEXT: lbu a5, 14(a0) +; RV32I-NEXT: lbu a7, 15(a0) +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu t2, 17(a0) +; RV32I-NEXT: lbu t3, 18(a0) +; RV32I-NEXT: lbu t4, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu t5, 21(a0) +; RV32I-NEXT: lbu s0, 22(a0) +; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or a6, t0, a6 +; RV32I-NEXT: or t0, t6, t1 +; RV32I-NEXT: or t1, s8, s7 +; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: lbu s7, 25(a0) +; RV32I-NEXT: lbu s8, 26(a0) +; RV32I-NEXT: lbu ra, 27(a0) +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: or s3, s5, s3 +; RV32I-NEXT: or s4, s9, s4 +; RV32I-NEXT: or s5, s11, s10 +; RV32I-NEXT: lbu s9, 28(a0) +; RV32I-NEXT: lbu s10, 29(a0) +; RV32I-NEXT: lbu s11, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: or s2, s6, s2 +; RV32I-NEXT: addi s6, sp, 8 ; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 18(a0) -; RV32I-NEXT: lbu 
t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t2, a7 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or a7, a7, t0 -; RV32I-NEXT: lbu t0, 22(a0) -; RV32I-NEXT: lbu t3, 23(a0) ; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t0, t3, t0 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: lbu t1, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or t1, t4, t1 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: lbu t2, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli ra, ra, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s11, s11, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or t2, a0, t2 -; RV32I-NEXT: or t2, t2, t3 -; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a3, t2, a3 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or a4, t5, a4 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or t2, s7, t6 +; RV32I-NEXT: or t3, ra, s8 +; RV32I-NEXT: or t4, s10, s9 +; RV32I-NEXT: or t5, a0, s11 ; RV32I-NEXT: srai a0, a0, 31 -; RV32I-NEXT: sw a0, 48(sp) -; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: andi a1, a1, 28 +; RV32I-NEXT: or a6, t0, a6 +; RV32I-NEXT: or t0, s3, t1 +; RV32I-NEXT: or t1, s5, s4 +; RV32I-NEXT: or a5, a5, s2 +; RV32I-NEXT: or a3, a7, a3 +; RV32I-NEXT: or a4, s0, a4 +; RV32I-NEXT: or a7, t3, t2 +; RV32I-NEXT: or t2, t5, t4 ; RV32I-NEXT: sw a0, 56(sp) ; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a0, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a0, 64(sp) +; RV32I-NEXT: sw a0, 68(sp) ; RV32I-NEXT: sw a0, 40(sp) ; RV32I-NEXT: sw a0, 44(sp) -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw t2, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: andi a1, a1, 28 -; RV32I-NEXT: mv a0, sp -; RV32I-NEXT: add a4, a0, a1 -; RV32I-NEXT: lw a5, 16(a4) -; RV32I-NEXT: lw a6, 20(a4) -; RV32I-NEXT: lw a7, 24(a4) -; RV32I-NEXT: lw a1, 0(a4) -; RV32I-NEXT: lw a0, 4(a4) -; RV32I-NEXT: lw t0, 8(a4) -; RV32I-NEXT: lw a3, 12(a4) -; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: add s6, s6, a1 +; RV32I-NEXT: sw a3, 24(sp) +; RV32I-NEXT: sw a4, 28(sp) +; RV32I-NEXT: sw a7, 32(sp) +; RV32I-NEXT: sw t2, 36(sp) +; RV32I-NEXT: sw a6, 8(sp) +; RV32I-NEXT: sw t0, 12(sp) +; RV32I-NEXT: sw t1, 16(sp) +; RV32I-NEXT: sw a5, 20(sp) +; RV32I-NEXT: lw a6, 16(s6) +; RV32I-NEXT: lw a5, 20(s6) +; RV32I-NEXT: lw a7, 24(s6) +; RV32I-NEXT: lw a1, 0(s6) +; RV32I-NEXT: lw a0, 4(s6) +; RV32I-NEXT: lw a4, 8(s6) +; RV32I-NEXT: lw a3, 12(s6) +; RV32I-NEXT: lw t0, 28(s6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 +; RV32I-NEXT: srli t4, t0, 24 +; RV32I-NEXT: srli t5, t0, 16 +; RV32I-NEXT: 
srli t6, t0, 8 +; RV32I-NEXT: srli s0, a6, 24 +; RV32I-NEXT: srli s1, a6, 16 +; RV32I-NEXT: srli s2, a6, 8 +; RV32I-NEXT: srli s3, a5, 24 +; RV32I-NEXT: srli s4, a5, 16 +; RV32I-NEXT: srli s5, a5, 8 +; RV32I-NEXT: srli s6, a4, 24 +; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 16 +; RV32I-NEXT: srli s11, a3, 8 +; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) ; RV32I-NEXT: sb t3, 25(a2) ; RV32I-NEXT: sb t2, 26(a2) ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a4, 24 -; RV32I-NEXT: srli t1, a4, 16 -; RV32I-NEXT: srli t2, a4, 8 -; RV32I-NEXT: sb a4, 28(a2) -; RV32I-NEXT: sb t2, 29(a2) -; RV32I-NEXT: sb t1, 30(a2) -; RV32I-NEXT: sb a7, 31(a2) -; RV32I-NEXT: srli a4, a5, 24 -; RV32I-NEXT: srli a7, a5, 16 -; RV32I-NEXT: srli t1, a5, 8 -; RV32I-NEXT: sb a5, 16(a2) -; RV32I-NEXT: sb t1, 17(a2) -; RV32I-NEXT: sb a7, 18(a2) -; RV32I-NEXT: sb a4, 19(a2) -; RV32I-NEXT: srli a4, a6, 24 -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: srli a7, a6, 8 -; RV32I-NEXT: sb a6, 20(a2) -; RV32I-NEXT: sb a7, 21(a2) -; RV32I-NEXT: sb a5, 22(a2) -; RV32I-NEXT: sb a4, 23(a2) -; RV32I-NEXT: srli a4, t0, 24 -; RV32I-NEXT: srli a5, t0, 16 -; RV32I-NEXT: srli a6, t0, 8 -; RV32I-NEXT: sb t0, 8(a2) -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: sb a4, 11(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: sb t5, 30(a2) +; RV32I-NEXT: sb t4, 31(a2) +; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: sb s2, 17(a2) +; RV32I-NEXT: sb s1, 18(a2) +; RV32I-NEXT: sb s0, 19(a2) +; RV32I-NEXT: srli a6, a0, 24 +; RV32I-NEXT: sb a5, 20(a2) +; RV32I-NEXT: sb s5, 21(a2) +; RV32I-NEXT: sb s4, 22(a2) +; RV32I-NEXT: sb s3, 23(a2) +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: sb a4, 8(a2) +; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb s7, 10(a2) +; RV32I-NEXT: sb s6, 11(a2) +; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: sb a4, 15(a2) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a4, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb a7, 2(a2) +; RV32I-NEXT: sb ra, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, 
sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %wordOff = load i256, ptr %wordOff.ptr, align 1 @@ -5139,346 +5521,396 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_32bytes_dwordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) -; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a5, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) +; RV64I-NEXT: lbu t1, 2(a0) +; RV64I-NEXT: lbu s3, 3(a0) +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu s8, 5(a0) +; RV64I-NEXT: lbu s9, 6(a0) +; RV64I-NEXT: lbu s10, 7(a0) +; RV64I-NEXT: lbu s2, 8(a0) +; RV64I-NEXT: lbu s4, 9(a0) +; RV64I-NEXT: lbu s5, 10(a0) +; RV64I-NEXT: lbu s6, 11(a0) +; RV64I-NEXT: lbu s7, 12(a0) +; RV64I-NEXT: lbu s11, 13(a0) +; RV64I-NEXT: lbu t4, 14(a0) +; RV64I-NEXT: lbu t5, 15(a0) +; RV64I-NEXT: lbu a3, 16(a0) +; RV64I-NEXT: lbu a6, 17(a0) +; RV64I-NEXT: lbu t2, 18(a0) +; RV64I-NEXT: lbu t3, 19(a0) +; RV64I-NEXT: lbu a4, 20(a0) +; RV64I-NEXT: lbu t6, 21(a0) +; RV64I-NEXT: lbu s0, 22(a0) +; RV64I-NEXT: lbu s1, 23(a0) ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; 
RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, s3, t1 +; RV64I-NEXT: or t0, s8, t0 +; RV64I-NEXT: or t1, s10, s9 +; RV64I-NEXT: lbu s3, 24(a0) +; RV64I-NEXT: lbu s8, 25(a0) +; RV64I-NEXT: lbu s9, 26(a0) +; RV64I-NEXT: lbu s10, 27(a0) +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: or s2, s4, s2 +; RV64I-NEXT: or s4, s6, s5 +; RV64I-NEXT: or s5, s11, s7 +; RV64I-NEXT: lbu s6, 28(a0) +; RV64I-NEXT: lbu s7, 29(a0) +; RV64I-NEXT: lbu s11, 30(a0) ; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a7, a0, 32 -; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: slli t5, t5, 24 +; RV64I-NEXT: or t4, t5, t4 +; RV64I-NEXT: mv t5, sp +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: slli s1, s1, 24 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: slli s11, s11, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a6, t3, t2 +; RV64I-NEXT: or a4, t6, a4 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or t2, s8, s3 +; RV64I-NEXT: or t3, s10, s9 +; RV64I-NEXT: or t6, s7, s6 +; RV64I-NEXT: or a0, a0, s11 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or t0, s4, s2 +; RV64I-NEXT: or t1, t4, s5 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a4, s0, a4 +; RV64I-NEXT: or a6, t3, t2 +; RV64I-NEXT: or a0, a0, t6 +; RV64I-NEXT: add t5, t5, a1 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: sraiw a0, a0, 31 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: sd a0, 32(sp) ; RV64I-NEXT: sd a0, 40(sp) ; RV64I-NEXT: sd a0, 48(sp) ; RV64I-NEXT: sd a0, 56(sp) -; RV64I-NEXT: sd a3, 0(sp) -; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a5, 16(sp) -; RV64I-NEXT: sd a6, 24(sp) -; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: andi a1, a1, 24 -; RV64I-NEXT: mv a0, sp -; RV64I-NEXT: add a3, a0, a1 -; RV64I-NEXT: ld a4, 16(a3) -; RV64I-NEXT: ld a0, 8(a3) -; RV64I-NEXT: ld a1, 0(a3) -; RV64I-NEXT: ld a3, 24(a3) +; RV64I-NEXT: sd a5, 0(sp) +; RV64I-NEXT: sd a7, 8(sp) +; RV64I-NEXT: sd a3, 16(sp) +; RV64I-NEXT: sd a1, 24(sp) +; RV64I-NEXT: ld a4, 16(t5) +; RV64I-NEXT: ld a0, 8(t5) +; RV64I-NEXT: ld a1, 0(t5) +; RV64I-NEXT: ld a3, 
24(t5) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 ; RV64I-NEXT: srli a7, a4, 40 ; RV64I-NEXT: srli t0, a4, 32 +; RV64I-NEXT: srli t1, a4, 24 +; RV64I-NEXT: srli t2, a4, 16 +; RV64I-NEXT: srli t3, a4, 8 +; RV64I-NEXT: srli t4, a3, 56 +; RV64I-NEXT: srli t5, a3, 48 +; RV64I-NEXT: srli t6, a3, 40 +; RV64I-NEXT: srli s0, a3, 32 +; RV64I-NEXT: srli s1, a3, 24 +; RV64I-NEXT: srli s2, a3, 16 +; RV64I-NEXT: srli s3, a3, 8 +; RV64I-NEXT: srli s4, a1, 56 +; RV64I-NEXT: srli s5, a1, 48 +; RV64I-NEXT: srli s6, a1, 40 +; RV64I-NEXT: srli s7, a1, 32 +; RV64I-NEXT: srli s8, a1, 24 +; RV64I-NEXT: srli s9, a1, 16 +; RV64I-NEXT: srli s10, a1, 8 +; RV64I-NEXT: srli s11, a0, 56 ; RV64I-NEXT: sb t0, 20(a2) ; RV64I-NEXT: sb a7, 21(a2) ; RV64I-NEXT: sb a6, 22(a2) ; RV64I-NEXT: sb a5, 23(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: srli a6, a4, 16 -; RV64I-NEXT: srli a7, a4, 8 +; RV64I-NEXT: srli a5, a0, 48 ; RV64I-NEXT: sb a4, 16(a2) -; RV64I-NEXT: sb a7, 17(a2) -; RV64I-NEXT: sb a6, 18(a2) -; RV64I-NEXT: sb a5, 19(a2) -; RV64I-NEXT: srli a4, a3, 56 -; RV64I-NEXT: srli a5, a3, 48 -; RV64I-NEXT: srli a6, a3, 40 -; RV64I-NEXT: srli a7, a3, 32 -; RV64I-NEXT: sb a7, 28(a2) -; RV64I-NEXT: sb a6, 29(a2) -; RV64I-NEXT: sb a5, 30(a2) -; RV64I-NEXT: sb a4, 31(a2) -; RV64I-NEXT: srli a4, a3, 24 -; RV64I-NEXT: srli a5, a3, 16 -; RV64I-NEXT: srli a6, a3, 8 +; RV64I-NEXT: sb t3, 17(a2) +; RV64I-NEXT: sb t2, 18(a2) +; RV64I-NEXT: sb t1, 19(a2) +; RV64I-NEXT: srli a4, a0, 40 +; RV64I-NEXT: sb s0, 28(a2) +; RV64I-NEXT: sb t6, 29(a2) +; RV64I-NEXT: sb t5, 30(a2) +; RV64I-NEXT: sb t4, 31(a2) +; RV64I-NEXT: srli a6, a0, 32 ; RV64I-NEXT: sb a3, 24(a2) -; RV64I-NEXT: sb a6, 25(a2) -; RV64I-NEXT: sb a5, 26(a2) -; RV64I-NEXT: sb a4, 27(a2) -; RV64I-NEXT: srli a3, a1, 56 -; RV64I-NEXT: srli a4, a1, 48 -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: srli a6, a1, 32 -; RV64I-NEXT: sb a6, 4(a2) -; RV64I-NEXT: sb a5, 5(a2) -; RV64I-NEXT: sb a4, 6(a2) -; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: sb s3, 25(a2) +; RV64I-NEXT: sb s2, 26(a2) +; RV64I-NEXT: sb s1, 27(a2) +; RV64I-NEXT: srli a3, a0, 24 +; RV64I-NEXT: sb s7, 4(a2) +; RV64I-NEXT: sb s6, 5(a2) +; RV64I-NEXT: sb s5, 6(a2) +; RV64I-NEXT: sb s4, 7(a2) +; RV64I-NEXT: srli a7, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: sb a4, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 12(a2) +; RV64I-NEXT: sb s10, 1(a2) +; RV64I-NEXT: sb s9, 2(a2) +; RV64I-NEXT: sb s8, 3(a2) +; RV64I-NEXT: srli a1, a0, 8 +; RV64I-NEXT: sb a6, 12(a2) ; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb a5, 14(a2) +; RV64I-NEXT: sb s11, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a1, 11(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb a7, 10(a2) +; RV64I-NEXT: sb a3, 11(a2) +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 
104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a6, 0(a0) +; RV32I-NEXT: lbu t0, 1(a0) +; RV32I-NEXT: lbu t1, 2(a0) +; RV32I-NEXT: lbu t6, 3(a0) +; RV32I-NEXT: lbu s7, 4(a0) +; RV32I-NEXT: lbu s8, 5(a0) +; RV32I-NEXT: lbu s3, 6(a0) +; RV32I-NEXT: lbu s5, 7(a0) +; RV32I-NEXT: lbu s4, 8(a0) +; RV32I-NEXT: lbu s9, 9(a0) +; RV32I-NEXT: lbu s10, 10(a0) +; RV32I-NEXT: lbu s11, 11(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: lbu s6, 13(a0) +; RV32I-NEXT: lbu a5, 14(a0) +; RV32I-NEXT: lbu a7, 15(a0) +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu t2, 17(a0) +; RV32I-NEXT: lbu t3, 18(a0) +; RV32I-NEXT: lbu t4, 19(a0) +; RV32I-NEXT: lbu a4, 20(a0) +; RV32I-NEXT: lbu t5, 21(a0) +; RV32I-NEXT: lbu s0, 22(a0) +; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or a6, t0, a6 +; RV32I-NEXT: or t0, t6, t1 +; RV32I-NEXT: or t1, s8, s7 +; RV32I-NEXT: lbu t6, 24(a0) +; RV32I-NEXT: lbu s7, 25(a0) +; RV32I-NEXT: lbu s8, 26(a0) +; RV32I-NEXT: lbu ra, 27(a0) +; RV32I-NEXT: slli s3, s3, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: or s3, s5, s3 +; RV32I-NEXT: or s4, s9, s4 +; RV32I-NEXT: or s5, s11, s10 +; RV32I-NEXT: lbu s9, 28(a0) +; RV32I-NEXT: lbu s10, 29(a0) +; RV32I-NEXT: lbu s11, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: or s2, s6, s2 +; RV32I-NEXT: addi s6, sp, 8 ; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; 
RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 18(a0) -; RV32I-NEXT: lbu t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t2, a7 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or a7, a7, t0 -; RV32I-NEXT: lbu t0, 22(a0) -; RV32I-NEXT: lbu t3, 23(a0) ; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t0, t3, t0 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: lbu t1, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or t1, t4, t1 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: lbu t2, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli ra, ra, 24 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s11, s11, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or t2, a0, t2 -; RV32I-NEXT: or t2, t2, t3 -; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a3, t2, a3 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or a4, t5, a4 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or t2, s7, t6 +; RV32I-NEXT: or t3, ra, s8 +; RV32I-NEXT: or t4, s10, s9 +; RV32I-NEXT: or t5, a0, s11 ; RV32I-NEXT: srai a0, a0, 31 -; RV32I-NEXT: sw a0, 48(sp) -; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: andi a1, a1, 24 +; RV32I-NEXT: or a6, t0, a6 +; RV32I-NEXT: or t0, s3, t1 +; RV32I-NEXT: or t1, s5, s4 +; RV32I-NEXT: or a5, a5, s2 +; RV32I-NEXT: or a3, a7, a3 +; RV32I-NEXT: or a4, s0, a4 +; RV32I-NEXT: or a7, t3, t2 +; RV32I-NEXT: or t2, t5, t4 ; RV32I-NEXT: sw a0, 56(sp) ; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a0, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a0, 64(sp) +; RV32I-NEXT: sw a0, 68(sp) ; RV32I-NEXT: sw a0, 40(sp) ; RV32I-NEXT: sw a0, 44(sp) -; RV32I-NEXT: sw a7, 16(sp) -; RV32I-NEXT: sw t0, 20(sp) -; RV32I-NEXT: sw t1, 24(sp) -; RV32I-NEXT: sw t2, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: andi a1, a1, 24 -; RV32I-NEXT: mv a0, sp -; RV32I-NEXT: add a4, a0, a1 -; RV32I-NEXT: lw a5, 16(a4) -; RV32I-NEXT: lw a6, 20(a4) -; RV32I-NEXT: lw a7, 24(a4) -; RV32I-NEXT: lw a1, 0(a4) -; RV32I-NEXT: lw a0, 4(a4) -; RV32I-NEXT: lw t0, 8(a4) -; RV32I-NEXT: lw a3, 12(a4) -; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: add s6, s6, a1 +; RV32I-NEXT: sw a3, 24(sp) +; RV32I-NEXT: sw a4, 28(sp) +; RV32I-NEXT: sw a7, 32(sp) +; RV32I-NEXT: sw t2, 36(sp) +; RV32I-NEXT: sw a6, 8(sp) +; RV32I-NEXT: sw t0, 12(sp) +; RV32I-NEXT: sw t1, 16(sp) +; RV32I-NEXT: sw a5, 
20(sp) +; RV32I-NEXT: lw a6, 16(s6) +; RV32I-NEXT: lw a5, 20(s6) +; RV32I-NEXT: lw a7, 24(s6) +; RV32I-NEXT: lw a1, 0(s6) +; RV32I-NEXT: lw a0, 4(s6) +; RV32I-NEXT: lw a4, 8(s6) +; RV32I-NEXT: lw a3, 12(s6) +; RV32I-NEXT: lw t0, 28(s6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 ; RV32I-NEXT: srli t3, a7, 8 +; RV32I-NEXT: srli t4, t0, 24 +; RV32I-NEXT: srli t5, t0, 16 +; RV32I-NEXT: srli t6, t0, 8 +; RV32I-NEXT: srli s0, a6, 24 +; RV32I-NEXT: srli s1, a6, 16 +; RV32I-NEXT: srli s2, a6, 8 +; RV32I-NEXT: srli s3, a5, 24 +; RV32I-NEXT: srli s4, a5, 16 +; RV32I-NEXT: srli s5, a5, 8 +; RV32I-NEXT: srli s6, a4, 24 +; RV32I-NEXT: srli s7, a4, 16 +; RV32I-NEXT: srli s8, a4, 8 +; RV32I-NEXT: srli s9, a3, 24 +; RV32I-NEXT: srli s10, a3, 16 +; RV32I-NEXT: srli s11, a3, 8 +; RV32I-NEXT: srli ra, a1, 24 ; RV32I-NEXT: sb a7, 24(a2) ; RV32I-NEXT: sb t3, 25(a2) ; RV32I-NEXT: sb t2, 26(a2) ; RV32I-NEXT: sb t1, 27(a2) -; RV32I-NEXT: srli a7, a4, 24 -; RV32I-NEXT: srli t1, a4, 16 -; RV32I-NEXT: srli t2, a4, 8 -; RV32I-NEXT: sb a4, 28(a2) -; RV32I-NEXT: sb t2, 29(a2) -; RV32I-NEXT: sb t1, 30(a2) -; RV32I-NEXT: sb a7, 31(a2) -; RV32I-NEXT: srli a4, a5, 24 -; RV32I-NEXT: srli a7, a5, 16 -; RV32I-NEXT: srli t1, a5, 8 -; RV32I-NEXT: sb a5, 16(a2) -; RV32I-NEXT: sb t1, 17(a2) -; RV32I-NEXT: sb a7, 18(a2) -; RV32I-NEXT: sb a4, 19(a2) -; RV32I-NEXT: srli a4, a6, 24 -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: srli a7, a6, 8 -; RV32I-NEXT: sb a6, 20(a2) -; RV32I-NEXT: sb a7, 21(a2) -; RV32I-NEXT: sb a5, 22(a2) -; RV32I-NEXT: sb a4, 23(a2) -; RV32I-NEXT: srli a4, t0, 24 -; RV32I-NEXT: srli a5, t0, 16 -; RV32I-NEXT: srli a6, t0, 8 -; RV32I-NEXT: sb t0, 8(a2) -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: sb a4, 11(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: sb t0, 28(a2) +; RV32I-NEXT: sb t6, 29(a2) +; RV32I-NEXT: sb t5, 30(a2) +; RV32I-NEXT: sb t4, 31(a2) +; RV32I-NEXT: srli t0, a1, 8 +; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: sb s2, 17(a2) +; RV32I-NEXT: sb s1, 18(a2) +; RV32I-NEXT: sb s0, 19(a2) +; RV32I-NEXT: srli a6, a0, 24 +; RV32I-NEXT: sb a5, 20(a2) +; RV32I-NEXT: sb s5, 21(a2) +; RV32I-NEXT: sb s4, 22(a2) +; RV32I-NEXT: sb s3, 23(a2) +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: sb a4, 8(a2) +; RV32I-NEXT: sb s8, 9(a2) +; RV32I-NEXT: sb s7, 10(a2) +; RV32I-NEXT: sb s6, 11(a2) +; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: sb a4, 15(a2) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: sb s11, 13(a2) +; RV32I-NEXT: sb s10, 14(a2) +; RV32I-NEXT: sb s9, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a4, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb a7, 2(a2) +; RV32I-NEXT: sb ra, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte 
Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index d36c660b3b142..b2c130c2d7c10 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -11,8 +11,8 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lb a0, 3(a0) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 @@ -29,25 +29,25 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: lshr_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 1(a1) -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: lbu a3, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) ; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a0, a4, a0 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: srli a3, a0, 24 @@ -72,8 +72,8 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lb a0, 3(a0) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 @@ -90,25 +90,25 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: shl_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 1(a1) -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: lbu a3, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; 
RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) ; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a0, a4, a0 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: srli a3, a0, 24 @@ -133,8 +133,8 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lb a0, 3(a0) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 @@ -151,25 +151,25 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: ashr_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 1(a1) -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: lbu a3, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) ; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a0, a4, a0 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: srli a3, a0, 24 @@ -189,93 +189,93 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 1(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t2, 3(a1) +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a4 +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: lbu t1, 5(a1) +; RV64I-NEXT: 
or t0, t2, t0 +; RV64I-NEXT: lbu t2, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a4, t0, a6 +; RV64I-NEXT: or a1, a1, a7 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a4, 0(a1) -; RV64I-NEXT: lbu a5, 1(a1) +; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lbu a3, 2(a1) -; RV64I-NEXT: lbu a6, 3(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: slli a3, a3, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 5(a1) -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: lbu a4, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: srl a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 48 ; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: srli a4, a0, 32 ; RV64I-NEXT: srli a5, a0, 40 +; RV64I-NEXT: srli a6, a0, 16 +; RV64I-NEXT: srli a7, a0, 24 +; RV64I-NEXT: srli t0, a0, 8 ; RV64I-NEXT: sb a4, 4(a2) ; RV64I-NEXT: sb a5, 5(a2) ; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: srli a3, a0, 24 -; RV64I-NEXT: srli a4, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: sb t0, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb a7, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_8bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: lbu a4, 4(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 7(a0) +; RV32I-NEXT: lbu a4, 6(a0) +; RV32I-NEXT: lbu a5, 7(a0) +; RV32I-NEXT: lbu a6, 4(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: or a5, a1, a5 -; RV32I-NEXT: addi a4, a5, -32 -; RV32I-NEXT: srl a1, a3, a5 -; RV32I-NEXT: bltz a4, .LBB3_2 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a5, a4, a3 +; RV32I-NEXT: or a4, a1, a6 +; RV32I-NEXT: addi a3, a4, -32 +; RV32I-NEXT: srl a1, a5, a4 +; RV32I-NEXT: bltz a3, .LBB3_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: j .LBB3_3 @@ -289,29 +289,29 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: not a7, a4 +; RV32I-NEXT: slli a5, a5, 1 ; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: srl a0, a0, a5 -; RV32I-NEXT: not a5, a5 -; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: sll a3, a3, a5 -; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: srl a0, a0, a4 +; RV32I-NEXT: 
sll a4, a5, a7 +; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: .LBB3_3: -; RV32I-NEXT: srai a4, a4, 31 -; RV32I-NEXT: and a1, a4, a1 +; RV32I-NEXT: srai a3, a3, 31 +; RV32I-NEXT: srli a4, a0, 16 +; RV32I-NEXT: srli a5, a0, 24 +; RV32I-NEXT: and a1, a3, a1 ; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: srli a4, a1, 24 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: srli a6, a1, 24 +; RV32I-NEXT: srli a7, a1, 8 ; RV32I-NEXT: sb a1, 4(a2) -; RV32I-NEXT: sb a5, 5(a2) +; RV32I-NEXT: sb a7, 5(a2) ; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: sb a1, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb a5, 3(a2) ; RV32I-NEXT: ret %src = load i64, ptr %src.ptr, align 1 %bitOff = load i64, ptr %bitOff.ptr, align 1 @@ -322,93 +322,93 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 1(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t2, 3(a1) +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a4 +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: lbu t1, 5(a1) +; RV64I-NEXT: or t0, t2, t0 +; RV64I-NEXT: lbu t2, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a4, t0, a6 +; RV64I-NEXT: or a1, a1, a7 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a4, 0(a1) -; RV64I-NEXT: lbu a5, 1(a1) +; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lbu a3, 2(a1) -; RV64I-NEXT: lbu a6, 3(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: slli a3, a3, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 5(a1) -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: lbu a4, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: srli a1, 
a0, 48 ; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: srli a4, a0, 32 ; RV64I-NEXT: srli a5, a0, 40 +; RV64I-NEXT: srli a6, a0, 16 +; RV64I-NEXT: srli a7, a0, 24 +; RV64I-NEXT: srli t0, a0, 8 ; RV64I-NEXT: sb a4, 4(a2) ; RV64I-NEXT: sb a5, 5(a2) ; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: srli a3, a0, 24 -; RV64I-NEXT: srli a4, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: sb t0, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb a7, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_8bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a6, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: or a5, a1, a5 -; RV32I-NEXT: addi a4, a5, -32 -; RV32I-NEXT: sll a1, a3, a5 -; RV32I-NEXT: bltz a4, .LBB4_2 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a5, a4, a3 +; RV32I-NEXT: or a4, a1, a6 +; RV32I-NEXT: addi a3, a4, -32 +; RV32I-NEXT: sll a1, a5, a4 +; RV32I-NEXT: bltz a3, .LBB4_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: j .LBB4_3 @@ -422,29 +422,29 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: not a7, a4 +; RV32I-NEXT: srli a5, a5, 1 ; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: sll a0, a0, a5 -; RV32I-NEXT: not a5, a5 -; RV32I-NEXT: srli a3, a3, 1 -; RV32I-NEXT: srl a3, a3, a5 -; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: sll a0, a0, a4 +; RV32I-NEXT: srl a4, a5, a7 +; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: .LBB4_3: -; RV32I-NEXT: srai a4, a4, 31 -; RV32I-NEXT: and a1, a4, a1 +; RV32I-NEXT: srai a3, a3, 31 +; RV32I-NEXT: srli a4, a0, 16 +; RV32I-NEXT: srli a5, a0, 24 +; RV32I-NEXT: and a1, a3, a1 ; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: srli a4, a1, 24 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: srli a6, a1, 24 +; RV32I-NEXT: srli a7, a1, 8 ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: sb a7, 1(a2) ; RV32I-NEXT: sb a3, 2(a2) -; RV32I-NEXT: sb a4, 3(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb a6, 3(a2) +; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a1, 6(a2) -; RV32I-NEXT: sb a3, 7(a2) +; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: sb a4, 6(a2) +; RV32I-NEXT: sb a5, 7(a2) ; RV32I-NEXT: ret %src = load i64, ptr %src.ptr, align 1 %bitOff = load i64, ptr %bitOff.ptr, align 1 @@ -455,66 +455,66 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_8bytes(ptr %src.ptr, ptr 
%bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 1(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t2, 3(a1) +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a4 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a4, 0(a1) -; RV64I-NEXT: lbu a5, 1(a1) -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lbu a3, 2(a1) -; RV64I-NEXT: lbu a6, 3(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: slli a3, a3, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 5(a1) -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: lbu a4, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: lbu t1, 5(a1) +; RV64I-NEXT: or t0, t2, t0 +; RV64I-NEXT: lbu t2, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a4, t0, a6 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: sra a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 48 ; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: srli a4, a0, 32 ; RV64I-NEXT: srli a5, a0, 40 +; RV64I-NEXT: srli a6, a0, 16 +; RV64I-NEXT: srli a7, a0, 24 +; RV64I-NEXT: srli t0, a0, 8 ; RV64I-NEXT: sb a4, 4(a2) ; RV64I-NEXT: sb a5, 5(a2) ; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: srli a3, a0, 24 -; RV64I-NEXT: srli a4, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: sb t0, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb a7, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_8bytes: @@ -524,60 +524,60 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a5, 6(a0) ; RV32I-NEXT: lbu a6, 7(a0) ; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: lbu a7, 0(a1) +; RV32I-NEXT: lbu t0, 1(a1) ; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a4, a6, 24 -; RV32I-NEXT: or a5, a4, a5 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: or a3, 
a5, a3 -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a5, a1, a6 -; RV32I-NEXT: addi a6, a5, -32 -; RV32I-NEXT: sra a1, a3, a5 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: slli a4, a5, 16 +; RV32I-NEXT: slli a5, a6, 24 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: or a4, a4, a3 +; RV32I-NEXT: or a3, a1, a7 +; RV32I-NEXT: addi a6, a3, -32 +; RV32I-NEXT: sra a1, a4, a3 ; RV32I-NEXT: bltz a6, .LBB5_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srai a4, a4, 31 +; RV32I-NEXT: srai a5, a5, 31 ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: j .LBB5_3 ; RV32I-NEXT: .LBB5_2: -; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 1(a0) ; RV32I-NEXT: lbu a6, 0(a0) ; RV32I-NEXT: lbu a7, 2(a0) ; RV32I-NEXT: lbu a0, 3(a0) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a6 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a7 -; RV32I-NEXT: or a0, a0, a4 -; RV32I-NEXT: srl a0, a0, a5 -; RV32I-NEXT: not a4, a5 -; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: sll a3, a3, a4 +; RV32I-NEXT: not a6, a3 +; RV32I-NEXT: slli a4, a4, 1 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: srl a0, a0, a3 +; RV32I-NEXT: sll a3, a4, a6 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: .LBB5_3: ; RV32I-NEXT: srli a3, a1, 16 ; RV32I-NEXT: srli a4, a1, 24 ; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: srli a6, a0, 16 +; RV32I-NEXT: srli a7, a0, 24 ; RV32I-NEXT: sb a1, 4(a2) ; RV32I-NEXT: sb a5, 5(a2) ; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: sb a1, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: sb a6, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) ; RV32I-NEXT: ret %src = load i64, ptr %src.ptr, align 1 %bitOff = load i64, ptr %bitOff.ptr, align 1 @@ -589,231 +589,231 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 9(a0) -; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) ; RV64I-NEXT: lbu a5, 10(a0) ; RV64I-NEXT: lbu a6, 11(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 14(a0) -; RV64I-NEXT: lbu a7, 15(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 0(a1) -; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 2(a1) -; RV64I-NEXT: lbu a7, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, 
a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 4(a1) -; RV64I-NEXT: lbu a7, 5(a1) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 1(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t3, 3(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: lbu t2, 5(a1) +; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: lbu t3, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a5, a1, a4 -; RV64I-NEXT: addi a4, a5, -64 -; RV64I-NEXT: srl a1, a3, a5 -; RV64I-NEXT: bltz a4, .LBB6_2 +; RV64I-NEXT: or a5, a4, a3 +; RV64I-NEXT: or a4, a1, a6 +; RV64I-NEXT: addi a3, a4, -64 +; RV64I-NEXT: srl a1, a5, a4 +; RV64I-NEXT: bltz a3, .LBB6_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB6_3 ; RV64I-NEXT: .LBB6_2: ; RV64I-NEXT: lbu a6, 1(a0) -; RV64I-NEXT: lbu a7, 0(a0) -; RV64I-NEXT: lbu t0, 2(a0) -; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: lbu t1, 0(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a7, t1, t0 -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu t1, 5(a0) -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: lbu t1, 4(a0) +; RV64I-NEXT: lbu t2, 5(a0) +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: not a7, a4 +; RV64I-NEXT: slli a5, a5, 1 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: srl a0, a0, a5 -; RV64I-NEXT: not a5, a5 -; RV64I-NEXT: slli a3, a3, 1 -; RV64I-NEXT: sll a3, a3, a5 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: srl a0, a0, a4 +; RV64I-NEXT: sll a4, a5, a7 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB6_3: -; RV64I-NEXT: srai a4, a4, 63 -; RV64I-NEXT: and a1, a4, a1 +; RV64I-NEXT: srai a3, a3, 63 +; RV64I-NEXT: srli a4, a0, 56 +; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a6, a0, 40 +; RV64I-NEXT: srli a7, a0, 32 +; RV64I-NEXT: srli t0, a0, 24 +; RV64I-NEXT: srli t1, a0, 16 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: sb a7, 4(a2) +; RV64I-NEXT: sb a6, 5(a2) +; RV64I-NEXT: sb a5, 6(a2) +; RV64I-NEXT: sb a4, 7(a2) ; RV64I-NEXT: srli a3, a1, 56 ; RV64I-NEXT: srli a4, a1, 48 ; RV64I-NEXT: srli a5, a1, 40 ; RV64I-NEXT: srli a6, a1, 32 +; RV64I-NEXT: srli a7, a1, 24 +; RV64I-NEXT: srli t2, a1, 16 ; 
RV64I-NEXT: sb a6, 12(a2) ; RV64I-NEXT: sb a5, 13(a2) ; RV64I-NEXT: sb a4, 14(a2) ; RV64I-NEXT: sb a3, 15(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: srli a3, a1, 8 ; RV64I-NEXT: sb a1, 8(a2) -; RV64I-NEXT: sb a5, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) -; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 4(a2) -; RV64I-NEXT: sb a4, 5(a2) -; RV64I-NEXT: sb a3, 6(a2) -; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb a3, 9(a2) +; RV64I-NEXT: sb t2, 10(a2) +; RV64I-NEXT: sb a7, 11(a2) +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a3, 2(a2) -; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: sb a1, 1(a2) +; RV64I-NEXT: sb t1, 2(a2) +; RV64I-NEXT: sb t0, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_16bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: lbu a5, 8(a0) +; RV32I-NEXT: lbu a6, 9(a0) +; RV32I-NEXT: lbu t3, 10(a0) +; RV32I-NEXT: lbu t4, 11(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) +; RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu t1, 13(a0) +; RV32I-NEXT: lbu t2, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu t0, 1(a1) -; RV32I-NEXT: or a0, a0, a7 -; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or a6, t1, a6 +; RV32I-NEXT: lbu t1, 0(a1) +; RV32I-NEXT: lbu t4, 1(a1) +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu t2, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a7 -; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or t1, t4, t1 ; RV32I-NEXT: sw zero, 16(sp) ; RV32I-NEXT: sw zero, 
20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t2 +; RV32I-NEXT: mv t2, sp +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, t0, a7 +; RV32I-NEXT: or a5, t3, a5 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: sw a3, 0(sp) ; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a0, 12(sp) ; RV32I-NEXT: srli a0, a1, 3 +; RV32I-NEXT: andi a3, a1, 31 ; RV32I-NEXT: andi a0, a0, 12 -; RV32I-NEXT: mv a3, sp -; RV32I-NEXT: add a0, a3, a0 -; RV32I-NEXT: lw a3, 4(a0) +; RV32I-NEXT: add a0, t2, a0 ; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 8(a0) +; RV32I-NEXT: lw a5, 4(a0) +; RV32I-NEXT: lw a6, 8(a0) +; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: srl a6, a3, a1 -; RV32I-NEXT: andi a7, a1, 31 -; RV32I-NEXT: xori a7, a7, 31 -; RV32I-NEXT: slli t0, a5, 1 -; RV32I-NEXT: sll t0, t0, a7 -; RV32I-NEXT: or a6, a6, t0 +; RV32I-NEXT: srl a7, a5, a1 +; RV32I-NEXT: slli t0, a6, 1 ; RV32I-NEXT: srl a4, a4, a1 -; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: sll a3, a3, a7 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: srl a4, a5, a1 -; RV32I-NEXT: slli a5, a0, 1 -; RV32I-NEXT: sll a5, a5, a7 -; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl a6, a6, a1 +; RV32I-NEXT: slli t1, a0, 1 ; RV32I-NEXT: srl a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a5, a0, 24 -; RV32I-NEXT: srli a7, a0, 8 +; RV32I-NEXT: sll a1, t0, a3 +; RV32I-NEXT: sll a5, a5, a3 +; RV32I-NEXT: sll a3, t1, a3 +; RV32I-NEXT: srli t0, a0, 16 +; RV32I-NEXT: srli t1, a0, 24 +; RV32I-NEXT: srli t2, a0, 8 +; RV32I-NEXT: or a1, a7, a1 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: or a3, a6, a3 ; RV32I-NEXT: sb a0, 12(a2) -; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: sb a1, 14(a2) -; RV32I-NEXT: sb a5, 15(a2) -; RV32I-NEXT: srli a0, a4, 16 -; RV32I-NEXT: srli a1, a4, 24 -; RV32I-NEXT: srli a5, a4, 8 -; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a5, 9(a2) -; RV32I-NEXT: sb a0, 10(a2) -; RV32I-NEXT: sb a1, 11(a2) +; RV32I-NEXT: sb t2, 13(a2) +; RV32I-NEXT: sb t0, 14(a2) +; RV32I-NEXT: sb t1, 15(a2) ; RV32I-NEXT: srli a0, a3, 16 -; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: srli a4, a3, 8 -; RV32I-NEXT: sb a3, 0(a2) -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: sb a0, 2(a2) -; RV32I-NEXT: sb a1, 3(a2) -; RV32I-NEXT: srli a0, a6, 16 -; RV32I-NEXT: srli a1, a6, 24 -; RV32I-NEXT: srli a3, a6, 8 -; RV32I-NEXT: sb a6, 4(a2) -; RV32I-NEXT: sb a3, 5(a2) -; RV32I-NEXT: sb a0, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a5, a3, 24 +; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli a7, a4, 16 +; RV32I-NEXT: srli t0, a4, 24 +; RV32I-NEXT: srli t1, a4, 8 +; RV32I-NEXT: srli t2, a1, 16 +; RV32I-NEXT: srli t3, a1, 24 +; RV32I-NEXT: sb a3, 8(a2) +; RV32I-NEXT: sb a6, 9(a2) +; RV32I-NEXT: sb a0, 10(a2) +; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a0, a1, 8 +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb t1, 1(a2) +; RV32I-NEXT: sb a7, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: sb a1, 4(a2) +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: sb t2, 6(a2) +; RV32I-NEXT: sb t3, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -825,231 +825,231 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: 
lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 0(a1) -; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 2(a1) -; RV64I-NEXT: lbu a7, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 4(a1) -; RV64I-NEXT: lbu a7, 5(a1) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 1(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t3, 3(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: lbu t2, 5(a1) +; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: lbu t3, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a5, a1, a4 -; RV64I-NEXT: addi a4, a5, -64 -; RV64I-NEXT: sll a1, a3, a5 -; RV64I-NEXT: bltz a4, .LBB7_2 +; RV64I-NEXT: or a5, a4, a3 +; RV64I-NEXT: or a4, a1, a6 +; RV64I-NEXT: addi a3, a4, -64 +; RV64I-NEXT: sll a1, a5, a4 +; RV64I-NEXT: bltz a3, .LBB7_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB7_3 ; RV64I-NEXT: .LBB7_2: ; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: lbu a7, 8(a0) -; RV64I-NEXT: lbu t0, 10(a0) -; RV64I-NEXT: lbu t1, 11(a0) +; RV64I-NEXT: lbu a7, 10(a0) +; RV64I-NEXT: lbu t0, 11(a0) +; RV64I-NEXT: lbu t1, 8(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a7, t1, t0 -; RV64I-NEXT: lbu t0, 12(a0) -; RV64I-NEXT: lbu t1, 13(a0) -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 14(a0) -; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: lbu t1, 12(a0) +; RV64I-NEXT: lbu t2, 13(a0) +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: 
not a7, a4 +; RV64I-NEXT: srli a5, a5, 1 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: sll a0, a0, a5 -; RV64I-NEXT: not a5, a5 -; RV64I-NEXT: srli a3, a3, 1 -; RV64I-NEXT: srl a3, a3, a5 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: sll a0, a0, a4 +; RV64I-NEXT: srl a4, a5, a7 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB7_3: -; RV64I-NEXT: srai a4, a4, 63 -; RV64I-NEXT: and a1, a4, a1 +; RV64I-NEXT: srai a3, a3, 63 +; RV64I-NEXT: srli a4, a0, 56 +; RV64I-NEXT: srli a5, a0, 48 +; RV64I-NEXT: srli a6, a0, 40 +; RV64I-NEXT: srli a7, a0, 32 +; RV64I-NEXT: srli t0, a0, 24 +; RV64I-NEXT: srli t1, a0, 16 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a6, 13(a2) +; RV64I-NEXT: sb a5, 14(a2) +; RV64I-NEXT: sb a4, 15(a2) ; RV64I-NEXT: srli a3, a1, 56 ; RV64I-NEXT: srli a4, a1, 48 ; RV64I-NEXT: srli a5, a1, 40 ; RV64I-NEXT: srli a6, a1, 32 +; RV64I-NEXT: srli a7, a1, 24 +; RV64I-NEXT: srli t2, a1, 16 ; RV64I-NEXT: sb a6, 4(a2) ; RV64I-NEXT: sb a5, 5(a2) ; RV64I-NEXT: sb a4, 6(a2) ; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: srli a3, a1, 8 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: sb a4, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb a3, 1(a2) +; RV64I-NEXT: sb t2, 2(a2) +; RV64I-NEXT: sb a7, 3(a2) +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb t1, 10(a2) +; RV64I-NEXT: sb t0, 11(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_16bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: lbu a5, 8(a0) +; RV32I-NEXT: lbu a6, 9(a0) +; RV32I-NEXT: lbu t3, 10(a0) +; RV32I-NEXT: lbu t4, 11(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) +; 
RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu t1, 13(a0) +; RV32I-NEXT: lbu t2, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu t0, 1(a1) -; RV32I-NEXT: or a0, a0, a7 -; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or a6, t1, a6 +; RV32I-NEXT: lbu t1, 0(a1) +; RV32I-NEXT: lbu t4, 1(a1) +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu t2, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a7 -; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or t1, t4, t1 ; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t2 +; RV32I-NEXT: addi t2, sp, 16 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, t0, a7 +; RV32I-NEXT: or a5, t3, a5 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: sw a3, 16(sp) ; RV32I-NEXT: sw a4, 20(sp) ; RV32I-NEXT: sw a5, 24(sp) ; RV32I-NEXT: sw a0, 28(sp) ; RV32I-NEXT: srli a0, a1, 3 +; RV32I-NEXT: andi a3, a1, 31 ; RV32I-NEXT: andi a0, a0, 12 -; RV32I-NEXT: addi a3, sp, 16 -; RV32I-NEXT: sub a3, a3, a0 -; RV32I-NEXT: lw a0, 4(a3) -; RV32I-NEXT: lw a4, 0(a3) -; RV32I-NEXT: lw a5, 8(a3) -; RV32I-NEXT: lw a3, 12(a3) -; RV32I-NEXT: sll a6, a0, a1 -; RV32I-NEXT: andi a7, a1, 31 -; RV32I-NEXT: xori a7, a7, 31 +; RV32I-NEXT: sub a0, t2, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) +; RV32I-NEXT: lw a6, 8(a0) +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: sll a7, a5, a1 ; RV32I-NEXT: srli t0, a4, 1 -; RV32I-NEXT: srl t0, t0, a7 -; RV32I-NEXT: or a6, a6, t0 -; RV32I-NEXT: sll a3, a3, a1 -; RV32I-NEXT: srli t0, a5, 1 -; RV32I-NEXT: srl t0, t0, a7 -; RV32I-NEXT: or a3, a3, t0 -; RV32I-NEXT: sll a5, a5, a1 -; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: srl a0, a0, a7 -; RV32I-NEXT: or a0, a5, a0 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: srli t1, a6, 1 +; RV32I-NEXT: sll a6, a6, a1 +; RV32I-NEXT: srli a5, a5, 1 ; RV32I-NEXT: sll a1, a4, a1 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a5, a1, 24 -; RV32I-NEXT: srli a7, a1, 8 +; RV32I-NEXT: srl a4, t0, a3 +; RV32I-NEXT: srl t0, t1, a3 +; RV32I-NEXT: srl a3, a5, a3 +; RV32I-NEXT: srli a5, a1, 16 +; RV32I-NEXT: srli t1, a1, 24 +; RV32I-NEXT: srli t2, a1, 8 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: or a3, a6, a3 ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: sb a4, 2(a2) -; RV32I-NEXT: sb a5, 3(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a4, a0, 24 -; RV32I-NEXT: srli a5, a0, 8 -; RV32I-NEXT: sb a0, 8(a2) -; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: sb t2, 1(a2) +; RV32I-NEXT: sb a5, 2(a2) +; RV32I-NEXT: sb t1, 3(a2) +; RV32I-NEXT: srli a1, a3, 16 +; RV32I-NEXT: srli a5, a3, 24 +; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli a7, a0, 16 +; RV32I-NEXT: srli t0, a0, 24 +; RV32I-NEXT: srli t1, a0, 8 +; RV32I-NEXT: srli t2, a4, 16 +; RV32I-NEXT: srli t3, a4, 24 +; RV32I-NEXT: sb a3, 8(a2) +; RV32I-NEXT: sb a6, 9(a2) ; RV32I-NEXT: sb a1, 10(a2) -; RV32I-NEXT: sb a4, 11(a2) -; RV32I-NEXT: 
srli a0, a3, 16 -; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: srli a4, a3, 8 -; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a4, 13(a2) -; RV32I-NEXT: sb a0, 14(a2) -; RV32I-NEXT: sb a1, 15(a2) -; RV32I-NEXT: srli a0, a6, 16 -; RV32I-NEXT: srli a1, a6, 24 -; RV32I-NEXT: srli a3, a6, 8 -; RV32I-NEXT: sb a6, 4(a2) -; RV32I-NEXT: sb a3, 5(a2) -; RV32I-NEXT: sb a0, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a1, a4, 8 +; RV32I-NEXT: sb a0, 12(a2) +; RV32I-NEXT: sb t1, 13(a2) +; RV32I-NEXT: sb a7, 14(a2) +; RV32I-NEXT: sb t0, 15(a2) +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: sb t2, 6(a2) +; RV32I-NEXT: sb t3, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -1061,232 +1061,232 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 9(a0) -; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) ; RV64I-NEXT: lbu a5, 10(a0) ; RV64I-NEXT: lbu a6, 11(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 14(a0) -; RV64I-NEXT: lbu a7, 15(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 ; RV64I-NEXT: lbu a6, 0(a1) ; RV64I-NEXT: lbu a7, 1(a1) -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: lbu a5, 2(a1) -; RV64I-NEXT: lbu t0, 3(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t3, 3(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t3, t3, 24 +; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 ; RV64I-NEXT: lbu a7, 4(a1) -; RV64I-NEXT: lbu t0, 5(a1) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 6(a1) +; RV64I-NEXT: lbu t2, 5(a1) +; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: lbu t3, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a5, t1, a5 +; RV64I-NEXT: or a6, t0, a6 ; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a4, a5, 32 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a5, a1, a5 -; RV64I-NEXT: addi a6, a5, -64 -; RV64I-NEXT: sra a1, a3, a5 +; RV64I-NEXT: or a4, a4, a3 +; RV64I-NEXT: or a3, a1, a6 +; RV64I-NEXT: addi a6, a3, -64 +; RV64I-NEXT: sra a1, a4, a3 ; RV64I-NEXT: bltz a6, .LBB8_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sraiw a3, a4, 31 +; RV64I-NEXT: sraiw a3, a5, 31 ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: mv a1, a3 ; RV64I-NEXT: j .LBB8_3 ; RV64I-NEXT: .LBB8_2: -; RV64I-NEXT: lbu a4, 1(a0) -; 
RV64I-NEXT: lbu a6, 0(a0) -; RV64I-NEXT: lbu a7, 2(a0) -; RV64I-NEXT: lbu t0, 3(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a6 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, t0, a7 -; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: lbu t0, 5(a0) -; RV64I-NEXT: or a4, a6, a4 -; RV64I-NEXT: lbu a6, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu a5, 1(a0) +; RV64I-NEXT: lbu a6, 2(a0) +; RV64I-NEXT: lbu a7, 3(a0) +; RV64I-NEXT: lbu t0, 0(a0) +; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a5, t0 +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu t1, 5(a0) +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: not a6, a3 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a0, a0, a4 -; RV64I-NEXT: srl a0, a0, a5 -; RV64I-NEXT: not a4, a5 -; RV64I-NEXT: slli a3, a3, 1 -; RV64I-NEXT: sll a3, a3, a4 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: srl a0, a0, a3 +; RV64I-NEXT: sll a3, a4, a6 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: .LBB8_3: ; RV64I-NEXT: srli a3, a1, 56 ; RV64I-NEXT: srli a4, a1, 48 ; RV64I-NEXT: srli a5, a1, 40 ; RV64I-NEXT: srli a6, a1, 32 +; RV64I-NEXT: srli a7, a1, 24 +; RV64I-NEXT: srli t0, a1, 16 +; RV64I-NEXT: srli t1, a1, 8 +; RV64I-NEXT: srli t2, a0, 56 +; RV64I-NEXT: srli t3, a0, 48 +; RV64I-NEXT: srli t4, a0, 40 +; RV64I-NEXT: srli t5, a0, 32 ; RV64I-NEXT: sb a6, 12(a2) ; RV64I-NEXT: sb a5, 13(a2) ; RV64I-NEXT: sb a4, 14(a2) ; RV64I-NEXT: sb a3, 15(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: srli a3, a0, 24 ; RV64I-NEXT: sb a1, 8(a2) -; RV64I-NEXT: sb a5, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) -; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 4(a2) -; RV64I-NEXT: sb a4, 5(a2) -; RV64I-NEXT: sb a3, 6(a2) -; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 +; RV64I-NEXT: sb t1, 9(a2) +; RV64I-NEXT: sb t0, 10(a2) +; RV64I-NEXT: sb a7, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb t5, 4(a2) +; RV64I-NEXT: sb t4, 5(a2) +; RV64I-NEXT: sb t3, 6(a2) +; RV64I-NEXT: sb t2, 7(a2) ; RV64I-NEXT: srli a4, a0, 8 ; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: sb a3, 2(a2) -; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: sb a3, 3(a2) ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_16bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 8(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t4, 10(a0) +; RV32I-NEXT: lbu t5, 11(a0) ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; 
RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: lbu t0, 12(a0) +; RV32I-NEXT: lbu t1, 13(a0) +; RV32I-NEXT: lbu t2, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a6, a0, a6 -; RV32I-NEXT: lbu t0, 0(a1) -; RV32I-NEXT: lbu t1, 1(a1) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 ; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or a4, t3, a4 +; RV32I-NEXT: or t3, t5, t4 +; RV32I-NEXT: lbu t4, 0(a1) +; RV32I-NEXT: lbu t5, 1(a1) ; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: lbu t1, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a7 -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: or a1, a1, t1 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: mv a5, sp +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or t1, a0, t2 ; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a4, t3, a4 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a1, a1, t4 ; RV32I-NEXT: sw a0, 16(sp) ; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: sw a0, 24(sp) ; RV32I-NEXT: sw a0, 28(sp) ; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a6, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a7, 12(sp) ; RV32I-NEXT: srli a0, a1, 3 +; RV32I-NEXT: andi a3, a1, 31 ; RV32I-NEXT: andi a0, a0, 12 -; RV32I-NEXT: mv a3, sp -; RV32I-NEXT: add a0, a3, a0 -; RV32I-NEXT: lw a3, 4(a0) +; RV32I-NEXT: add a0, a5, a0 ; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 8(a0) +; RV32I-NEXT: lw a5, 4(a0) +; RV32I-NEXT: lw a6, 8(a0) +; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: srl a6, a3, a1 -; RV32I-NEXT: andi a7, a1, 31 -; RV32I-NEXT: xori a7, a7, 31 -; RV32I-NEXT: slli t0, a5, 1 -; RV32I-NEXT: sll t0, t0, a7 -; RV32I-NEXT: or a6, a6, t0 +; RV32I-NEXT: srl a7, a5, a1 +; RV32I-NEXT: slli t0, a6, 1 ; RV32I-NEXT: srl a4, a4, a1 -; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: sll a3, a3, a7 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: srl a4, a5, a1 -; RV32I-NEXT: slli a5, a0, 1 -; RV32I-NEXT: sll a5, a5, a7 -; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl a6, a6, a1 +; RV32I-NEXT: slli t1, a0, 1 ; RV32I-NEXT: sra a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: srli a5, a0, 24 -; RV32I-NEXT: srli a7, a0, 8 +; RV32I-NEXT: sll a1, t0, a3 +; RV32I-NEXT: sll a5, a5, a3 
+; RV32I-NEXT: sll a3, t1, a3 +; RV32I-NEXT: srli t0, a0, 16 +; RV32I-NEXT: srli t1, a0, 24 +; RV32I-NEXT: srli t2, a0, 8 +; RV32I-NEXT: or a1, a7, a1 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: or a3, a6, a3 ; RV32I-NEXT: sb a0, 12(a2) -; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: sb a1, 14(a2) -; RV32I-NEXT: sb a5, 15(a2) -; RV32I-NEXT: srli a0, a4, 16 -; RV32I-NEXT: srli a1, a4, 24 -; RV32I-NEXT: srli a5, a4, 8 -; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a5, 9(a2) -; RV32I-NEXT: sb a0, 10(a2) -; RV32I-NEXT: sb a1, 11(a2) +; RV32I-NEXT: sb t2, 13(a2) +; RV32I-NEXT: sb t0, 14(a2) +; RV32I-NEXT: sb t1, 15(a2) ; RV32I-NEXT: srli a0, a3, 16 -; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: srli a4, a3, 8 -; RV32I-NEXT: sb a3, 0(a2) -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: sb a0, 2(a2) -; RV32I-NEXT: sb a1, 3(a2) -; RV32I-NEXT: srli a0, a6, 16 -; RV32I-NEXT: srli a1, a6, 24 -; RV32I-NEXT: srli a3, a6, 8 -; RV32I-NEXT: sb a6, 4(a2) -; RV32I-NEXT: sb a3, 5(a2) -; RV32I-NEXT: sb a0, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a5, a3, 24 +; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli a7, a4, 16 +; RV32I-NEXT: srli t0, a4, 24 +; RV32I-NEXT: srli t1, a4, 8 +; RV32I-NEXT: srli t2, a1, 16 +; RV32I-NEXT: srli t3, a1, 24 +; RV32I-NEXT: sb a3, 8(a2) +; RV32I-NEXT: sb a6, 9(a2) +; RV32I-NEXT: sb a0, 10(a2) +; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a0, a1, 8 +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb t1, 1(a2) +; RV32I-NEXT: sb a7, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: sb a1, 4(a2) +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: sb t2, 6(a2) +; RV32I-NEXT: sb t3, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -1299,420 +1299,472 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli s8, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 
-; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t0, 1(a1) -; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: lbu a6, 2(a1) -; RV64I-NEXT: lbu t1, 3(a1) ; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 4(a1) -; RV64I-NEXT: lbu t1, 5(a1) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 6(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, a4, a3 +; RV64I-NEXT: or a6, a6, s8 +; RV64I-NEXT: or a3, t0, a7 +; RV64I-NEXT: or a4, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t3, s5, s4 +; RV64I-NEXT: or t4, s7, s6 +; 
RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu s6, 31(a0) +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: lbu s6, 5(a1) +; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli s6, s6, 8 +; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a7 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a6, a1, a6 +; RV64I-NEXT: or a1, a1, s7 ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: mv a6, sp +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: or t0, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t1, s0, t6 +; RV64I-NEXT: or t2, s5, s1 +; RV64I-NEXT: or t3, s4, s2 +; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t2, t2, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a3, a3, a5 +; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a5, t2, t1 +; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: sd a3, 0(sp) ; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a5, 16(sp) -; RV64I-NEXT: sd a0, 24(sp) -; RV64I-NEXT: srli a0, a6, 3 +; RV64I-NEXT: sd a0, 16(sp) +; RV64I-NEXT: sd a5, 24(sp) +; RV64I-NEXT: srli a0, a1, 3 +; RV64I-NEXT: andi a3, a1, 63 ; RV64I-NEXT: andi a0, a0, 24 -; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: ld a1, 8(a0) -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: ld a4, 16(a0) -; RV64I-NEXT: ld a5, 24(a0) -; RV64I-NEXT: srl a0, a1, a6 -; RV64I-NEXT: andi a7, a6, 63 -; RV64I-NEXT: xori a7, a7, 63 -; RV64I-NEXT: slli t0, a4, 1 -; RV64I-NEXT: sll t0, t0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: srl a3, a3, a6 -; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: sll a1, a1, a7 -; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: srl a3, a4, a6 -; RV64I-NEXT: slli a4, a5, 1 -; RV64I-NEXT: sll a4, a4, a7 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: srl a4, a5, a6 -; RV64I-NEXT: srli a5, a4, 56 -; RV64I-NEXT: srli a6, a4, 48 -; RV64I-NEXT: srli a7, a4, 40 -; RV64I-NEXT: srli t0, a4, 32 -; RV64I-NEXT: sb t0, 28(a2) -; RV64I-NEXT: sb a7, 29(a2) -; RV64I-NEXT: sb a6, 30(a2) +; RV64I-NEXT: add a0, a6, a0 +; RV64I-NEXT: ld a4, 0(a0) +; RV64I-NEXT: ld a5, 8(a0) +; RV64I-NEXT: ld a6, 16(a0) +; RV64I-NEXT: xori a3, a3, 63 +; RV64I-NEXT: ld a0, 24(a0) +; RV64I-NEXT: srl a7, a5, a1 +; RV64I-NEXT: slli t0, a6, 1 +; RV64I-NEXT: srl a4, a4, a1 +; RV64I-NEXT: slli a5, a5, 1 +; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: slli t1, a0, 1 +; RV64I-NEXT: srl t2, a0, a1 +; RV64I-NEXT: sll a0, t0, a3 +; 
RV64I-NEXT: sll a1, a5, a3 +; RV64I-NEXT: sll a3, t1, a3 +; RV64I-NEXT: srli a5, t2, 56 +; RV64I-NEXT: srli t0, t2, 48 +; RV64I-NEXT: srli t1, t2, 40 +; RV64I-NEXT: srli t3, t2, 32 +; RV64I-NEXT: srli t4, t2, 24 +; RV64I-NEXT: srli t5, t2, 16 +; RV64I-NEXT: srli t6, t2, 8 +; RV64I-NEXT: or a0, a7, a0 +; RV64I-NEXT: or a1, a4, a1 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: sb t3, 28(a2) +; RV64I-NEXT: sb t1, 29(a2) +; RV64I-NEXT: sb t0, 30(a2) ; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: srli a6, a4, 16 -; RV64I-NEXT: srli a7, a4, 8 -; RV64I-NEXT: sb a4, 24(a2) -; RV64I-NEXT: sb a7, 25(a2) -; RV64I-NEXT: sb a6, 26(a2) -; RV64I-NEXT: sb a5, 27(a2) +; RV64I-NEXT: sb t2, 24(a2) +; RV64I-NEXT: sb t6, 25(a2) +; RV64I-NEXT: sb t5, 26(a2) +; RV64I-NEXT: sb t4, 27(a2) ; RV64I-NEXT: srli a4, a3, 56 ; RV64I-NEXT: srli a5, a3, 48 ; RV64I-NEXT: srli a6, a3, 40 ; RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srli t0, a3, 24 +; RV64I-NEXT: srli t1, a3, 16 +; RV64I-NEXT: srli t2, a3, 8 +; RV64I-NEXT: srli t3, a1, 56 +; RV64I-NEXT: srli t4, a1, 48 +; RV64I-NEXT: srli t5, a1, 40 +; RV64I-NEXT: srli t6, a1, 32 +; RV64I-NEXT: srli s0, a1, 24 +; RV64I-NEXT: srli s1, a1, 16 +; RV64I-NEXT: srli s2, a1, 8 +; RV64I-NEXT: srli s3, a0, 56 +; RV64I-NEXT: srli s4, a0, 48 +; RV64I-NEXT: srli s5, a0, 40 ; RV64I-NEXT: sb a7, 20(a2) ; RV64I-NEXT: sb a6, 21(a2) ; RV64I-NEXT: sb a5, 22(a2) ; RV64I-NEXT: sb a4, 23(a2) -; RV64I-NEXT: srli a4, a3, 24 -; RV64I-NEXT: srli a5, a3, 16 -; RV64I-NEXT: srli a6, a3, 8 +; RV64I-NEXT: srli a4, a0, 32 ; RV64I-NEXT: sb a3, 16(a2) -; RV64I-NEXT: sb a6, 17(a2) -; RV64I-NEXT: sb a5, 18(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: srli a3, a1, 56 -; RV64I-NEXT: srli a4, a1, 48 -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: srli a6, a1, 32 -; RV64I-NEXT: sb a6, 4(a2) -; RV64I-NEXT: sb a5, 5(a2) -; RV64I-NEXT: sb a4, 6(a2) -; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: sb t2, 17(a2) +; RV64I-NEXT: sb t1, 18(a2) +; RV64I-NEXT: sb t0, 19(a2) +; RV64I-NEXT: srli a3, a0, 24 +; RV64I-NEXT: sb t6, 4(a2) +; RV64I-NEXT: sb t5, 5(a2) +; RV64I-NEXT: sb t4, 6(a2) +; RV64I-NEXT: sb t3, 7(a2) +; RV64I-NEXT: srli a5, a0, 16 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: sb a4, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb s2, 1(a2) +; RV64I-NEXT: sb s1, 2(a2) +; RV64I-NEXT: sb s0, 3(a2) +; RV64I-NEXT: srli a1, a0, 8 +; RV64I-NEXT: sb a4, 12(a2) +; RV64I-NEXT: sb s5, 13(a2) +; RV64I-NEXT: sb s4, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a1, 11(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb a5, 10(a2) +; RV64I-NEXT: sb a3, 11(a2) +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 
96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) ; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s2, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a4, a7, a6 +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu ra, 22(a0) +; RV32I-NEXT: lbu a3, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a7, a6, a7 -; RV32I-NEXT: lbu a6, 18(a0) -; RV32I-NEXT: lbu t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a6, t2, a6 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: lbu a6, 22(a0) -; RV32I-NEXT: lbu t3, 
23(a0) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or a6, t3, a6 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t1, a6, t1 -; RV32I-NEXT: lbu a6, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or a6, t4, a6 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t2, a6, t2 -; RV32I-NEXT: lbu a6, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu t4, 1(a1) -; RV32I-NEXT: or a0, a0, t3 -; RV32I-NEXT: lbu t3, 2(a1) +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: lbu s1, 24(a0) +; RV32I-NEXT: lbu s3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or t3, s7, s6 +; RV32I-NEXT: lbu t6, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu s6, 31(a0) +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s9, s8 +; RV32I-NEXT: or s0, s11, s10 +; RV32I-NEXT: or s2, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s7, 1(a1) +; RV32I-NEXT: lbu s8, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or a6, t4, a6 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t3 -; RV32I-NEXT: or a6, a1, a6 -; RV32I-NEXT: sw zero, 48(sp) -; RV32I-NEXT: sw zero, 52(sp) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) -; RV32I-NEXT: sw zero, 32(sp) -; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 64(sp) +; RV32I-NEXT: sw zero, 68(sp) ; RV32I-NEXT: sw zero, 40(sp) ; RV32I-NEXT: sw zero, 44(sp) -; RV32I-NEXT: sw t0, 16(sp) -; RV32I-NEXT: sw t1, 20(sp) -; RV32I-NEXT: sw t2, 24(sp) -; RV32I-NEXT: sw a0, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a7, 12(sp) -; RV32I-NEXT: srli a0, a6, 3 -; RV32I-NEXT: andi a0, a0, 28 -; RV32I-NEXT: mv a1, sp -; RV32I-NEXT: add a3, a1, a0 -; RV32I-NEXT: lw a1, 4(a3) -; RV32I-NEXT: lw a4, 0(a3) -; RV32I-NEXT: lw a5, 8(a3) -; RV32I-NEXT: lw a7, 12(a3) -; RV32I-NEXT: srl a0, a1, a6 -; RV32I-NEXT: andi t0, a6, 31 -; RV32I-NEXT: xori t0, t0, 31 -; RV32I-NEXT: slli t1, a5, 1 -; RV32I-NEXT: sll t1, t1, t0 -; RV32I-NEXT: or a0, a0, t1 -; RV32I-NEXT: srl a4, a4, a6 -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: sll a1, a1, t0 -; RV32I-NEXT: or a1, a4, a1 -; RV32I-NEXT: srl a4, a7, a6 -; RV32I-NEXT: lw t1, 16(a3) -; RV32I-NEXT: lw t2, 20(a3) -; RV32I-NEXT: lw t3, 24(a3) -; RV32I-NEXT: lw t4, 28(a3) -; RV32I-NEXT: slli a3, t1, 1 -; RV32I-NEXT: sll a3, a3, t0 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: srl a4, a5, a6 +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: addi s3, sp, 8 +; RV32I-NEXT: slli t4, t4, 16 +; 
RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s4, t6 +; RV32I-NEXT: or t6, s6, s5 +; RV32I-NEXT: or a3, s7, a3 +; RV32I-NEXT: or a1, a1, s8 +; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, s4 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or t0, a0, t3 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, t4, s1 +; RV32I-NEXT: or t3, t6, t5 +; RV32I-NEXT: or a0, a1, a3 +; RV32I-NEXT: sw t0, 24(sp) +; RV32I-NEXT: sw t1, 28(sp) +; RV32I-NEXT: sw t2, 32(sp) +; RV32I-NEXT: sw t3, 36(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a6, 16(sp) +; RV32I-NEXT: sw a7, 20(sp) +; RV32I-NEXT: srli a1, a0, 3 +; RV32I-NEXT: andi a3, a0, 31 +; RV32I-NEXT: andi a4, a1, 28 +; RV32I-NEXT: xori a1, a3, 31 +; RV32I-NEXT: add a4, s3, a4 +; RV32I-NEXT: lw a3, 0(a4) +; RV32I-NEXT: lw a5, 4(a4) +; RV32I-NEXT: lw a6, 8(a4) +; RV32I-NEXT: lw a7, 12(a4) +; RV32I-NEXT: lw t0, 16(a4) +; RV32I-NEXT: lw t1, 20(a4) +; RV32I-NEXT: lw t2, 24(a4) +; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: srl t3, a5, a0 +; RV32I-NEXT: slli t4, a6, 1 +; RV32I-NEXT: srl a3, a3, a0 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl t5, a7, a0 +; RV32I-NEXT: slli t6, t0, 1 +; RV32I-NEXT: srl a6, a6, a0 ; RV32I-NEXT: slli a7, a7, 1 -; RV32I-NEXT: sll a5, a7, t0 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: srl a5, t2, a6 -; RV32I-NEXT: slli a7, t3, 1 -; RV32I-NEXT: sll a7, a7, t0 -; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: srl a7, t1, a6 -; RV32I-NEXT: slli t2, t2, 1 -; RV32I-NEXT: sll t1, t2, t0 -; RV32I-NEXT: or a7, a7, t1 -; RV32I-NEXT: srl t1, t3, a6 -; RV32I-NEXT: slli t2, t4, 1 -; RV32I-NEXT: sll t0, t2, t0 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: srl a6, t4, a6 -; RV32I-NEXT: srli t1, a6, 24 -; RV32I-NEXT: srli t2, a6, 16 -; RV32I-NEXT: srli t3, a6, 8 -; RV32I-NEXT: sb a6, 28(a2) -; RV32I-NEXT: sb t3, 29(a2) -; RV32I-NEXT: sb t2, 30(a2) -; RV32I-NEXT: sb t1, 31(a2) -; RV32I-NEXT: srli a6, t0, 24 -; RV32I-NEXT: srli t1, t0, 16 -; RV32I-NEXT: srli t2, t0, 8 -; RV32I-NEXT: sb t0, 24(a2) +; RV32I-NEXT: srl s0, t1, a0 +; RV32I-NEXT: slli s1, t2, 1 +; RV32I-NEXT: srl t0, t0, a0 +; RV32I-NEXT: slli t1, t1, 1 +; RV32I-NEXT: srl t2, t2, a0 +; RV32I-NEXT: slli s2, a4, 1 +; RV32I-NEXT: srl s3, a4, a0 +; RV32I-NEXT: sll a0, t4, a1 +; RV32I-NEXT: sll a4, a5, a1 +; RV32I-NEXT: sll a5, t6, a1 +; RV32I-NEXT: sll a7, a7, a1 +; RV32I-NEXT: sll t4, s1, a1 +; RV32I-NEXT: sll t1, t1, a1 +; RV32I-NEXT: sll t6, s2, a1 +; RV32I-NEXT: srli s1, s3, 24 +; RV32I-NEXT: srli s2, s3, 16 +; RV32I-NEXT: srli s4, s3, 8 +; RV32I-NEXT: or a0, t3, a0 +; RV32I-NEXT: or a1, a3, a4 +; RV32I-NEXT: or a3, t5, a5 +; RV32I-NEXT: or a4, a6, a7 +; RV32I-NEXT: or a5, s0, t4 +; RV32I-NEXT: or a6, t0, t1 +; RV32I-NEXT: or a7, t2, t6 +; RV32I-NEXT: sb s3, 28(a2) +; RV32I-NEXT: sb s4, 29(a2) +; RV32I-NEXT: sb s2, 30(a2) +; RV32I-NEXT: sb s1, 31(a2) +; RV32I-NEXT: srli t0, a7, 24 +; RV32I-NEXT: srli t1, a7, 16 +; RV32I-NEXT: srli t2, a7, 8 +; RV32I-NEXT: srli t3, a6, 24 +; RV32I-NEXT: srli t4, a6, 16 +; RV32I-NEXT: srli t5, a6, 8 +; RV32I-NEXT: srli t6, a5, 24 +; RV32I-NEXT: srli s0, a5, 16 +; RV32I-NEXT: srli s1, a5, 8 +; RV32I-NEXT: srli s2, a4, 24 +; RV32I-NEXT: srli s3, a4, 16 +; RV32I-NEXT: srli s4, a4, 8 +; RV32I-NEXT: srli s5, a3, 24 +; RV32I-NEXT: srli s6, a3, 
16 +; RV32I-NEXT: srli s7, a3, 8 +; RV32I-NEXT: srli s8, a1, 24 +; RV32I-NEXT: srli s9, a1, 16 +; RV32I-NEXT: sb a7, 24(a2) ; RV32I-NEXT: sb t2, 25(a2) ; RV32I-NEXT: sb t1, 26(a2) -; RV32I-NEXT: sb a6, 27(a2) -; RV32I-NEXT: srli a6, a7, 24 -; RV32I-NEXT: srli t0, a7, 16 -; RV32I-NEXT: srli t1, a7, 8 -; RV32I-NEXT: sb a7, 16(a2) -; RV32I-NEXT: sb t1, 17(a2) -; RV32I-NEXT: sb t0, 18(a2) -; RV32I-NEXT: sb a6, 19(a2) -; RV32I-NEXT: srli a6, a5, 24 -; RV32I-NEXT: srli a7, a5, 16 -; RV32I-NEXT: srli t0, a5, 8 +; RV32I-NEXT: sb t0, 27(a2) +; RV32I-NEXT: srli a7, a1, 8 +; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: sb t5, 17(a2) +; RV32I-NEXT: sb t4, 18(a2) +; RV32I-NEXT: sb t3, 19(a2) +; RV32I-NEXT: srli a6, a0, 24 ; RV32I-NEXT: sb a5, 20(a2) -; RV32I-NEXT: sb t0, 21(a2) -; RV32I-NEXT: sb a7, 22(a2) -; RV32I-NEXT: sb a6, 23(a2) -; RV32I-NEXT: srli a5, a4, 24 -; RV32I-NEXT: srli a6, a4, 16 -; RV32I-NEXT: srli a7, a4, 8 +; RV32I-NEXT: sb s1, 21(a2) +; RV32I-NEXT: sb s0, 22(a2) +; RV32I-NEXT: sb t6, 23(a2) +; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a7, 9(a2) -; RV32I-NEXT: sb a6, 10(a2) -; RV32I-NEXT: sb a5, 11(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: sb s4, 9(a2) +; RV32I-NEXT: sb s3, 10(a2) +; RV32I-NEXT: sb s2, 11(a2) +; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: sb a4, 15(a2) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: sb s7, 13(a2) +; RV32I-NEXT: sb s6, 14(a2) +; RV32I-NEXT: sb s5, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a4, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb a7, 1(a2) +; RV32I-NEXT: sb s9, 2(a2) +; RV32I-NEXT: sb s8, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 @@ -1723,420 +1775,472 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 
8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli s8, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) ; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu 
t0, 1(a1) -; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: lbu a6, 2(a1) -; RV64I-NEXT: lbu t1, 3(a1) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 4(a1) -; RV64I-NEXT: lbu t1, 5(a1) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 6(a1) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, a4, a3 +; RV64I-NEXT: or a6, a6, s8 +; RV64I-NEXT: or a3, t0, a7 +; RV64I-NEXT: or a4, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t3, s5, s4 +; RV64I-NEXT: or t4, s7, s6 +; RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu s6, 31(a0) +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: lbu s6, 5(a1) +; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli s6, s6, 8 +; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a7 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a6, a1, a6 +; RV64I-NEXT: or a1, a1, s7 ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: addi a6, sp, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: or t0, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t1, s0, t6 +; RV64I-NEXT: or t2, s5, s1 +; RV64I-NEXT: or t3, s4, s2 +; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t2, t2, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a3, a3, a5 +; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a5, t2, t1 +; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: sd a3, 32(sp) ; RV64I-NEXT: sd a4, 40(sp) -; RV64I-NEXT: sd a5, 48(sp) -; RV64I-NEXT: sd a0, 56(sp) -; RV64I-NEXT: srli a0, a6, 3 +; RV64I-NEXT: sd a0, 48(sp) +; RV64I-NEXT: sd a5, 56(sp) +; RV64I-NEXT: srli a0, a1, 3 +; RV64I-NEXT: andi a3, a1, 63 ; 
RV64I-NEXT: andi a0, a0, 24 -; RV64I-NEXT: addi a1, sp, 32 -; RV64I-NEXT: sub a1, a1, a0 -; RV64I-NEXT: ld a3, 8(a1) -; RV64I-NEXT: ld a4, 0(a1) -; RV64I-NEXT: ld a5, 16(a1) -; RV64I-NEXT: ld a1, 24(a1) -; RV64I-NEXT: sll a0, a3, a6 -; RV64I-NEXT: andi a7, a6, 63 -; RV64I-NEXT: xori a7, a7, 63 +; RV64I-NEXT: sub a0, a6, a0 +; RV64I-NEXT: ld a4, 0(a0) +; RV64I-NEXT: ld a5, 8(a0) +; RV64I-NEXT: ld a6, 16(a0) +; RV64I-NEXT: ld a0, 24(a0) +; RV64I-NEXT: xori a3, a3, 63 +; RV64I-NEXT: sll a7, a5, a1 ; RV64I-NEXT: srli t0, a4, 1 -; RV64I-NEXT: srl t0, t0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: sll a1, a1, a6 -; RV64I-NEXT: srli t0, a5, 1 -; RV64I-NEXT: srl t0, t0, a7 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: sll a5, a5, a6 -; RV64I-NEXT: srli a3, a3, 1 -; RV64I-NEXT: srl a3, a3, a7 -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: sll a4, a4, a6 +; RV64I-NEXT: sll t1, a0, a1 +; RV64I-NEXT: srli a0, a6, 1 +; RV64I-NEXT: sll a6, a6, a1 +; RV64I-NEXT: srli a5, a5, 1 +; RV64I-NEXT: sll a4, a4, a1 +; RV64I-NEXT: srl a1, t0, a3 +; RV64I-NEXT: srl t0, a0, a3 +; RV64I-NEXT: srl a3, a5, a3 ; RV64I-NEXT: srli a5, a4, 56 -; RV64I-NEXT: srli a6, a4, 48 -; RV64I-NEXT: srli a7, a4, 40 -; RV64I-NEXT: srli t0, a4, 32 -; RV64I-NEXT: sb t0, 4(a2) -; RV64I-NEXT: sb a7, 5(a2) -; RV64I-NEXT: sb a6, 6(a2) +; RV64I-NEXT: srli t2, a4, 48 +; RV64I-NEXT: srli t3, a4, 40 +; RV64I-NEXT: srli t4, a4, 32 +; RV64I-NEXT: srli t5, a4, 24 +; RV64I-NEXT: srli t6, a4, 16 +; RV64I-NEXT: srli s0, a4, 8 +; RV64I-NEXT: or a0, a7, a1 +; RV64I-NEXT: or a1, t1, t0 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: sb t4, 4(a2) +; RV64I-NEXT: sb t3, 5(a2) +; RV64I-NEXT: sb t2, 6(a2) ; RV64I-NEXT: sb a5, 7(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: srli a6, a4, 16 -; RV64I-NEXT: srli a7, a4, 8 ; RV64I-NEXT: sb a4, 0(a2) -; RV64I-NEXT: sb a7, 1(a2) -; RV64I-NEXT: sb a6, 2(a2) -; RV64I-NEXT: sb a5, 3(a2) +; RV64I-NEXT: sb s0, 1(a2) +; RV64I-NEXT: sb t6, 2(a2) +; RV64I-NEXT: sb t5, 3(a2) ; RV64I-NEXT: srli a4, a3, 56 ; RV64I-NEXT: srli a5, a3, 48 ; RV64I-NEXT: srli a6, a3, 40 ; RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srli t0, a3, 24 +; RV64I-NEXT: srli t1, a3, 16 +; RV64I-NEXT: srli t2, a3, 8 +; RV64I-NEXT: srli t3, a1, 56 +; RV64I-NEXT: srli t4, a1, 48 +; RV64I-NEXT: srli t5, a1, 40 +; RV64I-NEXT: srli t6, a1, 32 +; RV64I-NEXT: srli s0, a1, 24 +; RV64I-NEXT: srli s1, a1, 16 +; RV64I-NEXT: srli s2, a1, 8 +; RV64I-NEXT: srli s3, a0, 56 +; RV64I-NEXT: srli s4, a0, 48 +; RV64I-NEXT: srli s5, a0, 40 ; RV64I-NEXT: sb a7, 20(a2) ; RV64I-NEXT: sb a6, 21(a2) ; RV64I-NEXT: sb a5, 22(a2) ; RV64I-NEXT: sb a4, 23(a2) -; RV64I-NEXT: srli a4, a3, 24 -; RV64I-NEXT: srli a5, a3, 16 -; RV64I-NEXT: srli a6, a3, 8 +; RV64I-NEXT: srli a4, a0, 32 ; RV64I-NEXT: sb a3, 16(a2) -; RV64I-NEXT: sb a6, 17(a2) -; RV64I-NEXT: sb a5, 18(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: srli a3, a1, 56 -; RV64I-NEXT: srli a4, a1, 48 -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: srli a6, a1, 32 -; RV64I-NEXT: sb a6, 28(a2) -; RV64I-NEXT: sb a5, 29(a2) -; RV64I-NEXT: sb a4, 30(a2) -; RV64I-NEXT: sb a3, 31(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 +; RV64I-NEXT: sb t2, 17(a2) +; RV64I-NEXT: sb t1, 18(a2) +; RV64I-NEXT: sb t0, 19(a2) +; RV64I-NEXT: srli a3, a0, 24 +; RV64I-NEXT: sb t6, 28(a2) +; RV64I-NEXT: sb t5, 29(a2) +; RV64I-NEXT: sb t4, 30(a2) +; RV64I-NEXT: sb t3, 31(a2) +; RV64I-NEXT: srli a5, a0, 16 ; RV64I-NEXT: sb a1, 24(a2) -; RV64I-NEXT: sb a5, 25(a2) -; RV64I-NEXT: sb a4, 26(a2) -; RV64I-NEXT: sb a3, 
27(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: srli a1, a0, 24 -; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb s2, 25(a2) +; RV64I-NEXT: sb s1, 26(a2) +; RV64I-NEXT: sb s0, 27(a2) +; RV64I-NEXT: srli a1, a0, 8 +; RV64I-NEXT: sb a4, 12(a2) +; RV64I-NEXT: sb s5, 13(a2) +; RV64I-NEXT: sb s4, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a1, 11(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb a5, 10(a2) +; RV64I-NEXT: sb a3, 11(a2) +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) ; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s2, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: 
lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a4, a7, a6 +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu ra, 22(a0) +; RV32I-NEXT: lbu a3, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a6, a6, a7 -; RV32I-NEXT: lbu a7, 18(a0) -; RV32I-NEXT: lbu t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t2, a7 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or t0, a7, t0 -; RV32I-NEXT: lbu a7, 22(a0) -; RV32I-NEXT: lbu t3, 23(a0) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or a7, t3, a7 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t1, a7, t1 -; RV32I-NEXT: lbu a7, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or a7, t4, a7 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t2, a7, t2 -; RV32I-NEXT: lbu a7, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a7 -; RV32I-NEXT: lbu a7, 0(a1) -; RV32I-NEXT: lbu t4, 1(a1) -; RV32I-NEXT: or a0, a0, t3 -; RV32I-NEXT: lbu t3, 2(a1) +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: lbu s1, 24(a0) +; RV32I-NEXT: lbu s3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or t3, s7, s6 +; RV32I-NEXT: lbu t6, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu s6, 31(a0) +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s9, s8 +; RV32I-NEXT: or s0, s11, s10 +; RV32I-NEXT: or s2, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s7, 1(a1) +; RV32I-NEXT: lbu s8, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or a7, t4, a7 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t3 -; RV32I-NEXT: or a7, a1, a7 -; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: sw zero, 0(sp) -; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw zero, 36(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: sw t0, 
48(sp) -; RV32I-NEXT: sw t1, 52(sp) -; RV32I-NEXT: sw t2, 56(sp) -; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a3, 32(sp) -; RV32I-NEXT: sw a4, 36(sp) -; RV32I-NEXT: sw a5, 40(sp) -; RV32I-NEXT: sw a6, 44(sp) -; RV32I-NEXT: srli a0, a7, 3 -; RV32I-NEXT: andi a0, a0, 28 -; RV32I-NEXT: addi a1, sp, 32 -; RV32I-NEXT: sub a4, a1, a0 -; RV32I-NEXT: lw a3, 4(a4) -; RV32I-NEXT: lw a5, 0(a4) -; RV32I-NEXT: lw a6, 8(a4) -; RV32I-NEXT: lw t0, 12(a4) -; RV32I-NEXT: sll a0, a3, a7 -; RV32I-NEXT: andi a1, a7, 31 -; RV32I-NEXT: xori t1, a1, 31 -; RV32I-NEXT: srli a1, a5, 1 -; RV32I-NEXT: srl a1, a1, t1 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: sll a1, t0, a7 -; RV32I-NEXT: srli t2, a6, 1 -; RV32I-NEXT: srl t2, t2, t1 -; RV32I-NEXT: or a1, a1, t2 -; RV32I-NEXT: sll a6, a6, a7 -; RV32I-NEXT: srli a3, a3, 1 -; RV32I-NEXT: srl a3, a3, t1 -; RV32I-NEXT: lw t2, 16(a4) -; RV32I-NEXT: lw t3, 20(a4) -; RV32I-NEXT: or a3, a6, a3 -; RV32I-NEXT: lw a6, 24(a4) -; RV32I-NEXT: lw t4, 28(a4) -; RV32I-NEXT: sll a4, t3, a7 -; RV32I-NEXT: srli t5, t2, 1 -; RV32I-NEXT: srl t5, t5, t1 -; RV32I-NEXT: or a4, a4, t5 -; RV32I-NEXT: sll t2, t2, a7 -; RV32I-NEXT: srli t0, t0, 1 -; RV32I-NEXT: srl t0, t0, t1 -; RV32I-NEXT: or t0, t2, t0 -; RV32I-NEXT: sll t2, t4, a7 -; RV32I-NEXT: srli t4, a6, 1 -; RV32I-NEXT: srl t4, t4, t1 -; RV32I-NEXT: or t2, t2, t4 -; RV32I-NEXT: sll a6, a6, a7 -; RV32I-NEXT: srli t3, t3, 1 -; RV32I-NEXT: srl t1, t3, t1 -; RV32I-NEXT: or a6, a6, t1 -; RV32I-NEXT: sll a5, a5, a7 -; RV32I-NEXT: srli a7, a5, 24 -; RV32I-NEXT: srli t1, a5, 16 -; RV32I-NEXT: srli t3, a5, 8 -; RV32I-NEXT: sb a5, 0(a2) -; RV32I-NEXT: sb t3, 1(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb a7, 3(a2) -; RV32I-NEXT: srli a5, a6, 24 -; RV32I-NEXT: srli a7, a6, 16 -; RV32I-NEXT: srli t1, a6, 8 -; RV32I-NEXT: sb a6, 24(a2) -; RV32I-NEXT: sb t1, 25(a2) -; RV32I-NEXT: sb a7, 26(a2) -; RV32I-NEXT: sb a5, 27(a2) -; RV32I-NEXT: srli a5, t2, 24 -; RV32I-NEXT: srli a6, t2, 16 -; RV32I-NEXT: srli a7, t2, 8 -; RV32I-NEXT: sb t2, 28(a2) -; RV32I-NEXT: sb a7, 29(a2) -; RV32I-NEXT: sb a6, 30(a2) -; RV32I-NEXT: sb a5, 31(a2) -; RV32I-NEXT: srli a5, t0, 24 -; RV32I-NEXT: srli a6, t0, 16 -; RV32I-NEXT: srli a7, t0, 8 -; RV32I-NEXT: sb t0, 16(a2) -; RV32I-NEXT: sb a7, 17(a2) -; RV32I-NEXT: sb a6, 18(a2) -; RV32I-NEXT: sb a5, 19(a2) -; RV32I-NEXT: srli a5, a4, 24 -; RV32I-NEXT: srli a6, a4, 16 -; RV32I-NEXT: srli a7, a4, 8 +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: addi s3, sp, 40 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s4, t6 +; RV32I-NEXT: or t6, s6, s5 +; RV32I-NEXT: or a3, s7, a3 +; RV32I-NEXT: or a1, a1, s8 +; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, s4 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or t0, a0, t3 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, t4, s1 +; RV32I-NEXT: or t3, t6, t5 +; RV32I-NEXT: or a0, a1, a3 +; RV32I-NEXT: sw t0, 56(sp) +; RV32I-NEXT: sw t1, 60(sp) +; RV32I-NEXT: sw t2, 64(sp) +; RV32I-NEXT: sw t3, 68(sp) +; RV32I-NEXT: sw a4, 40(sp) +; RV32I-NEXT: sw a5, 44(sp) +; RV32I-NEXT: sw a6, 48(sp) +; RV32I-NEXT: sw a7, 52(sp) +; RV32I-NEXT: srli a1, a0, 3 +; RV32I-NEXT: andi a3, a0, 31 
+; RV32I-NEXT: andi a4, a1, 28 +; RV32I-NEXT: xori a1, a3, 31 +; RV32I-NEXT: sub a3, s3, a4 +; RV32I-NEXT: lw a4, 0(a3) +; RV32I-NEXT: lw a5, 4(a3) +; RV32I-NEXT: lw a6, 8(a3) +; RV32I-NEXT: lw a7, 12(a3) +; RV32I-NEXT: lw t0, 16(a3) +; RV32I-NEXT: lw t1, 20(a3) +; RV32I-NEXT: lw t2, 24(a3) +; RV32I-NEXT: lw a3, 28(a3) +; RV32I-NEXT: sll t3, a5, a0 +; RV32I-NEXT: srli t4, a4, 1 +; RV32I-NEXT: sll t5, a7, a0 +; RV32I-NEXT: srli t6, a6, 1 +; RV32I-NEXT: sll a6, a6, a0 +; RV32I-NEXT: srli a5, a5, 1 +; RV32I-NEXT: sll s0, t1, a0 +; RV32I-NEXT: srli s1, t0, 1 +; RV32I-NEXT: sll t0, t0, a0 +; RV32I-NEXT: srli a7, a7, 1 +; RV32I-NEXT: sll s2, a3, a0 +; RV32I-NEXT: srli a3, t2, 1 +; RV32I-NEXT: sll t2, t2, a0 +; RV32I-NEXT: srli t1, t1, 1 +; RV32I-NEXT: sll s3, a4, a0 +; RV32I-NEXT: srl a0, t4, a1 +; RV32I-NEXT: srl a4, t6, a1 +; RV32I-NEXT: srl a5, a5, a1 +; RV32I-NEXT: srl t4, s1, a1 +; RV32I-NEXT: srl a7, a7, a1 +; RV32I-NEXT: srl t6, a3, a1 +; RV32I-NEXT: srl t1, t1, a1 +; RV32I-NEXT: srli s1, s3, 24 +; RV32I-NEXT: srli s4, s3, 16 +; RV32I-NEXT: srli s5, s3, 8 +; RV32I-NEXT: or a0, t3, a0 +; RV32I-NEXT: or a1, t5, a4 +; RV32I-NEXT: or a3, a6, a5 +; RV32I-NEXT: or a4, s0, t4 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, s2, t6 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: sb s3, 0(a2) +; RV32I-NEXT: sb s5, 1(a2) +; RV32I-NEXT: sb s4, 2(a2) +; RV32I-NEXT: sb s1, 3(a2) +; RV32I-NEXT: srli t0, a7, 24 +; RV32I-NEXT: srli t1, a7, 16 +; RV32I-NEXT: srli t2, a7, 8 +; RV32I-NEXT: srli t3, a6, 24 +; RV32I-NEXT: srli t4, a6, 16 +; RV32I-NEXT: srli t5, a6, 8 +; RV32I-NEXT: srli t6, a5, 24 +; RV32I-NEXT: srli s0, a5, 16 +; RV32I-NEXT: srli s1, a5, 8 +; RV32I-NEXT: srli s2, a4, 24 +; RV32I-NEXT: srli s3, a4, 16 +; RV32I-NEXT: srli s4, a4, 8 +; RV32I-NEXT: srli s5, a3, 24 +; RV32I-NEXT: srli s6, a3, 16 +; RV32I-NEXT: srli s7, a3, 8 +; RV32I-NEXT: srli s8, a1, 24 +; RV32I-NEXT: srli s9, a1, 16 +; RV32I-NEXT: sb a7, 24(a2) +; RV32I-NEXT: sb t2, 25(a2) +; RV32I-NEXT: sb t1, 26(a2) +; RV32I-NEXT: sb t0, 27(a2) +; RV32I-NEXT: srli a7, a1, 8 +; RV32I-NEXT: sb a6, 28(a2) +; RV32I-NEXT: sb t5, 29(a2) +; RV32I-NEXT: sb t4, 30(a2) +; RV32I-NEXT: sb t3, 31(a2) +; RV32I-NEXT: srli a6, a0, 24 +; RV32I-NEXT: sb a5, 16(a2) +; RV32I-NEXT: sb s1, 17(a2) +; RV32I-NEXT: sb s0, 18(a2) +; RV32I-NEXT: sb t6, 19(a2) +; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 20(a2) -; RV32I-NEXT: sb a7, 21(a2) -; RV32I-NEXT: sb a6, 22(a2) -; RV32I-NEXT: sb a5, 23(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: sb s4, 21(a2) +; RV32I-NEXT: sb s3, 22(a2) +; RV32I-NEXT: sb s2, 23(a2) +; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 8(a2) -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: sb a4, 11(a2) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: sb s7, 9(a2) +; RV32I-NEXT: sb s6, 10(a2) +; RV32I-NEXT: sb s5, 11(a2) ; RV32I-NEXT: sb a1, 12(a2) -; RV32I-NEXT: sb a5, 13(a2) -; RV32I-NEXT: sb a4, 14(a2) -; RV32I-NEXT: sb a3, 15(a2) -; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb a7, 13(a2) +; RV32I-NEXT: sb s9, 14(a2) +; RV32I-NEXT: sb s8, 15(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded 
Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 @@ -2147,422 +2251,474 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: lbu a5, 8(a0) -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a7, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a6, 12(a0) -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: lbu a6, 16(a0) -; RV64I-NEXT: lbu a7, 17(a0) -; RV64I-NEXT: or a4, 
a5, a4 -; RV64I-NEXT: lbu a5, 18(a0) -; RV64I-NEXT: lbu t0, 19(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu a7, 20(a0) -; RV64I-NEXT: lbu t0, 21(a0) -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 22(a0) -; RV64I-NEXT: lbu t1, 23(a0) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu t1, 27(a0) ; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: lbu t0, 28(a0) -; RV64I-NEXT: lbu t1, 29(a0) -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: slli a7, a0, 32 -; RV64I-NEXT: lbu t0, 0(a1) -; RV64I-NEXT: lbu t1, 1(a1) -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 2(a1) -; RV64I-NEXT: lbu t2, 3(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or a7, t2, a7 -; RV64I-NEXT: lbu t1, 4(a1) -; RV64I-NEXT: lbu t2, 5(a1) -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t3, s5, s4 +; RV64I-NEXT: or t4, s7, s6 +; RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu s6, 31(a0) +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: lbu s6, 5(a1) +; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli s6, 
s6, 8 +; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, s7 +; RV64I-NEXT: mv s6, sp +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t0, s0, t6 +; RV64I-NEXT: or t1, s5, s1 +; RV64I-NEXT: or t2, s4, s2 +; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t3, t1, 32 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a7, a1, a7 -; RV64I-NEXT: sraiw a0, a0, 31 -; RV64I-NEXT: sd a0, 32(sp) -; RV64I-NEXT: sd a0, 40(sp) -; RV64I-NEXT: sd a0, 48(sp) -; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: sraiw t1, t1, 31 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, t3, t0 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: sd t1, 32(sp) +; RV64I-NEXT: sd t1, 40(sp) +; RV64I-NEXT: sd t1, 48(sp) +; RV64I-NEXT: sd t1, 56(sp) ; RV64I-NEXT: sd a3, 0(sp) ; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a5, 16(sp) -; RV64I-NEXT: sd a6, 24(sp) -; RV64I-NEXT: srli a0, a7, 3 +; RV64I-NEXT: sd a0, 16(sp) +; RV64I-NEXT: sd a5, 24(sp) +; RV64I-NEXT: srli a0, a1, 3 +; RV64I-NEXT: andi a3, a1, 63 ; RV64I-NEXT: andi a0, a0, 24 -; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: ld a1, 8(a0) -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: ld a4, 16(a0) -; RV64I-NEXT: ld a5, 24(a0) -; RV64I-NEXT: srl a0, a1, a7 -; RV64I-NEXT: andi a6, a7, 63 -; RV64I-NEXT: xori a6, a6, 63 -; RV64I-NEXT: slli t0, a4, 1 -; RV64I-NEXT: sll t0, t0, a6 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: srl a3, a3, a7 -; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: sll a1, a1, a6 -; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: srl a3, a4, a7 -; RV64I-NEXT: slli a4, a5, 1 -; RV64I-NEXT: sll a4, a4, a6 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: sra a4, a5, a7 -; RV64I-NEXT: srli a5, a4, 56 -; RV64I-NEXT: srli a6, a4, 48 -; RV64I-NEXT: srli a7, a4, 40 -; RV64I-NEXT: srli t0, a4, 32 -; RV64I-NEXT: sb t0, 28(a2) -; RV64I-NEXT: sb a7, 29(a2) -; RV64I-NEXT: sb a6, 30(a2) +; RV64I-NEXT: add a0, s6, a0 +; RV64I-NEXT: ld a4, 0(a0) +; RV64I-NEXT: ld a5, 8(a0) +; RV64I-NEXT: ld a6, 16(a0) +; RV64I-NEXT: xori a3, a3, 63 +; RV64I-NEXT: ld a0, 24(a0) +; RV64I-NEXT: srl a7, a5, a1 +; RV64I-NEXT: slli t0, a6, 1 +; RV64I-NEXT: srl a4, a4, a1 +; RV64I-NEXT: slli a5, a5, 1 +; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: slli t1, a0, 1 +; RV64I-NEXT: sra t2, a0, a1 +; RV64I-NEXT: sll a0, t0, a3 +; RV64I-NEXT: sll a1, a5, a3 +; RV64I-NEXT: sll a3, t1, a3 +; RV64I-NEXT: srli a5, t2, 56 +; RV64I-NEXT: srli t0, t2, 48 +; RV64I-NEXT: srli t1, t2, 40 +; RV64I-NEXT: srli t3, t2, 32 +; RV64I-NEXT: srli t4, t2, 24 +; RV64I-NEXT: srli t5, t2, 16 +; RV64I-NEXT: srli t6, t2, 8 +; RV64I-NEXT: or a0, a7, a0 +; RV64I-NEXT: or a1, a4, a1 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: sb t3, 28(a2) +; RV64I-NEXT: sb t1, 29(a2) +; RV64I-NEXT: sb t0, 30(a2) ; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: srli a6, a4, 16 -; RV64I-NEXT: srli a7, a4, 8 -; RV64I-NEXT: sb a4, 24(a2) -; RV64I-NEXT: sb a7, 25(a2) -; RV64I-NEXT: sb a6, 26(a2) -; RV64I-NEXT: sb a5, 27(a2) +; RV64I-NEXT: sb t2, 24(a2) +; RV64I-NEXT: sb t6, 25(a2) +; RV64I-NEXT: sb t5, 26(a2) +; RV64I-NEXT: sb t4, 27(a2) ; RV64I-NEXT: srli a4, a3, 56 ; RV64I-NEXT: srli a5, a3, 48 ; RV64I-NEXT: srli a6, a3, 40 ; 
RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srli t0, a3, 24 +; RV64I-NEXT: srli t1, a3, 16 +; RV64I-NEXT: srli t2, a3, 8 +; RV64I-NEXT: srli t3, a1, 56 +; RV64I-NEXT: srli t4, a1, 48 +; RV64I-NEXT: srli t5, a1, 40 +; RV64I-NEXT: srli t6, a1, 32 +; RV64I-NEXT: srli s0, a1, 24 +; RV64I-NEXT: srli s1, a1, 16 +; RV64I-NEXT: srli s2, a1, 8 +; RV64I-NEXT: srli s3, a0, 56 +; RV64I-NEXT: srli s4, a0, 48 +; RV64I-NEXT: srli s5, a0, 40 +; RV64I-NEXT: srli s6, a0, 32 ; RV64I-NEXT: sb a7, 20(a2) ; RV64I-NEXT: sb a6, 21(a2) ; RV64I-NEXT: sb a5, 22(a2) ; RV64I-NEXT: sb a4, 23(a2) -; RV64I-NEXT: srli a4, a3, 24 -; RV64I-NEXT: srli a5, a3, 16 -; RV64I-NEXT: srli a6, a3, 8 +; RV64I-NEXT: srli a4, a0, 24 ; RV64I-NEXT: sb a3, 16(a2) -; RV64I-NEXT: sb a6, 17(a2) -; RV64I-NEXT: sb a5, 18(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: srli a3, a1, 56 -; RV64I-NEXT: srli a4, a1, 48 -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: srli a6, a1, 32 -; RV64I-NEXT: sb a6, 4(a2) -; RV64I-NEXT: sb a5, 5(a2) -; RV64I-NEXT: sb a4, 6(a2) -; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: srli a5, a1, 8 -; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: sb a4, 2(a2) -; RV64I-NEXT: sb a3, 3(a2) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb t2, 17(a2) +; RV64I-NEXT: sb t1, 18(a2) +; RV64I-NEXT: sb t0, 19(a2) ; RV64I-NEXT: srli a3, a0, 16 -; RV64I-NEXT: srli a4, a0, 8 +; RV64I-NEXT: sb t6, 4(a2) +; RV64I-NEXT: sb t5, 5(a2) +; RV64I-NEXT: sb t4, 6(a2) +; RV64I-NEXT: sb t3, 7(a2) +; RV64I-NEXT: srli a5, a0, 8 +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: sb s2, 1(a2) +; RV64I-NEXT: sb s1, 2(a2) +; RV64I-NEXT: sb s0, 3(a2) +; RV64I-NEXT: sb s6, 12(a2) +; RV64I-NEXT: sb s5, 13(a2) +; RV64I-NEXT: sb s4, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a4, 9(a2) +; RV64I-NEXT: sb a5, 9(a2) ; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a1, 11(a2) -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: sb a4, 11(a2) +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 116(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 112(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 108(sp) # 4-byte Folded 
Spill +; RV32I-NEXT: sw s4, 104(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 96(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) ; RV32I-NEXT: lbu a5, 4(a0) -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a7, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: lbu a5, 10(a0) -; RV32I-NEXT: lbu t0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: lbu a7, 12(a0) -; RV32I-NEXT: lbu t0, 13(a0) -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: lbu a6, 14(a0) -; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a4, a7, a6 +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu t1, 17(a0) -; RV32I-NEXT: or a7, a6, a7 -; RV32I-NEXT: lbu a6, 18(a0) -; RV32I-NEXT: lbu t2, 19(a0) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a6, t2, a6 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: lbu a6, 22(a0) -; RV32I-NEXT: lbu t3, 23(a0) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or a6, t3, a6 -; RV32I-NEXT: lbu t2, 24(a0) -; RV32I-NEXT: lbu t3, 25(a0) -; RV32I-NEXT: or t1, a6, t1 -; RV32I-NEXT: lbu a6, 26(a0) -; RV32I-NEXT: lbu t4, 27(a0) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or a6, t4, a6 -; RV32I-NEXT: lbu t3, 28(a0) -; RV32I-NEXT: lbu t4, 29(a0) -; RV32I-NEXT: or t2, a6, t2 -; RV32I-NEXT: lbu a6, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) ; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a6, a0, a6 -; RV32I-NEXT: lbu t4, 0(a1) -; RV32I-NEXT: lbu t5, 1(a1) -; RV32I-NEXT: or t3, a6, t3 -; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, 
t6, 24 +; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: lbu ra, 24(a0) +; RV32I-NEXT: lbu a3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: lbu t6, 28(a0) +; RV32I-NEXT: lbu s0, 29(a0) +; RV32I-NEXT: lbu s1, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: or s2, s7, s6 +; RV32I-NEXT: or s3, s9, s8 +; RV32I-NEXT: or s4, s11, s10 +; RV32I-NEXT: lbu s5, 0(a1) +; RV32I-NEXT: lbu s6, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, ra +; RV32I-NEXT: addi s8, sp, 8 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s7, s7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: or a6, a1, t4 -; RV32I-NEXT: srai a0, a0, 31 -; RV32I-NEXT: sw a0, 48(sp) -; RV32I-NEXT: sw a0, 52(sp) -; RV32I-NEXT: sw a0, 56(sp) -; RV32I-NEXT: sw a0, 60(sp) -; RV32I-NEXT: sw a0, 32(sp) -; RV32I-NEXT: sw a0, 36(sp) -; RV32I-NEXT: sw a0, 40(sp) -; RV32I-NEXT: sw a0, 44(sp) -; RV32I-NEXT: sw t0, 16(sp) -; RV32I-NEXT: sw t1, 20(sp) -; RV32I-NEXT: sw t2, 24(sp) -; RV32I-NEXT: sw t3, 28(sp) -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a7, 12(sp) -; RV32I-NEXT: srli a0, a6, 3 -; RV32I-NEXT: andi a0, a0, 28 -; RV32I-NEXT: mv a1, sp -; RV32I-NEXT: add a3, a1, a0 -; RV32I-NEXT: lw a1, 4(a3) -; RV32I-NEXT: lw a4, 0(a3) -; RV32I-NEXT: lw a5, 8(a3) -; RV32I-NEXT: lw a7, 12(a3) -; RV32I-NEXT: srl a0, a1, a6 -; RV32I-NEXT: andi t0, a6, 31 -; RV32I-NEXT: xori t0, t0, 31 -; RV32I-NEXT: slli t1, a5, 1 -; RV32I-NEXT: sll t1, t1, t0 -; RV32I-NEXT: or a0, a0, t1 -; RV32I-NEXT: srl a4, a4, a6 -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: sll a1, a1, t0 -; RV32I-NEXT: or a1, a4, a1 -; RV32I-NEXT: srl a4, a7, a6 -; RV32I-NEXT: lw t1, 16(a3) -; RV32I-NEXT: lw t2, 20(a3) -; RV32I-NEXT: lw t3, 24(a3) -; RV32I-NEXT: lw t4, 28(a3) -; RV32I-NEXT: slli a3, t1, 1 -; RV32I-NEXT: sll a3, a3, t0 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: srl a4, a5, a6 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or s1, a0, s1 +; RV32I-NEXT: or t6, s6, s5 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: srai s0, a0, 31 +; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, a0 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or t0, s2, t3 +; RV32I-NEXT: or t1, s4, s3 +; RV32I-NEXT: or a3, t4, a3 +; RV32I-NEXT: or t2, s1, t5 +; RV32I-NEXT: or a0, a1, t6 +; RV32I-NEXT: sw s0, 56(sp) +; RV32I-NEXT: sw s0, 60(sp) +; RV32I-NEXT: sw s0, 64(sp) +; RV32I-NEXT: sw s0, 68(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s0, 44(sp) +; RV32I-NEXT: sw s0, 48(sp) +; RV32I-NEXT: sw s0, 52(sp) +; RV32I-NEXT: sw t0, 24(sp) +; RV32I-NEXT: sw t1, 28(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: sw t2, 36(sp) +; RV32I-NEXT: 
sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a6, 16(sp) +; RV32I-NEXT: sw a7, 20(sp) +; RV32I-NEXT: srli a1, a0, 3 +; RV32I-NEXT: andi a3, a0, 31 +; RV32I-NEXT: andi a4, a1, 28 +; RV32I-NEXT: xori a1, a3, 31 +; RV32I-NEXT: add a4, s8, a4 +; RV32I-NEXT: lw a3, 0(a4) +; RV32I-NEXT: lw a5, 4(a4) +; RV32I-NEXT: lw a6, 8(a4) +; RV32I-NEXT: lw a7, 12(a4) +; RV32I-NEXT: lw t0, 16(a4) +; RV32I-NEXT: lw t1, 20(a4) +; RV32I-NEXT: lw t2, 24(a4) +; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: srl t3, a5, a0 +; RV32I-NEXT: slli t4, a6, 1 +; RV32I-NEXT: srl a3, a3, a0 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl t5, a7, a0 +; RV32I-NEXT: slli t6, t0, 1 +; RV32I-NEXT: srl a6, a6, a0 ; RV32I-NEXT: slli a7, a7, 1 -; RV32I-NEXT: sll a5, a7, t0 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: srl a5, t2, a6 -; RV32I-NEXT: slli a7, t3, 1 -; RV32I-NEXT: sll a7, a7, t0 -; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: srl a7, t1, a6 -; RV32I-NEXT: slli t2, t2, 1 -; RV32I-NEXT: sll t1, t2, t0 -; RV32I-NEXT: or a7, a7, t1 -; RV32I-NEXT: srl t1, t3, a6 -; RV32I-NEXT: slli t2, t4, 1 -; RV32I-NEXT: sll t0, t2, t0 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: sra a6, t4, a6 -; RV32I-NEXT: srli t1, a6, 24 -; RV32I-NEXT: srli t2, a6, 16 -; RV32I-NEXT: srli t3, a6, 8 -; RV32I-NEXT: sb a6, 28(a2) -; RV32I-NEXT: sb t3, 29(a2) -; RV32I-NEXT: sb t2, 30(a2) -; RV32I-NEXT: sb t1, 31(a2) -; RV32I-NEXT: srli a6, t0, 24 -; RV32I-NEXT: srli t1, t0, 16 -; RV32I-NEXT: srli t2, t0, 8 -; RV32I-NEXT: sb t0, 24(a2) +; RV32I-NEXT: srl s0, t1, a0 +; RV32I-NEXT: slli s1, t2, 1 +; RV32I-NEXT: srl t0, t0, a0 +; RV32I-NEXT: slli t1, t1, 1 +; RV32I-NEXT: srl t2, t2, a0 +; RV32I-NEXT: slli s2, a4, 1 +; RV32I-NEXT: sra s3, a4, a0 +; RV32I-NEXT: sll a0, t4, a1 +; RV32I-NEXT: sll a4, a5, a1 +; RV32I-NEXT: sll a5, t6, a1 +; RV32I-NEXT: sll a7, a7, a1 +; RV32I-NEXT: sll t4, s1, a1 +; RV32I-NEXT: sll t1, t1, a1 +; RV32I-NEXT: sll t6, s2, a1 +; RV32I-NEXT: srli s1, s3, 24 +; RV32I-NEXT: srli s2, s3, 16 +; RV32I-NEXT: srli s4, s3, 8 +; RV32I-NEXT: or a0, t3, a0 +; RV32I-NEXT: or a1, a3, a4 +; RV32I-NEXT: or a3, t5, a5 +; RV32I-NEXT: or a4, a6, a7 +; RV32I-NEXT: or a5, s0, t4 +; RV32I-NEXT: or a6, t0, t1 +; RV32I-NEXT: or a7, t2, t6 +; RV32I-NEXT: sb s3, 28(a2) +; RV32I-NEXT: sb s4, 29(a2) +; RV32I-NEXT: sb s2, 30(a2) +; RV32I-NEXT: sb s1, 31(a2) +; RV32I-NEXT: srli t0, a7, 24 +; RV32I-NEXT: srli t1, a7, 16 +; RV32I-NEXT: srli t2, a7, 8 +; RV32I-NEXT: srli t3, a6, 24 +; RV32I-NEXT: srli t4, a6, 16 +; RV32I-NEXT: srli t5, a6, 8 +; RV32I-NEXT: srli t6, a5, 24 +; RV32I-NEXT: srli s0, a5, 16 +; RV32I-NEXT: srli s1, a5, 8 +; RV32I-NEXT: srli s2, a4, 24 +; RV32I-NEXT: srli s3, a4, 16 +; RV32I-NEXT: srli s4, a4, 8 +; RV32I-NEXT: srli s5, a3, 24 +; RV32I-NEXT: srli s6, a3, 16 +; RV32I-NEXT: srli s7, a3, 8 +; RV32I-NEXT: srli s8, a1, 24 +; RV32I-NEXT: srli s9, a1, 16 +; RV32I-NEXT: sb a7, 24(a2) ; RV32I-NEXT: sb t2, 25(a2) ; RV32I-NEXT: sb t1, 26(a2) -; RV32I-NEXT: sb a6, 27(a2) -; RV32I-NEXT: srli a6, a7, 24 -; RV32I-NEXT: srli t0, a7, 16 -; RV32I-NEXT: srli t1, a7, 8 -; RV32I-NEXT: sb a7, 16(a2) -; RV32I-NEXT: sb t1, 17(a2) -; RV32I-NEXT: sb t0, 18(a2) -; RV32I-NEXT: sb a6, 19(a2) -; RV32I-NEXT: srli a6, a5, 24 -; RV32I-NEXT: srli a7, a5, 16 -; RV32I-NEXT: srli t0, a5, 8 +; RV32I-NEXT: sb t0, 27(a2) +; RV32I-NEXT: srli a7, a1, 8 +; RV32I-NEXT: sb a6, 16(a2) +; RV32I-NEXT: sb t5, 17(a2) +; RV32I-NEXT: sb t4, 18(a2) +; RV32I-NEXT: sb t3, 19(a2) +; RV32I-NEXT: srli a6, a0, 24 ; RV32I-NEXT: sb a5, 20(a2) -; RV32I-NEXT: sb t0, 21(a2) -; RV32I-NEXT: sb a7, 22(a2) 
-; RV32I-NEXT: sb a6, 23(a2) -; RV32I-NEXT: srli a5, a4, 24 -; RV32I-NEXT: srli a6, a4, 16 -; RV32I-NEXT: srli a7, a4, 8 +; RV32I-NEXT: sb s1, 21(a2) +; RV32I-NEXT: sb s0, 22(a2) +; RV32I-NEXT: sb t6, 23(a2) +; RV32I-NEXT: srli a5, a0, 16 ; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a7, 9(a2) -; RV32I-NEXT: sb a6, 10(a2) -; RV32I-NEXT: sb a5, 11(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: sb s4, 9(a2) +; RV32I-NEXT: sb s3, 10(a2) +; RV32I-NEXT: sb s2, 11(a2) +; RV32I-NEXT: srli a4, a0, 8 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a6, 13(a2) -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: sb a4, 15(a2) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: srli a4, a1, 16 -; RV32I-NEXT: srli a5, a1, 8 +; RV32I-NEXT: sb s7, 13(a2) +; RV32I-NEXT: sb s6, 14(a2) +; RV32I-NEXT: sb s5, 15(a2) ; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: sb a4, 2(a2) -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: sb a7, 1(a2) +; RV32I-NEXT: sb s9, 2(a2) +; RV32I-NEXT: sb s8, 3(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb a6, 7(a2) +; RV32I-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 116(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 112(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 104(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 96(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index 572b74cc2499f..c0cbbb3ff9389 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -247,13 +247,13 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32: # %bb.0: # %entry ; RV32-NEXT: add a5, a1, a3 ; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: xor a3, a1, a3 ; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: not a3, a3 ; RV32-NEXT: add a5, a5, a0 -; RV32-NEXT: xor a0, a1, a5 -; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: not a1, a1 -; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: slti a0, a0, 0 +; RV32-NEXT: xor a1, a1, a5 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: slti a0, a1, 0 ; RV32-NEXT: sw a2, 0(a4) ; RV32-NEXT: sw a5, 4(a4) ; RV32-NEXT: ret @@ -271,13 +271,13 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: add a5, a1, a3 ; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: xor a3, a1, a3 ; RV32ZBA-NEXT: sltu a0, a2, a0 +; RV32ZBA-NEXT: not a3, a3 ; RV32ZBA-NEXT: add a5, a5, a0 -; RV32ZBA-NEXT: xor a0, a1, a5 -; RV32ZBA-NEXT: xor a1, a1, a3 -; RV32ZBA-NEXT: not a1, a1 -; RV32ZBA-NEXT: and a0, a1, a0 -; RV32ZBA-NEXT: slti a0, a0, 0 +; RV32ZBA-NEXT: xor a1, a1, a5 +; RV32ZBA-NEXT: and a1, a3, a1 +; RV32ZBA-NEXT: slti a0, a1, 0 ; RV32ZBA-NEXT: sw a2, 0(a4) ; RV32ZBA-NEXT: sw a5, 4(a4) ; RV32ZBA-NEXT: 
ret @@ -295,13 +295,13 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: add a5, a1, a3 ; RV32ZICOND-NEXT: add a2, a0, a2 +; RV32ZICOND-NEXT: xor a3, a1, a3 ; RV32ZICOND-NEXT: sltu a0, a2, a0 +; RV32ZICOND-NEXT: not a3, a3 ; RV32ZICOND-NEXT: add a5, a5, a0 -; RV32ZICOND-NEXT: xor a0, a1, a5 -; RV32ZICOND-NEXT: xor a1, a1, a3 -; RV32ZICOND-NEXT: not a1, a1 -; RV32ZICOND-NEXT: and a0, a1, a0 -; RV32ZICOND-NEXT: slti a0, a0, 0 +; RV32ZICOND-NEXT: xor a1, a1, a5 +; RV32ZICOND-NEXT: and a1, a3, a1 +; RV32ZICOND-NEXT: slti a0, a1, 0 ; RV32ZICOND-NEXT: sw a2, 0(a4) ; RV32ZICOND-NEXT: sw a5, 4(a4) ; RV32ZICOND-NEXT: ret @@ -326,14 +326,14 @@ define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) { ; RV32-LABEL: saddo2.i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi a3, a0, 4 +; RV32-NEXT: not a4, a1 ; RV32-NEXT: sltu a0, a3, a0 -; RV32-NEXT: add a4, a1, a0 -; RV32-NEXT: xor a0, a1, a4 -; RV32-NEXT: not a1, a1 -; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: slti a0, a0, 0 +; RV32-NEXT: add a5, a1, a0 +; RV32-NEXT: xor a1, a1, a5 +; RV32-NEXT: and a1, a4, a1 +; RV32-NEXT: slti a0, a1, 0 ; RV32-NEXT: sw a3, 0(a2) -; RV32-NEXT: sw a4, 4(a2) +; RV32-NEXT: sw a5, 4(a2) ; RV32-NEXT: ret ; ; RV64-LABEL: saddo2.i64: @@ -346,14 +346,14 @@ define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) { ; RV32ZBA-LABEL: saddo2.i64: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: addi a3, a0, 4 +; RV32ZBA-NEXT: not a4, a1 ; RV32ZBA-NEXT: sltu a0, a3, a0 -; RV32ZBA-NEXT: add a4, a1, a0 -; RV32ZBA-NEXT: xor a0, a1, a4 -; RV32ZBA-NEXT: not a1, a1 -; RV32ZBA-NEXT: and a0, a1, a0 -; RV32ZBA-NEXT: slti a0, a0, 0 +; RV32ZBA-NEXT: add a5, a1, a0 +; RV32ZBA-NEXT: xor a1, a1, a5 +; RV32ZBA-NEXT: and a1, a4, a1 +; RV32ZBA-NEXT: slti a0, a1, 0 ; RV32ZBA-NEXT: sw a3, 0(a2) -; RV32ZBA-NEXT: sw a4, 4(a2) +; RV32ZBA-NEXT: sw a5, 4(a2) ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: saddo2.i64: @@ -366,14 +366,14 @@ define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) { ; RV32ZICOND-LABEL: saddo2.i64: ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: addi a3, a0, 4 +; RV32ZICOND-NEXT: not a4, a1 ; RV32ZICOND-NEXT: sltu a0, a3, a0 -; RV32ZICOND-NEXT: add a4, a1, a0 -; RV32ZICOND-NEXT: xor a0, a1, a4 -; RV32ZICOND-NEXT: not a1, a1 -; RV32ZICOND-NEXT: and a0, a1, a0 -; RV32ZICOND-NEXT: slti a0, a0, 0 +; RV32ZICOND-NEXT: add a5, a1, a0 +; RV32ZICOND-NEXT: xor a1, a1, a5 +; RV32ZICOND-NEXT: and a1, a4, a1 +; RV32ZICOND-NEXT: slti a0, a1, 0 ; RV32ZICOND-NEXT: sw a3, 0(a2) -; RV32ZICOND-NEXT: sw a4, 4(a2) +; RV32ZICOND-NEXT: sw a5, 4(a2) ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: saddo2.i64: @@ -862,15 +862,14 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32: # %bb.0: # %entry ; RV32-NEXT: sltu a5, a0, a2 ; RV32-NEXT: sub a6, a1, a3 +; RV32-NEXT: xor a3, a1, a3 +; RV32-NEXT: sub a2, a0, a2 ; RV32-NEXT: sub a5, a6, a5 -; RV32-NEXT: xor a6, a1, a5 -; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: and a1, a1, a6 -; RV32-NEXT: slti a1, a1, 0 -; RV32-NEXT: sub a0, a0, a2 -; RV32-NEXT: sw a0, 0(a4) +; RV32-NEXT: xor a1, a1, a5 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: slti a0, a1, 0 +; RV32-NEXT: sw a2, 0(a4) ; RV32-NEXT: sw a5, 4(a4) -; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: ssubo.i64: @@ -886,15 +885,14 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: sltu a5, a0, a2 ; RV32ZBA-NEXT: sub a6, a1, a3 +; RV32ZBA-NEXT: xor a3, a1, a3 +; RV32ZBA-NEXT: sub a2, a0, a2 ; RV32ZBA-NEXT: sub a5, a6, a5 -; RV32ZBA-NEXT: xor a6, a1, a5 -; 
RV32ZBA-NEXT: xor a1, a1, a3 -; RV32ZBA-NEXT: and a1, a1, a6 -; RV32ZBA-NEXT: slti a1, a1, 0 -; RV32ZBA-NEXT: sub a0, a0, a2 -; RV32ZBA-NEXT: sw a0, 0(a4) +; RV32ZBA-NEXT: xor a1, a1, a5 +; RV32ZBA-NEXT: and a1, a3, a1 +; RV32ZBA-NEXT: slti a0, a1, 0 +; RV32ZBA-NEXT: sw a2, 0(a4) ; RV32ZBA-NEXT: sw a5, 4(a4) -; RV32ZBA-NEXT: mv a0, a1 ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: ssubo.i64: @@ -910,15 +908,14 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: sltu a5, a0, a2 ; RV32ZICOND-NEXT: sub a6, a1, a3 +; RV32ZICOND-NEXT: xor a3, a1, a3 +; RV32ZICOND-NEXT: sub a2, a0, a2 ; RV32ZICOND-NEXT: sub a5, a6, a5 -; RV32ZICOND-NEXT: xor a6, a1, a5 -; RV32ZICOND-NEXT: xor a1, a1, a3 -; RV32ZICOND-NEXT: and a1, a1, a6 -; RV32ZICOND-NEXT: slti a1, a1, 0 -; RV32ZICOND-NEXT: sub a0, a0, a2 -; RV32ZICOND-NEXT: sw a0, 0(a4) +; RV32ZICOND-NEXT: xor a1, a1, a5 +; RV32ZICOND-NEXT: and a1, a3, a1 +; RV32ZICOND-NEXT: slti a0, a1, 0 +; RV32ZICOND-NEXT: sw a2, 0(a4) ; RV32ZICOND-NEXT: sw a5, 4(a4) -; RV32ZICOND-NEXT: mv a0, a1 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: ssubo.i64: @@ -1152,12 +1149,12 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: sltu a5, a0, a2 ; RV32ZICOND-NEXT: sub a3, a1, a3 +; RV32ZICOND-NEXT: sub a2, a0, a2 ; RV32ZICOND-NEXT: sub a3, a3, a5 +; RV32ZICOND-NEXT: sltu a0, a0, a2 ; RV32ZICOND-NEXT: xor a5, a3, a1 ; RV32ZICOND-NEXT: sltu a1, a1, a3 ; RV32ZICOND-NEXT: czero.eqz a1, a1, a5 -; RV32ZICOND-NEXT: sub a2, a0, a2 -; RV32ZICOND-NEXT: sltu a0, a0, a2 ; RV32ZICOND-NEXT: czero.nez a0, a0, a5 ; RV32ZICOND-NEXT: or a0, a0, a1 ; RV32ZICOND-NEXT: sw a2, 0(a4) @@ -1268,8 +1265,8 @@ define zeroext i1 @smulo2.i32(i32 signext %v1, ptr %res) { ; RV32ZBA-LABEL: smulo2.i32: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: li a2, 13 -; RV32ZBA-NEXT: mulh a2, a0, a2 ; RV32ZBA-NEXT: sh1add a3, a0, a0 +; RV32ZBA-NEXT: mulh a2, a0, a2 ; RV32ZBA-NEXT: sh2add a3, a3, a0 ; RV32ZBA-NEXT: srai a0, a3, 31 ; RV32ZBA-NEXT: xor a0, a2, a0 @@ -1324,54 +1321,53 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: .cfi_offset s1, -8 -; RV32-NEXT: mulhu a5, a0, a2 -; RV32-NEXT: mul a6, a1, a2 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: sltu a6, a5, a6 -; RV32-NEXT: mulhu a7, a1, a2 -; RV32-NEXT: add a6, a7, a6 -; RV32-NEXT: mul a7, a0, a3 -; RV32-NEXT: add a5, a7, a5 -; RV32-NEXT: sltu a7, a5, a7 -; RV32-NEXT: mulhu t0, a0, a3 -; RV32-NEXT: add a7, t0, a7 -; RV32-NEXT: add a7, a6, a7 -; RV32-NEXT: mul t0, a1, a3 -; RV32-NEXT: add t1, t0, a7 -; RV32-NEXT: srai t2, a1, 31 -; RV32-NEXT: mul t3, a2, t2 +; RV32-NEXT: mulhu a6, a0, a2 +; RV32-NEXT: mul a7, a1, a2 +; RV32-NEXT: mulhu t0, a1, a2 +; RV32-NEXT: mul t1, a0, a3 +; RV32-NEXT: mulhu t2, a0, a3 +; RV32-NEXT: mul a5, a1, a3 +; RV32-NEXT: srai t3, a1, 31 ; RV32-NEXT: srai t4, a3, 31 -; RV32-NEXT: mul t5, t4, a0 -; RV32-NEXT: add t6, t5, t3 -; RV32-NEXT: add s0, t1, t6 -; RV32-NEXT: sltu s1, s0, t1 -; RV32-NEXT: sltu t0, t1, t0 -; RV32-NEXT: sltu a6, a7, a6 -; RV32-NEXT: mulhu a7, a1, a3 +; RV32-NEXT: mulhu t5, a1, a3 +; RV32-NEXT: mul t6, a0, a2 ; RV32-NEXT: add a6, a7, a6 -; RV32-NEXT: add a6, a6, t0 -; RV32-NEXT: mulhu a7, a2, t2 -; RV32-NEXT: add a7, a7, t3 -; RV32-NEXT: mul a3, a3, t2 -; RV32-NEXT: add a3, a7, a3 +; RV32-NEXT: mul s0, a2, t3 +; RV32-NEXT: mul s1, t4, a0 +; RV32-NEXT: mulhu a2, a2, t3 +; RV32-NEXT: mul a3, a3, t3 ; RV32-NEXT: 
mul a1, t4, a1 -; RV32-NEXT: mulhu a7, t4, a0 +; RV32-NEXT: mulhu a0, t4, a0 +; RV32-NEXT: sltu a7, a6, a7 +; RV32-NEXT: add a6, t1, a6 +; RV32-NEXT: add t3, s1, s0 +; RV32-NEXT: add a2, a2, s0 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a7, t0, a7 +; RV32-NEXT: sltu a1, a6, t1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add a0, a0, s1 +; RV32-NEXT: sltu a3, t3, s1 +; RV32-NEXT: srai t0, a6, 31 +; RV32-NEXT: add a1, t2, a1 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add a1, a7, a1 -; RV32-NEXT: add a1, a1, t5 -; RV32-NEXT: add a1, a1, a3 -; RV32-NEXT: sltu a3, t6, t5 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: add a2, a5, a1 +; RV32-NEXT: sltu a1, a1, a7 +; RV32-NEXT: add t3, a2, t3 +; RV32-NEXT: sltu a3, a2, a5 +; RV32-NEXT: add a1, t5, a1 +; RV32-NEXT: sltu a2, t3, a2 ; RV32-NEXT: add a1, a1, a3 -; RV32-NEXT: add a1, a6, a1 -; RV32-NEXT: add a1, a1, s1 -; RV32-NEXT: srai a3, a5, 31 -; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: xor a3, s0, a3 -; RV32-NEXT: or a1, a3, a1 -; RV32-NEXT: snez a1, a1 -; RV32-NEXT: mul a0, a0, a2 -; RV32-NEXT: sw a0, 0(a4) -; RV32-NEXT: sw a5, 4(a4) -; RV32-NEXT: mv a0, a1 +; RV32-NEXT: xor a3, t3, t0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: xor a0, a0, t0 +; RV32-NEXT: or a0, a3, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: sw t6, 0(a4) +; RV32-NEXT: sw a6, 4(a4) ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore s0 @@ -1398,54 +1394,53 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 ; RV32ZBA-NEXT: .cfi_offset s1, -8 -; RV32ZBA-NEXT: mulhu a5, a0, a2 -; RV32ZBA-NEXT: mul a6, a1, a2 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: sltu a6, a5, a6 -; RV32ZBA-NEXT: mulhu a7, a1, a2 -; RV32ZBA-NEXT: add a6, a7, a6 -; RV32ZBA-NEXT: mul a7, a0, a3 -; RV32ZBA-NEXT: add a5, a7, a5 -; RV32ZBA-NEXT: sltu a7, a5, a7 -; RV32ZBA-NEXT: mulhu t0, a0, a3 -; RV32ZBA-NEXT: add a7, t0, a7 -; RV32ZBA-NEXT: add a7, a6, a7 -; RV32ZBA-NEXT: mul t0, a1, a3 -; RV32ZBA-NEXT: add t1, t0, a7 -; RV32ZBA-NEXT: srai t2, a1, 31 -; RV32ZBA-NEXT: mul t3, a2, t2 +; RV32ZBA-NEXT: mulhu a6, a0, a2 +; RV32ZBA-NEXT: mul a7, a1, a2 +; RV32ZBA-NEXT: mulhu t0, a1, a2 +; RV32ZBA-NEXT: mul t1, a0, a3 +; RV32ZBA-NEXT: mulhu t2, a0, a3 +; RV32ZBA-NEXT: mul a5, a1, a3 +; RV32ZBA-NEXT: srai t3, a1, 31 ; RV32ZBA-NEXT: srai t4, a3, 31 -; RV32ZBA-NEXT: mul t5, t4, a0 -; RV32ZBA-NEXT: add t6, t5, t3 -; RV32ZBA-NEXT: add s0, t1, t6 -; RV32ZBA-NEXT: sltu s1, s0, t1 -; RV32ZBA-NEXT: sltu t0, t1, t0 -; RV32ZBA-NEXT: sltu a6, a7, a6 -; RV32ZBA-NEXT: mulhu a7, a1, a3 +; RV32ZBA-NEXT: mulhu t5, a1, a3 +; RV32ZBA-NEXT: mul t6, a0, a2 ; RV32ZBA-NEXT: add a6, a7, a6 -; RV32ZBA-NEXT: add a6, a6, t0 -; RV32ZBA-NEXT: mulhu a7, a2, t2 -; RV32ZBA-NEXT: add a7, a7, t3 -; RV32ZBA-NEXT: mul a3, a3, t2 -; RV32ZBA-NEXT: add a3, a7, a3 +; RV32ZBA-NEXT: mul s0, a2, t3 +; RV32ZBA-NEXT: mul s1, t4, a0 +; RV32ZBA-NEXT: mulhu a2, a2, t3 +; RV32ZBA-NEXT: mul a3, a3, t3 ; RV32ZBA-NEXT: mul a1, t4, a1 -; RV32ZBA-NEXT: mulhu a7, t4, a0 +; RV32ZBA-NEXT: mulhu a0, t4, a0 +; RV32ZBA-NEXT: sltu a7, a6, a7 +; RV32ZBA-NEXT: add a6, t1, a6 +; RV32ZBA-NEXT: add t3, s1, s0 +; RV32ZBA-NEXT: add a2, a2, s0 +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: add a7, t0, a7 +; RV32ZBA-NEXT: sltu a1, a6, t1 +; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: add a0, a0, s1 +; RV32ZBA-NEXT: sltu a3, t3, s1 +; RV32ZBA-NEXT: srai t0, a6, 31 +; RV32ZBA-NEXT: add a1, 
t2, a1 +; RV32ZBA-NEXT: add a0, a0, a2 ; RV32ZBA-NEXT: add a1, a7, a1 -; RV32ZBA-NEXT: add a1, a1, t5 -; RV32ZBA-NEXT: add a1, a1, a3 -; RV32ZBA-NEXT: sltu a3, t6, t5 +; RV32ZBA-NEXT: add a0, a0, a3 +; RV32ZBA-NEXT: add a2, a5, a1 +; RV32ZBA-NEXT: sltu a1, a1, a7 +; RV32ZBA-NEXT: add t3, a2, t3 +; RV32ZBA-NEXT: sltu a3, a2, a5 +; RV32ZBA-NEXT: add a1, t5, a1 +; RV32ZBA-NEXT: sltu a2, t3, a2 ; RV32ZBA-NEXT: add a1, a1, a3 -; RV32ZBA-NEXT: add a1, a6, a1 -; RV32ZBA-NEXT: add a1, a1, s1 -; RV32ZBA-NEXT: srai a3, a5, 31 -; RV32ZBA-NEXT: xor a1, a1, a3 -; RV32ZBA-NEXT: xor a3, s0, a3 -; RV32ZBA-NEXT: or a1, a3, a1 -; RV32ZBA-NEXT: snez a1, a1 -; RV32ZBA-NEXT: mul a0, a0, a2 -; RV32ZBA-NEXT: sw a0, 0(a4) -; RV32ZBA-NEXT: sw a5, 4(a4) -; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: xor a3, t3, t0 +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: xor a0, a0, t0 +; RV32ZBA-NEXT: or a0, a3, a0 +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: sw t6, 0(a4) +; RV32ZBA-NEXT: sw a6, 4(a4) ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: .cfi_restore s0 @@ -1472,54 +1467,53 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZICOND-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZICOND-NEXT: .cfi_offset s0, -4 ; RV32ZICOND-NEXT: .cfi_offset s1, -8 -; RV32ZICOND-NEXT: mulhu a5, a0, a2 -; RV32ZICOND-NEXT: mul a6, a1, a2 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: sltu a6, a5, a6 -; RV32ZICOND-NEXT: mulhu a7, a1, a2 -; RV32ZICOND-NEXT: add a6, a7, a6 -; RV32ZICOND-NEXT: mul a7, a0, a3 -; RV32ZICOND-NEXT: add a5, a7, a5 -; RV32ZICOND-NEXT: sltu a7, a5, a7 -; RV32ZICOND-NEXT: mulhu t0, a0, a3 -; RV32ZICOND-NEXT: add a7, t0, a7 -; RV32ZICOND-NEXT: add a7, a6, a7 -; RV32ZICOND-NEXT: mul t0, a1, a3 -; RV32ZICOND-NEXT: add t1, t0, a7 -; RV32ZICOND-NEXT: srai t2, a1, 31 -; RV32ZICOND-NEXT: mul t3, a2, t2 +; RV32ZICOND-NEXT: mulhu a6, a0, a2 +; RV32ZICOND-NEXT: mul a7, a1, a2 +; RV32ZICOND-NEXT: mulhu t0, a1, a2 +; RV32ZICOND-NEXT: mul t1, a0, a3 +; RV32ZICOND-NEXT: mulhu t2, a0, a3 +; RV32ZICOND-NEXT: mul a5, a1, a3 +; RV32ZICOND-NEXT: srai t3, a1, 31 ; RV32ZICOND-NEXT: srai t4, a3, 31 -; RV32ZICOND-NEXT: mul t5, t4, a0 -; RV32ZICOND-NEXT: add t6, t5, t3 -; RV32ZICOND-NEXT: add s0, t1, t6 -; RV32ZICOND-NEXT: sltu s1, s0, t1 -; RV32ZICOND-NEXT: sltu t0, t1, t0 -; RV32ZICOND-NEXT: sltu a6, a7, a6 -; RV32ZICOND-NEXT: mulhu a7, a1, a3 +; RV32ZICOND-NEXT: mulhu t5, a1, a3 +; RV32ZICOND-NEXT: mul t6, a0, a2 ; RV32ZICOND-NEXT: add a6, a7, a6 -; RV32ZICOND-NEXT: add a6, a6, t0 -; RV32ZICOND-NEXT: mulhu a7, a2, t2 -; RV32ZICOND-NEXT: add a7, a7, t3 -; RV32ZICOND-NEXT: mul a3, a3, t2 -; RV32ZICOND-NEXT: add a3, a7, a3 +; RV32ZICOND-NEXT: mul s0, a2, t3 +; RV32ZICOND-NEXT: mul s1, t4, a0 +; RV32ZICOND-NEXT: mulhu a2, a2, t3 +; RV32ZICOND-NEXT: mul a3, a3, t3 ; RV32ZICOND-NEXT: mul a1, t4, a1 -; RV32ZICOND-NEXT: mulhu a7, t4, a0 +; RV32ZICOND-NEXT: mulhu a0, t4, a0 +; RV32ZICOND-NEXT: sltu a7, a6, a7 +; RV32ZICOND-NEXT: add a6, t1, a6 +; RV32ZICOND-NEXT: add t3, s1, s0 +; RV32ZICOND-NEXT: add a2, a2, s0 +; RV32ZICOND-NEXT: add a0, a0, a1 +; RV32ZICOND-NEXT: add a7, t0, a7 +; RV32ZICOND-NEXT: sltu a1, a6, t1 +; RV32ZICOND-NEXT: add a2, a2, a3 +; RV32ZICOND-NEXT: add a0, a0, s1 +; RV32ZICOND-NEXT: sltu a3, t3, s1 +; RV32ZICOND-NEXT: srai t0, a6, 31 +; RV32ZICOND-NEXT: add a1, t2, a1 +; RV32ZICOND-NEXT: add a0, a0, a2 ; RV32ZICOND-NEXT: add a1, a7, a1 -; RV32ZICOND-NEXT: add a1, a1, t5 +; RV32ZICOND-NEXT: add 
a0, a0, a3 +; RV32ZICOND-NEXT: add a2, a5, a1 +; RV32ZICOND-NEXT: sltu a1, a1, a7 +; RV32ZICOND-NEXT: add t3, a2, t3 +; RV32ZICOND-NEXT: sltu a3, a2, a5 +; RV32ZICOND-NEXT: add a1, t5, a1 +; RV32ZICOND-NEXT: sltu a2, t3, a2 ; RV32ZICOND-NEXT: add a1, a1, a3 -; RV32ZICOND-NEXT: sltu a3, t6, t5 -; RV32ZICOND-NEXT: add a1, a1, a3 -; RV32ZICOND-NEXT: add a1, a6, a1 -; RV32ZICOND-NEXT: add a1, a1, s1 -; RV32ZICOND-NEXT: srai a3, a5, 31 -; RV32ZICOND-NEXT: xor a1, a1, a3 -; RV32ZICOND-NEXT: xor a3, s0, a3 -; RV32ZICOND-NEXT: or a1, a3, a1 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: mul a0, a0, a2 -; RV32ZICOND-NEXT: sw a0, 0(a4) -; RV32ZICOND-NEXT: sw a5, 4(a4) -; RV32ZICOND-NEXT: mv a0, a1 +; RV32ZICOND-NEXT: xor a3, t3, t0 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: add a0, a0, a2 +; RV32ZICOND-NEXT: xor a0, a0, t0 +; RV32ZICOND-NEXT: or a0, a3, a0 +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: sw t6, 0(a4) +; RV32ZICOND-NEXT: sw a6, 4(a4) ; RV32ZICOND-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZICOND-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZICOND-NEXT: .cfi_restore s0 @@ -1549,27 +1543,26 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) { ; RV32-LABEL: smulo2.i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: li a3, 13 -; RV32-NEXT: mulhu a4, a0, a3 -; RV32-NEXT: mul a5, a1, a3 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: sltu a5, a4, a5 -; RV32-NEXT: mulhu a6, a1, a3 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: srai a1, a1, 31 +; RV32-NEXT: srai a4, a1, 31 +; RV32-NEXT: mulhu a5, a0, a3 ; RV32-NEXT: mul a6, a1, a3 -; RV32-NEXT: add a6, a5, a6 -; RV32-NEXT: srai a7, a4, 31 -; RV32-NEXT: xor t0, a6, a7 -; RV32-NEXT: sltu a5, a6, a5 -; RV32-NEXT: mulh a1, a1, a3 -; RV32-NEXT: add a1, a1, a5 -; RV32-NEXT: xor a1, a1, a7 -; RV32-NEXT: or a1, t0, a1 -; RV32-NEXT: snez a1, a1 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: sw a0, 0(a2) -; RV32-NEXT: sw a4, 4(a2) -; RV32-NEXT: mv a0, a1 +; RV32-NEXT: mulhu a1, a1, a3 +; RV32-NEXT: mul a7, a4, a3 +; RV32-NEXT: mulh a4, a4, a3 +; RV32-NEXT: mul a3, a0, a3 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: sltu a0, a5, a6 +; RV32-NEXT: srai a6, a5, 31 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: add a7, a0, a7 +; RV32-NEXT: xor a1, a7, a6 +; RV32-NEXT: sltu a0, a7, a0 +; RV32-NEXT: add a0, a4, a0 +; RV32-NEXT: xor a0, a0, a6 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: sw a3, 0(a2) +; RV32-NEXT: sw a5, 4(a2) ; RV32-NEXT: ret ; ; RV64-LABEL: smulo2.i64: @@ -1586,37 +1579,36 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) { ; RV32ZBA-LABEL: smulo2.i64: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: li a3, 13 -; RV32ZBA-NEXT: mulhu a4, a0, a3 -; RV32ZBA-NEXT: sh1add a5, a1, a1 -; RV32ZBA-NEXT: sh2add a5, a5, a1 -; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: sltu a5, a4, a5 -; RV32ZBA-NEXT: mulhu a6, a1, a3 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: srai a1, a1, 31 -; RV32ZBA-NEXT: sh1add a6, a1, a1 -; RV32ZBA-NEXT: sh2add a6, a6, a1 -; RV32ZBA-NEXT: add a6, a5, a6 -; RV32ZBA-NEXT: srai a7, a4, 31 -; RV32ZBA-NEXT: xor t0, a6, a7 -; RV32ZBA-NEXT: sltu a5, a6, a5 -; RV32ZBA-NEXT: mulh a1, a1, a3 -; RV32ZBA-NEXT: add a1, a1, a5 -; RV32ZBA-NEXT: xor a1, a1, a7 -; RV32ZBA-NEXT: or a1, t0, a1 -; RV32ZBA-NEXT: snez a1, a1 -; RV32ZBA-NEXT: sh1add a3, a0, a0 -; RV32ZBA-NEXT: sh2add a0, a3, a0 -; RV32ZBA-NEXT: sw a0, 0(a2) -; RV32ZBA-NEXT: sw a4, 4(a2) -; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: sh1add a4, a1, a1 +; RV32ZBA-NEXT: srai a5, a1, 31 +; RV32ZBA-NEXT: sh1add a6, a0, a0 +; RV32ZBA-NEXT: mulhu a7, a0, a3 +; 
RV32ZBA-NEXT: sh2add a4, a4, a1 +; RV32ZBA-NEXT: mulhu a1, a1, a3 +; RV32ZBA-NEXT: sh1add t0, a5, a5 +; RV32ZBA-NEXT: mulh a3, a5, a3 +; RV32ZBA-NEXT: sh2add a6, a6, a0 +; RV32ZBA-NEXT: add a7, a4, a7 +; RV32ZBA-NEXT: sh2add a0, t0, a5 +; RV32ZBA-NEXT: sltu a4, a7, a4 +; RV32ZBA-NEXT: srai a5, a7, 31 +; RV32ZBA-NEXT: add a1, a1, a4 +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: xor a4, a0, a5 +; RV32ZBA-NEXT: sltu a0, a0, a1 +; RV32ZBA-NEXT: add a0, a3, a0 +; RV32ZBA-NEXT: xor a0, a0, a5 +; RV32ZBA-NEXT: or a0, a4, a0 +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: sw a6, 0(a2) +; RV32ZBA-NEXT: sw a7, 4(a2) ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo2.i64: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: li a2, 13 -; RV64ZBA-NEXT: mulh a2, a0, a2 ; RV64ZBA-NEXT: sh1add a3, a0, a0 +; RV64ZBA-NEXT: mulh a2, a0, a2 ; RV64ZBA-NEXT: sh2add a3, a3, a0 ; RV64ZBA-NEXT: srai a0, a3, 63 ; RV64ZBA-NEXT: xor a0, a2, a0 @@ -1627,27 +1619,26 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) { ; RV32ZICOND-LABEL: smulo2.i64: ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: li a3, 13 -; RV32ZICOND-NEXT: mulhu a4, a0, a3 -; RV32ZICOND-NEXT: mul a5, a1, a3 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: sltu a5, a4, a5 -; RV32ZICOND-NEXT: mulhu a6, a1, a3 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: srai a1, a1, 31 +; RV32ZICOND-NEXT: srai a4, a1, 31 +; RV32ZICOND-NEXT: mulhu a5, a0, a3 ; RV32ZICOND-NEXT: mul a6, a1, a3 -; RV32ZICOND-NEXT: add a6, a5, a6 -; RV32ZICOND-NEXT: srai a7, a4, 31 -; RV32ZICOND-NEXT: xor t0, a6, a7 -; RV32ZICOND-NEXT: sltu a5, a6, a5 -; RV32ZICOND-NEXT: mulh a1, a1, a3 -; RV32ZICOND-NEXT: add a1, a1, a5 -; RV32ZICOND-NEXT: xor a1, a1, a7 -; RV32ZICOND-NEXT: or a1, t0, a1 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: mul a0, a0, a3 -; RV32ZICOND-NEXT: sw a0, 0(a2) -; RV32ZICOND-NEXT: sw a4, 4(a2) -; RV32ZICOND-NEXT: mv a0, a1 +; RV32ZICOND-NEXT: mulhu a1, a1, a3 +; RV32ZICOND-NEXT: mul a7, a4, a3 +; RV32ZICOND-NEXT: mulh a4, a4, a3 +; RV32ZICOND-NEXT: mul a3, a0, a3 +; RV32ZICOND-NEXT: add a5, a6, a5 +; RV32ZICOND-NEXT: sltu a0, a5, a6 +; RV32ZICOND-NEXT: srai a6, a5, 31 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: add a7, a0, a7 +; RV32ZICOND-NEXT: xor a1, a7, a6 +; RV32ZICOND-NEXT: sltu a0, a7, a0 +; RV32ZICOND-NEXT: add a0, a4, a0 +; RV32ZICOND-NEXT: xor a0, a0, a6 +; RV32ZICOND-NEXT: or a0, a1, a0 +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: sw a3, 0(a2) +; RV32ZICOND-NEXT: sw a5, 4(a2) ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: smulo2.i64: @@ -1758,9 +1749,9 @@ define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) { ; RV32ZBA-LABEL: umulo2.i32: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: li a2, 13 +; RV32ZBA-NEXT: sh1add a3, a0, a0 ; RV32ZBA-NEXT: mulhu a2, a0, a2 ; RV32ZBA-NEXT: snez a2, a2 -; RV32ZBA-NEXT: sh1add a3, a0, a0 ; RV32ZBA-NEXT: sh2add a0, a3, a0 ; RV32ZBA-NEXT: sw a0, 0(a1) ; RV32ZBA-NEXT: mv a0, a2 @@ -1878,24 +1869,23 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a5, a3, a0 ; RV32-NEXT: mul a6, a1, a2 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: mulhu a6, a0, a2 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: sltu a6, a5, a6 -; RV32-NEXT: snez a7, a3 -; RV32-NEXT: snez t0, a1 -; RV32-NEXT: and a7, t0, a7 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: snez a1, a1 -; RV32-NEXT: or a1, a7, a1 +; RV32-NEXT: mulhu a7, a0, a2 +; RV32-NEXT: snez t0, a3 ; RV32-NEXT: mulhu a3, a3, a0 -; RV32-NEXT: snez a3, a3 -; RV32-NEXT: or a1, a1, a3 -; RV32-NEXT: or a1, a1, a6 -; 
RV32-NEXT: mul a0, a0, a2 -; RV32-NEXT: sw a0, 0(a4) +; RV32-NEXT: mul t1, a0, a2 +; RV32-NEXT: mulhu a0, a1, a2 +; RV32-NEXT: snez a1, a1 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: and a1, a1, t0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: snez a2, a3 +; RV32-NEXT: add a5, a7, a5 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sltu a1, a5, a7 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: sw t1, 0(a4) ; RV32-NEXT: sw a5, 4(a4) -; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: umulo.i64: @@ -1911,24 +1901,23 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a5, a3, a0 ; RV32ZBA-NEXT: mul a6, a1, a2 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: mulhu a6, a0, a2 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: sltu a6, a5, a6 -; RV32ZBA-NEXT: snez a7, a3 -; RV32ZBA-NEXT: snez t0, a1 -; RV32ZBA-NEXT: and a7, t0, a7 -; RV32ZBA-NEXT: mulhu a1, a1, a2 -; RV32ZBA-NEXT: snez a1, a1 -; RV32ZBA-NEXT: or a1, a7, a1 +; RV32ZBA-NEXT: mulhu a7, a0, a2 +; RV32ZBA-NEXT: snez t0, a3 ; RV32ZBA-NEXT: mulhu a3, a3, a0 -; RV32ZBA-NEXT: snez a3, a3 -; RV32ZBA-NEXT: or a1, a1, a3 -; RV32ZBA-NEXT: or a1, a1, a6 -; RV32ZBA-NEXT: mul a0, a0, a2 -; RV32ZBA-NEXT: sw a0, 0(a4) +; RV32ZBA-NEXT: mul t1, a0, a2 +; RV32ZBA-NEXT: mulhu a0, a1, a2 +; RV32ZBA-NEXT: snez a1, a1 +; RV32ZBA-NEXT: add a5, a6, a5 +; RV32ZBA-NEXT: and a1, a1, t0 +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: snez a2, a3 +; RV32ZBA-NEXT: add a5, a7, a5 +; RV32ZBA-NEXT: or a0, a1, a0 +; RV32ZBA-NEXT: sltu a1, a5, a7 +; RV32ZBA-NEXT: or a0, a0, a2 +; RV32ZBA-NEXT: or a0, a0, a1 +; RV32ZBA-NEXT: sw t1, 0(a4) ; RV32ZBA-NEXT: sw a5, 4(a4) -; RV32ZBA-NEXT: mv a0, a1 ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo.i64: @@ -1944,24 +1933,23 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a5, a3, a0 ; RV32ZICOND-NEXT: mul a6, a1, a2 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: mulhu a6, a0, a2 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: sltu a6, a5, a6 -; RV32ZICOND-NEXT: snez a7, a3 -; RV32ZICOND-NEXT: snez t0, a1 -; RV32ZICOND-NEXT: and a7, t0, a7 -; RV32ZICOND-NEXT: mulhu a1, a1, a2 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: or a1, a7, a1 +; RV32ZICOND-NEXT: mulhu a7, a0, a2 +; RV32ZICOND-NEXT: snez t0, a3 ; RV32ZICOND-NEXT: mulhu a3, a3, a0 -; RV32ZICOND-NEXT: snez a3, a3 -; RV32ZICOND-NEXT: or a1, a1, a3 -; RV32ZICOND-NEXT: or a1, a1, a6 -; RV32ZICOND-NEXT: mul a0, a0, a2 -; RV32ZICOND-NEXT: sw a0, 0(a4) +; RV32ZICOND-NEXT: mul t1, a0, a2 +; RV32ZICOND-NEXT: mulhu a0, a1, a2 +; RV32ZICOND-NEXT: snez a1, a1 +; RV32ZICOND-NEXT: add a5, a6, a5 +; RV32ZICOND-NEXT: and a1, a1, t0 +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: snez a2, a3 +; RV32ZICOND-NEXT: add a5, a7, a5 +; RV32ZICOND-NEXT: or a0, a1, a0 +; RV32ZICOND-NEXT: sltu a1, a5, a7 +; RV32ZICOND-NEXT: or a0, a0, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: sw t1, 0(a4) ; RV32ZICOND-NEXT: sw a5, 4(a4) -; RV32ZICOND-NEXT: mv a0, a1 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: umulo.i64: @@ -1986,15 +1974,14 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) { ; RV32-NEXT: li a3, 13 ; RV32-NEXT: mul a4, a1, a3 ; RV32-NEXT: mulhu a5, a0, a3 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: sltu a5, a4, a5 ; RV32-NEXT: mulhu a1, a1, a3 -; RV32-NEXT: snez a1, a1 -; RV32-NEXT: or a1, a1, a5 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: sw a0, 0(a2) +; RV32-NEXT: mul a3, a0, a3 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: 
snez a0, a1 +; RV32-NEXT: sltu a1, a4, a5 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: sw a3, 0(a2) ; RV32-NEXT: sw a4, 4(a2) -; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: umulo2.i64: @@ -2010,27 +1997,26 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) { ; RV32ZBA-LABEL: umulo2.i64: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: li a3, 13 -; RV32ZBA-NEXT: mulhu a4, a0, a3 -; RV32ZBA-NEXT: sh1add a5, a1, a1 -; RV32ZBA-NEXT: sh2add a5, a5, a1 -; RV32ZBA-NEXT: add a5, a4, a5 -; RV32ZBA-NEXT: sltu a4, a5, a4 +; RV32ZBA-NEXT: sh1add a4, a1, a1 +; RV32ZBA-NEXT: sh1add a5, a0, a0 +; RV32ZBA-NEXT: sh2add a4, a4, a1 ; RV32ZBA-NEXT: mulhu a1, a1, a3 -; RV32ZBA-NEXT: snez a1, a1 -; RV32ZBA-NEXT: or a1, a1, a4 -; RV32ZBA-NEXT: sh1add a3, a0, a0 -; RV32ZBA-NEXT: sh2add a0, a3, a0 -; RV32ZBA-NEXT: sw a0, 0(a2) -; RV32ZBA-NEXT: sw a5, 4(a2) -; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: mulhu a3, a0, a3 +; RV32ZBA-NEXT: sh2add a5, a5, a0 +; RV32ZBA-NEXT: add a4, a3, a4 +; RV32ZBA-NEXT: snez a0, a1 +; RV32ZBA-NEXT: sltu a1, a4, a3 +; RV32ZBA-NEXT: or a0, a0, a1 +; RV32ZBA-NEXT: sw a5, 0(a2) +; RV32ZBA-NEXT: sw a4, 4(a2) ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo2.i64: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: li a2, 13 +; RV64ZBA-NEXT: sh1add a3, a0, a0 ; RV64ZBA-NEXT: mulhu a2, a0, a2 ; RV64ZBA-NEXT: snez a2, a2 -; RV64ZBA-NEXT: sh1add a3, a0, a0 ; RV64ZBA-NEXT: sh2add a0, a3, a0 ; RV64ZBA-NEXT: sd a0, 0(a1) ; RV64ZBA-NEXT: mv a0, a2 @@ -2041,15 +2027,14 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) { ; RV32ZICOND-NEXT: li a3, 13 ; RV32ZICOND-NEXT: mul a4, a1, a3 ; RV32ZICOND-NEXT: mulhu a5, a0, a3 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: sltu a5, a4, a5 ; RV32ZICOND-NEXT: mulhu a1, a1, a3 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: or a1, a1, a5 -; RV32ZICOND-NEXT: mul a0, a0, a3 -; RV32ZICOND-NEXT: sw a0, 0(a2) +; RV32ZICOND-NEXT: mul a3, a0, a3 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: snez a0, a1 +; RV32ZICOND-NEXT: sltu a1, a4, a5 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: sw a3, 0(a2) ; RV32ZICOND-NEXT: sw a4, 4(a2) -; RV32ZICOND-NEXT: mv a0, a1 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: umulo2.i64: @@ -2119,8 +2104,8 @@ define i32 @saddo.select.i32(i32 signext %v1, i32 signext %v2) { ; RV32ZICOND-LABEL: saddo.select.i32: ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: add a2, a0, a1 -; RV32ZICOND-NEXT: slt a2, a2, a0 ; RV32ZICOND-NEXT: slti a3, a1, 0 +; RV32ZICOND-NEXT: slt a2, a2, a0 ; RV32ZICOND-NEXT: xor a2, a3, a2 ; RV32ZICOND-NEXT: czero.nez a1, a1, a2 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a2 @@ -2208,8 +2193,8 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: add a5, a0, a2 ; RV32-NEXT: sltu a5, a5, a0 ; RV32-NEXT: add a4, a4, a5 -; RV32-NEXT: xor a4, a1, a4 ; RV32-NEXT: xor a5, a1, a3 +; RV32-NEXT: xor a4, a1, a4 ; RV32-NEXT: not a5, a5 ; RV32-NEXT: and a4, a5, a4 ; RV32-NEXT: bltz a4, .LBB30_2 @@ -2236,8 +2221,8 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: add a5, a0, a2 ; RV32ZBA-NEXT: sltu a5, a5, a0 ; RV32ZBA-NEXT: add a4, a4, a5 -; RV32ZBA-NEXT: xor a4, a1, a4 ; RV32ZBA-NEXT: xor a5, a1, a3 +; RV32ZBA-NEXT: xor a4, a1, a4 ; RV32ZBA-NEXT: not a5, a5 ; RV32ZBA-NEXT: and a4, a5, a4 ; RV32ZBA-NEXT: bltz a4, .LBB30_2 @@ -2264,24 +2249,24 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: add a5, a0, a2 ; RV32ZICOND-NEXT: sltu a5, a5, a0 ; RV32ZICOND-NEXT: add a4, a4, a5 -; RV32ZICOND-NEXT: xor a4, a1, a4 ; RV32ZICOND-NEXT: xor a5, a1, a3 ; RV32ZICOND-NEXT: not a5, a5 
+; RV32ZICOND-NEXT: xor a4, a1, a4 ; RV32ZICOND-NEXT: and a4, a5, a4 ; RV32ZICOND-NEXT: slti a4, a4, 0 ; RV32ZICOND-NEXT: czero.nez a2, a2, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 -; RV32ZICOND-NEXT: or a0, a0, a2 -; RV32ZICOND-NEXT: czero.nez a2, a3, a4 +; RV32ZICOND-NEXT: czero.nez a3, a3, a4 ; RV32ZICOND-NEXT: czero.eqz a1, a1, a4 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a2 +; RV32ZICOND-NEXT: or a1, a1, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: saddo.select.i64: ; RV64ZICOND: # %bb.0: # %entry ; RV64ZICOND-NEXT: add a2, a0, a1 -; RV64ZICOND-NEXT: slt a2, a2, a0 ; RV64ZICOND-NEXT: slti a3, a1, 0 +; RV64ZICOND-NEXT: slt a2, a2, a0 ; RV64ZICOND-NEXT: xor a2, a3, a2 ; RV64ZICOND-NEXT: czero.nez a1, a1, a2 ; RV64ZICOND-NEXT: czero.eqz a0, a0, a2 @@ -2299,11 +2284,11 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) { ; RV32: # %bb.0: # %entry ; RV32-NEXT: add a4, a1, a3 ; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: xor a3, a1, a3 ; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: add a0, a4, a0 ; RV32-NEXT: xor a0, a1, a0 -; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: not a1, a3 ; RV32-NEXT: and a0, a1, a0 ; RV32-NEXT: slti a0, a0, 0 ; RV32-NEXT: xori a0, a0, 1 @@ -2322,11 +2307,11 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) { ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: add a4, a1, a3 ; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: xor a3, a1, a3 ; RV32ZBA-NEXT: sltu a0, a2, a0 ; RV32ZBA-NEXT: add a0, a4, a0 ; RV32ZBA-NEXT: xor a0, a1, a0 -; RV32ZBA-NEXT: xor a1, a1, a3 -; RV32ZBA-NEXT: not a1, a1 +; RV32ZBA-NEXT: not a1, a3 ; RV32ZBA-NEXT: and a0, a1, a0 ; RV32ZBA-NEXT: slti a0, a0, 0 ; RV32ZBA-NEXT: xori a0, a0, 1 @@ -2345,11 +2330,11 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: add a4, a1, a3 ; RV32ZICOND-NEXT: add a2, a0, a2 +; RV32ZICOND-NEXT: xor a3, a1, a3 ; RV32ZICOND-NEXT: sltu a0, a2, a0 ; RV32ZICOND-NEXT: add a0, a4, a0 ; RV32ZICOND-NEXT: xor a0, a1, a0 -; RV32ZICOND-NEXT: xor a1, a1, a3 -; RV32ZICOND-NEXT: not a1, a1 +; RV32ZICOND-NEXT: not a1, a3 ; RV32ZICOND-NEXT: and a0, a1, a0 ; RV32ZICOND-NEXT: slti a0, a0, 0 ; RV32ZICOND-NEXT: xori a0, a0, 1 @@ -2550,10 +2535,10 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: or a4, a5, a4 ; RV32ZICOND-NEXT: czero.nez a2, a2, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 -; RV32ZICOND-NEXT: or a0, a0, a2 -; RV32ZICOND-NEXT: czero.nez a2, a3, a4 +; RV32ZICOND-NEXT: czero.nez a3, a3, a4 ; RV32ZICOND-NEXT: czero.eqz a1, a1, a4 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a2 +; RV32ZICOND-NEXT: or a1, a1, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: uaddo.select.i64: @@ -2825,16 +2810,16 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: sltu a4, a0, a2 ; RV32ZICOND-NEXT: sub a5, a1, a3 ; RV32ZICOND-NEXT: sub a5, a5, a4 -; RV32ZICOND-NEXT: xor a5, a1, a5 ; RV32ZICOND-NEXT: xor a4, a1, a3 +; RV32ZICOND-NEXT: xor a5, a1, a5 ; RV32ZICOND-NEXT: and a4, a4, a5 ; RV32ZICOND-NEXT: slti a4, a4, 0 ; RV32ZICOND-NEXT: czero.nez a2, a2, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 -; RV32ZICOND-NEXT: or a0, a0, a2 -; RV32ZICOND-NEXT: czero.nez a2, a3, a4 +; RV32ZICOND-NEXT: czero.nez a3, a3, a4 ; RV32ZICOND-NEXT: czero.eqz a1, a1, a4 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a2 +; RV32ZICOND-NEXT: or a1, a1, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: ssubo.select.i64: @@ -3097,20 +3082,20 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; 
RV32ZICOND-NEXT: sltu a4, a0, a2 ; RV32ZICOND-NEXT: sub a5, a1, a3 +; RV32ZICOND-NEXT: sub a6, a0, a2 ; RV32ZICOND-NEXT: sub a5, a5, a4 -; RV32ZICOND-NEXT: xor a4, a5, a1 +; RV32ZICOND-NEXT: sltu a4, a0, a6 +; RV32ZICOND-NEXT: xor a6, a5, a1 ; RV32ZICOND-NEXT: sltu a5, a1, a5 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: sub a6, a0, a2 -; RV32ZICOND-NEXT: sltu a6, a0, a6 -; RV32ZICOND-NEXT: czero.nez a4, a6, a4 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a6 +; RV32ZICOND-NEXT: czero.nez a4, a4, a6 ; RV32ZICOND-NEXT: or a4, a4, a5 ; RV32ZICOND-NEXT: czero.nez a2, a2, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 -; RV32ZICOND-NEXT: or a0, a0, a2 -; RV32ZICOND-NEXT: czero.nez a2, a3, a4 +; RV32ZICOND-NEXT: czero.nez a3, a3, a4 ; RV32ZICOND-NEXT: czero.eqz a1, a1, a4 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a2 +; RV32ZICOND-NEXT: or a1, a1, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: usubo.select.i64: @@ -3179,13 +3164,13 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: sltu a4, a0, a2 ; RV32ZICOND-NEXT: sub a3, a1, a3 -; RV32ZICOND-NEXT: sub a3, a3, a4 -; RV32ZICOND-NEXT: xor a4, a3, a1 -; RV32ZICOND-NEXT: sltu a1, a1, a3 -; RV32ZICOND-NEXT: czero.eqz a1, a1, a4 ; RV32ZICOND-NEXT: sub a2, a0, a2 +; RV32ZICOND-NEXT: sub a3, a3, a4 ; RV32ZICOND-NEXT: sltu a0, a0, a2 -; RV32ZICOND-NEXT: czero.nez a0, a0, a4 +; RV32ZICOND-NEXT: xor a2, a3, a1 +; RV32ZICOND-NEXT: sltu a1, a1, a3 +; RV32ZICOND-NEXT: czero.eqz a1, a1, a2 +; RV32ZICOND-NEXT: czero.nez a0, a0, a2 ; RV32ZICOND-NEXT: or a0, a0, a1 ; RV32ZICOND-NEXT: xori a0, a0, 1 ; RV32ZICOND-NEXT: ret @@ -3340,46 +3325,46 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 +; RV32-NEXT: mulhu a6, a1, a2 +; RV32-NEXT: mul a7, a0, a3 +; RV32-NEXT: mulhu t0, a0, a3 +; RV32-NEXT: mul t1, a1, a3 +; RV32-NEXT: srai t2, a1, 31 +; RV32-NEXT: srai t3, a3, 31 +; RV32-NEXT: mulhu t4, a1, a3 ; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: mul t5, a2, t2 +; RV32-NEXT: mul t6, t3, a0 +; RV32-NEXT: mul s0, t3, a1 +; RV32-NEXT: mulhu t3, t3, a0 +; RV32-NEXT: add t3, t3, s0 +; RV32-NEXT: mulhu s0, a2, t2 +; RV32-NEXT: mul t2, a3, t2 ; RV32-NEXT: sltu a5, a4, a5 -; RV32-NEXT: mulhu a6, a1, a2 +; RV32-NEXT: add a4, a7, a4 +; RV32-NEXT: add s0, s0, t5 +; RV32-NEXT: add t5, t6, t5 ; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: mul a6, a0, a3 -; RV32-NEXT: add a4, a6, a4 -; RV32-NEXT: sltu a6, a4, a6 -; RV32-NEXT: mulhu a7, a0, a3 -; RV32-NEXT: add a6, a7, a6 +; RV32-NEXT: sltu a6, a4, a7 +; RV32-NEXT: add t2, s0, t2 +; RV32-NEXT: add t3, t3, t6 +; RV32-NEXT: sltu a7, t5, t6 +; RV32-NEXT: srai a4, a4, 31 +; RV32-NEXT: add a6, t0, a6 +; RV32-NEXT: add t2, t3, t2 ; RV32-NEXT: add a6, a5, a6 -; RV32-NEXT: mul a7, a1, a3 -; RV32-NEXT: add t0, a7, a6 -; RV32-NEXT: srai t1, a1, 31 -; RV32-NEXT: mul t2, a2, t1 -; RV32-NEXT: srai t3, a3, 31 -; RV32-NEXT: mul t4, t3, a0 -; RV32-NEXT: add t5, t4, t2 -; RV32-NEXT: add t6, t0, t5 -; RV32-NEXT: sltu s0, t6, t0 -; RV32-NEXT: sltu a7, t0, a7 +; RV32-NEXT: add a7, t2, a7 +; RV32-NEXT: add t0, t1, a6 ; RV32-NEXT: sltu a5, a6, a5 -; RV32-NEXT: mulhu a6, a1, a3 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: add a5, a5, a7 -; RV32-NEXT: mulhu a6, a2, t1 -; RV32-NEXT: add a6, a6, t2 -; RV32-NEXT: mul a7, a3, t1 -; RV32-NEXT: add a6, a6, a7 -; RV32-NEXT: mul a7, t3, a1 -; RV32-NEXT: mulhu t0, t3, a0 -; RV32-NEXT: add a7, t0, a7 -; RV32-NEXT: add a7, a7, t4 -; RV32-NEXT: add a6, a7, a6 -; 
RV32-NEXT: sltu a7, t5, t4 -; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: add t5, t0, t5 +; RV32-NEXT: sltu a6, t0, t1 +; RV32-NEXT: add a5, t4, a5 +; RV32-NEXT: sltu t0, t5, t0 ; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: add a5, a5, s0 -; RV32-NEXT: srai a4, a4, 31 +; RV32-NEXT: add a5, a5, a7 +; RV32-NEXT: add a5, a5, t0 ; RV32-NEXT: xor a5, a5, a4 -; RV32-NEXT: xor a4, t6, a4 +; RV32-NEXT: xor a4, t5, a4 ; RV32-NEXT: or a4, a4, a5 ; RV32-NEXT: bnez a4, .LBB46_2 ; RV32-NEXT: # %bb.1: # %entry @@ -3411,46 +3396,46 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: .cfi_offset s0, -4 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 +; RV32ZBA-NEXT: mulhu a6, a1, a2 +; RV32ZBA-NEXT: mul a7, a0, a3 +; RV32ZBA-NEXT: mulhu t0, a0, a3 +; RV32ZBA-NEXT: mul t1, a1, a3 +; RV32ZBA-NEXT: srai t2, a1, 31 +; RV32ZBA-NEXT: srai t3, a3, 31 +; RV32ZBA-NEXT: mulhu t4, a1, a3 ; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: mul t5, a2, t2 +; RV32ZBA-NEXT: mul t6, t3, a0 +; RV32ZBA-NEXT: mul s0, t3, a1 +; RV32ZBA-NEXT: mulhu t3, t3, a0 +; RV32ZBA-NEXT: add t3, t3, s0 +; RV32ZBA-NEXT: mulhu s0, a2, t2 +; RV32ZBA-NEXT: mul t2, a3, t2 ; RV32ZBA-NEXT: sltu a5, a4, a5 -; RV32ZBA-NEXT: mulhu a6, a1, a2 +; RV32ZBA-NEXT: add a4, a7, a4 +; RV32ZBA-NEXT: add s0, s0, t5 +; RV32ZBA-NEXT: add t5, t6, t5 ; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: mul a6, a0, a3 -; RV32ZBA-NEXT: add a4, a6, a4 -; RV32ZBA-NEXT: sltu a6, a4, a6 -; RV32ZBA-NEXT: mulhu a7, a0, a3 -; RV32ZBA-NEXT: add a6, a7, a6 +; RV32ZBA-NEXT: sltu a6, a4, a7 +; RV32ZBA-NEXT: add t2, s0, t2 +; RV32ZBA-NEXT: add t3, t3, t6 +; RV32ZBA-NEXT: sltu a7, t5, t6 +; RV32ZBA-NEXT: srai a4, a4, 31 +; RV32ZBA-NEXT: add a6, t0, a6 +; RV32ZBA-NEXT: add t2, t3, t2 ; RV32ZBA-NEXT: add a6, a5, a6 -; RV32ZBA-NEXT: mul a7, a1, a3 -; RV32ZBA-NEXT: add t0, a7, a6 -; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: mul t2, a2, t1 -; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: mul t4, t3, a0 -; RV32ZBA-NEXT: add t5, t4, t2 -; RV32ZBA-NEXT: add t6, t0, t5 -; RV32ZBA-NEXT: sltu s0, t6, t0 -; RV32ZBA-NEXT: sltu a7, t0, a7 +; RV32ZBA-NEXT: add a7, t2, a7 +; RV32ZBA-NEXT: add t0, t1, a6 ; RV32ZBA-NEXT: sltu a5, a6, a5 -; RV32ZBA-NEXT: mulhu a6, a1, a3 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: add a5, a5, a7 -; RV32ZBA-NEXT: mulhu a6, a2, t1 -; RV32ZBA-NEXT: add a6, a6, t2 -; RV32ZBA-NEXT: mul a7, a3, t1 -; RV32ZBA-NEXT: add a6, a6, a7 -; RV32ZBA-NEXT: mul a7, t3, a1 -; RV32ZBA-NEXT: mulhu t0, t3, a0 -; RV32ZBA-NEXT: add a7, t0, a7 -; RV32ZBA-NEXT: add a7, a7, t4 -; RV32ZBA-NEXT: add a6, a7, a6 -; RV32ZBA-NEXT: sltu a7, t5, t4 -; RV32ZBA-NEXT: add a6, a6, a7 +; RV32ZBA-NEXT: add t5, t0, t5 +; RV32ZBA-NEXT: sltu a6, t0, t1 +; RV32ZBA-NEXT: add a5, t4, a5 +; RV32ZBA-NEXT: sltu t0, t5, t0 ; RV32ZBA-NEXT: add a5, a5, a6 -; RV32ZBA-NEXT: add a5, a5, s0 -; RV32ZBA-NEXT: srai a4, a4, 31 +; RV32ZBA-NEXT: add a5, a5, a7 +; RV32ZBA-NEXT: add a5, a5, t0 ; RV32ZBA-NEXT: xor a5, a5, a4 -; RV32ZBA-NEXT: xor a4, t6, a4 +; RV32ZBA-NEXT: xor a4, t5, a4 ; RV32ZBA-NEXT: or a4, a4, a5 ; RV32ZBA-NEXT: bnez a4, .LBB46_2 ; RV32ZBA-NEXT: # %bb.1: # %entry @@ -3482,53 +3467,53 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: .cfi_offset s0, -4 ; RV32ZICOND-NEXT: mulhu a4, a0, a2 ; RV32ZICOND-NEXT: mul a5, a1, a2 +; RV32ZICOND-NEXT: mulhu a6, a1, a2 +; RV32ZICOND-NEXT: mul a7, a0, a3 +; RV32ZICOND-NEXT: mulhu t0, a0, a3 +; RV32ZICOND-NEXT: mul t1, a1, a3 +; RV32ZICOND-NEXT: srai t2, a1, 31 +; RV32ZICOND-NEXT: srai t3, a3, 31 +; RV32ZICOND-NEXT: mulhu 
t4, a1, a3 ; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: mul t5, a2, t2 +; RV32ZICOND-NEXT: mul t6, t3, a0 +; RV32ZICOND-NEXT: mul s0, t3, a1 +; RV32ZICOND-NEXT: mulhu t3, t3, a0 +; RV32ZICOND-NEXT: add t3, t3, s0 +; RV32ZICOND-NEXT: mulhu s0, a2, t2 +; RV32ZICOND-NEXT: mul t2, a3, t2 ; RV32ZICOND-NEXT: sltu a5, a4, a5 -; RV32ZICOND-NEXT: mulhu a6, a1, a2 +; RV32ZICOND-NEXT: add a4, a7, a4 +; RV32ZICOND-NEXT: add s0, s0, t5 +; RV32ZICOND-NEXT: add t5, t6, t5 ; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: mul a6, a0, a3 -; RV32ZICOND-NEXT: add a4, a6, a4 -; RV32ZICOND-NEXT: sltu a6, a4, a6 -; RV32ZICOND-NEXT: mulhu a7, a0, a3 -; RV32ZICOND-NEXT: add a6, a7, a6 +; RV32ZICOND-NEXT: sltu a6, a4, a7 +; RV32ZICOND-NEXT: add t2, s0, t2 +; RV32ZICOND-NEXT: add t3, t3, t6 +; RV32ZICOND-NEXT: sltu a7, t5, t6 +; RV32ZICOND-NEXT: srai a4, a4, 31 +; RV32ZICOND-NEXT: add a6, t0, a6 +; RV32ZICOND-NEXT: add t2, t3, t2 ; RV32ZICOND-NEXT: add a6, a5, a6 -; RV32ZICOND-NEXT: mul a7, a1, a3 -; RV32ZICOND-NEXT: add t0, a7, a6 -; RV32ZICOND-NEXT: srai t1, a1, 31 -; RV32ZICOND-NEXT: mul t2, a2, t1 -; RV32ZICOND-NEXT: srai t3, a3, 31 -; RV32ZICOND-NEXT: mul t4, t3, a0 -; RV32ZICOND-NEXT: add t5, t4, t2 -; RV32ZICOND-NEXT: add t6, t0, t5 -; RV32ZICOND-NEXT: sltu s0, t6, t0 -; RV32ZICOND-NEXT: sltu a7, t0, a7 +; RV32ZICOND-NEXT: add a7, t2, a7 +; RV32ZICOND-NEXT: add t0, t1, a6 ; RV32ZICOND-NEXT: sltu a5, a6, a5 -; RV32ZICOND-NEXT: mulhu a6, a1, a3 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: add a5, a5, a7 -; RV32ZICOND-NEXT: mulhu a6, a2, t1 -; RV32ZICOND-NEXT: add a6, a6, t2 -; RV32ZICOND-NEXT: mul a7, a3, t1 -; RV32ZICOND-NEXT: add a6, a6, a7 -; RV32ZICOND-NEXT: mul a7, t3, a1 -; RV32ZICOND-NEXT: mulhu t0, t3, a0 -; RV32ZICOND-NEXT: add a7, t0, a7 -; RV32ZICOND-NEXT: add a7, a7, t4 -; RV32ZICOND-NEXT: add a6, a7, a6 -; RV32ZICOND-NEXT: sltu a7, t5, t4 -; RV32ZICOND-NEXT: add a6, a6, a7 +; RV32ZICOND-NEXT: add t5, t0, t5 +; RV32ZICOND-NEXT: sltu a6, t0, t1 +; RV32ZICOND-NEXT: add a5, t4, a5 +; RV32ZICOND-NEXT: sltu t0, t5, t0 ; RV32ZICOND-NEXT: add a5, a5, a6 -; RV32ZICOND-NEXT: add a5, a5, s0 -; RV32ZICOND-NEXT: srai a4, a4, 31 -; RV32ZICOND-NEXT: xor a5, a5, a4 -; RV32ZICOND-NEXT: xor a4, t6, a4 -; RV32ZICOND-NEXT: or a4, a4, a5 +; RV32ZICOND-NEXT: xor a6, t5, a4 +; RV32ZICOND-NEXT: add a5, a5, a7 +; RV32ZICOND-NEXT: add a5, a5, t0 +; RV32ZICOND-NEXT: xor a4, a5, a4 +; RV32ZICOND-NEXT: or a4, a6, a4 ; RV32ZICOND-NEXT: czero.nez a2, a2, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 -; RV32ZICOND-NEXT: or a0, a0, a2 -; RV32ZICOND-NEXT: czero.nez a2, a3, a4 +; RV32ZICOND-NEXT: czero.nez a3, a3, a4 ; RV32ZICOND-NEXT: czero.eqz a1, a1, a4 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a2 +; RV32ZICOND-NEXT: or a1, a1, a3 ; RV32ZICOND-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZICOND-NEXT: .cfi_restore s0 ; RV32ZICOND-NEXT: addi sp, sp, 16 @@ -3555,58 +3540,50 @@ entry: define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: smulo.not.i64: ; RV32: # %bb.0: # %entry -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: sltu a5, a4, a5 ; RV32-NEXT: mulhu a6, a1, a2 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: mul a6, a0, a3 -; RV32-NEXT: add a4, a6, a4 -; RV32-NEXT: sltu a6, a4, a6 -; RV32-NEXT: mulhu a7, a0, a3 -; RV32-NEXT: add a6, a7, a6 -; RV32-NEXT: add a6, a5, a6 -; 
RV32-NEXT: mul a7, a1, a3 -; RV32-NEXT: add t0, a7, a6 -; RV32-NEXT: srai t1, a1, 31 -; RV32-NEXT: mul t2, a2, t1 +; RV32-NEXT: mul a7, a0, a3 +; RV32-NEXT: mulhu t0, a0, a3 +; RV32-NEXT: mul t1, a1, a3 +; RV32-NEXT: srai t2, a1, 31 ; RV32-NEXT: srai t3, a3, 31 -; RV32-NEXT: mul t4, t3, a0 -; RV32-NEXT: add t5, t4, t2 -; RV32-NEXT: add t6, t0, t5 -; RV32-NEXT: sltu s0, t6, t0 -; RV32-NEXT: sltu a7, t0, a7 -; RV32-NEXT: sltu a5, a6, a5 -; RV32-NEXT: mulhu a6, a1, a3 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: add a5, a5, a7 -; RV32-NEXT: mulhu a2, a2, t1 -; RV32-NEXT: add a2, a2, t2 -; RV32-NEXT: mul a3, a3, t1 -; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: mulhu t4, a1, a3 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: mul t5, a2, t2 +; RV32-NEXT: mul t6, t3, a0 +; RV32-NEXT: mulhu a2, a2, t2 +; RV32-NEXT: mul a3, a3, t2 ; RV32-NEXT: mul a1, t3, a1 ; RV32-NEXT: mulhu a0, t3, a0 +; RV32-NEXT: sltu a5, a4, a5 +; RV32-NEXT: add a4, a7, a4 +; RV32-NEXT: add t2, t6, t5 +; RV32-NEXT: add a2, a2, t5 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a0, t4 -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: sltu a1, t5, t4 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: add a0, a0, s0 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: sltu a1, a4, a7 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add a0, a0, t6 +; RV32-NEXT: sltu a3, t2, t6 ; RV32-NEXT: srai a4, a4, 31 +; RV32-NEXT: add a1, t0, a1 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add a1, a5, a1 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: add a2, t1, a1 +; RV32-NEXT: sltu a1, a1, a5 +; RV32-NEXT: add t2, a2, t2 +; RV32-NEXT: sltu a3, a2, t1 +; RV32-NEXT: add a1, t4, a1 +; RV32-NEXT: sltu a2, t2, a2 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: xor a0, a0, a4 -; RV32-NEXT: xor a1, t6, a4 +; RV32-NEXT: xor a1, t2, a4 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: smulo.not.i64: @@ -3620,58 +3597,50 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; ; RV32ZBA-LABEL: smulo.not.i64: ; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: addi sp, sp, -16 -; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 -; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBA-NEXT: .cfi_offset s0, -4 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 -; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: sltu a5, a4, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a2 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: mul a6, a0, a3 -; RV32ZBA-NEXT: add a4, a6, a4 -; RV32ZBA-NEXT: sltu a6, a4, a6 -; RV32ZBA-NEXT: mulhu a7, a0, a3 -; RV32ZBA-NEXT: add a6, a7, a6 -; RV32ZBA-NEXT: add a6, a5, a6 -; RV32ZBA-NEXT: mul a7, a1, a3 -; RV32ZBA-NEXT: add t0, a7, a6 -; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: mul t2, a2, t1 +; RV32ZBA-NEXT: mul a7, a0, a3 +; RV32ZBA-NEXT: mulhu t0, a0, a3 +; RV32ZBA-NEXT: mul t1, a1, a3 +; RV32ZBA-NEXT: srai t2, a1, 31 ; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: mul t4, t3, a0 -; RV32ZBA-NEXT: add t5, t4, t2 -; RV32ZBA-NEXT: add t6, t0, t5 -; RV32ZBA-NEXT: sltu s0, t6, t0 -; RV32ZBA-NEXT: sltu a7, t0, a7 -; RV32ZBA-NEXT: sltu a5, a6, a5 -; RV32ZBA-NEXT: mulhu a6, a1, a3 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: add a5, a5, a7 -; RV32ZBA-NEXT: mulhu a2, a2, t1 -; RV32ZBA-NEXT: add a2, a2, t2 -; RV32ZBA-NEXT: mul a3, a3, t1 -; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: mulhu t4, a1, a3 +; RV32ZBA-NEXT: add a4, a5, 
a4 +; RV32ZBA-NEXT: mul t5, a2, t2 +; RV32ZBA-NEXT: mul t6, t3, a0 +; RV32ZBA-NEXT: mulhu a2, a2, t2 +; RV32ZBA-NEXT: mul a3, a3, t2 ; RV32ZBA-NEXT: mul a1, t3, a1 ; RV32ZBA-NEXT: mulhu a0, t3, a0 +; RV32ZBA-NEXT: sltu a5, a4, a5 +; RV32ZBA-NEXT: add a4, a7, a4 +; RV32ZBA-NEXT: add t2, t6, t5 +; RV32ZBA-NEXT: add a2, a2, t5 ; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: add a0, a0, t4 -; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: sltu a1, t5, t4 -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: add a0, a5, a0 -; RV32ZBA-NEXT: add a0, a0, s0 +; RV32ZBA-NEXT: add a5, a6, a5 +; RV32ZBA-NEXT: sltu a1, a4, a7 +; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: add a0, a0, t6 +; RV32ZBA-NEXT: sltu a3, t2, t6 ; RV32ZBA-NEXT: srai a4, a4, 31 +; RV32ZBA-NEXT: add a1, t0, a1 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: add a1, a5, a1 +; RV32ZBA-NEXT: add a0, a0, a3 +; RV32ZBA-NEXT: add a2, t1, a1 +; RV32ZBA-NEXT: sltu a1, a1, a5 +; RV32ZBA-NEXT: add t2, a2, t2 +; RV32ZBA-NEXT: sltu a3, a2, t1 +; RV32ZBA-NEXT: add a1, t4, a1 +; RV32ZBA-NEXT: sltu a2, t2, a2 +; RV32ZBA-NEXT: add a1, a1, a3 +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: add a0, a0, a2 ; RV32ZBA-NEXT: xor a0, a0, a4 -; RV32ZBA-NEXT: xor a1, t6, a4 +; RV32ZBA-NEXT: xor a1, t2, a4 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: seqz a0, a0 -; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32ZBA-NEXT: .cfi_restore s0 -; RV32ZBA-NEXT: addi sp, sp, 16 -; RV32ZBA-NEXT: .cfi_def_cfa_offset 0 ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo.not.i64: @@ -3685,58 +3654,50 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; ; RV32ZICOND-LABEL: smulo.not.i64: ; RV32ZICOND: # %bb.0: # %entry -; RV32ZICOND-NEXT: addi sp, sp, -16 -; RV32ZICOND-NEXT: .cfi_def_cfa_offset 16 -; RV32ZICOND-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZICOND-NEXT: .cfi_offset s0, -4 ; RV32ZICOND-NEXT: mulhu a4, a0, a2 ; RV32ZICOND-NEXT: mul a5, a1, a2 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: sltu a5, a4, a5 ; RV32ZICOND-NEXT: mulhu a6, a1, a2 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: mul a6, a0, a3 -; RV32ZICOND-NEXT: add a4, a6, a4 -; RV32ZICOND-NEXT: sltu a6, a4, a6 -; RV32ZICOND-NEXT: mulhu a7, a0, a3 -; RV32ZICOND-NEXT: add a6, a7, a6 -; RV32ZICOND-NEXT: add a6, a5, a6 -; RV32ZICOND-NEXT: mul a7, a1, a3 -; RV32ZICOND-NEXT: add t0, a7, a6 -; RV32ZICOND-NEXT: srai t1, a1, 31 -; RV32ZICOND-NEXT: mul t2, a2, t1 +; RV32ZICOND-NEXT: mul a7, a0, a3 +; RV32ZICOND-NEXT: mulhu t0, a0, a3 +; RV32ZICOND-NEXT: mul t1, a1, a3 +; RV32ZICOND-NEXT: srai t2, a1, 31 ; RV32ZICOND-NEXT: srai t3, a3, 31 -; RV32ZICOND-NEXT: mul t4, t3, a0 -; RV32ZICOND-NEXT: add t5, t4, t2 -; RV32ZICOND-NEXT: add t6, t0, t5 -; RV32ZICOND-NEXT: sltu s0, t6, t0 -; RV32ZICOND-NEXT: sltu a7, t0, a7 -; RV32ZICOND-NEXT: sltu a5, a6, a5 -; RV32ZICOND-NEXT: mulhu a6, a1, a3 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: add a5, a5, a7 -; RV32ZICOND-NEXT: mulhu a2, a2, t1 -; RV32ZICOND-NEXT: add a2, a2, t2 -; RV32ZICOND-NEXT: mul a3, a3, t1 -; RV32ZICOND-NEXT: add a2, a2, a3 +; RV32ZICOND-NEXT: mulhu t4, a1, a3 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: mul t5, a2, t2 +; RV32ZICOND-NEXT: mul t6, t3, a0 +; RV32ZICOND-NEXT: mulhu a2, a2, t2 +; RV32ZICOND-NEXT: mul a3, a3, t2 ; RV32ZICOND-NEXT: mul a1, t3, a1 ; RV32ZICOND-NEXT: mulhu a0, t3, a0 +; RV32ZICOND-NEXT: sltu a5, a4, a5 +; RV32ZICOND-NEXT: add a4, a7, a4 +; RV32ZICOND-NEXT: add t2, t6, t5 +; RV32ZICOND-NEXT: add a2, a2, t5 ; RV32ZICOND-NEXT: add a0, a0, a1 -; RV32ZICOND-NEXT: add a0, a0, t4 -; 
RV32ZICOND-NEXT: add a0, a0, a2 -; RV32ZICOND-NEXT: sltu a1, t5, t4 -; RV32ZICOND-NEXT: add a0, a0, a1 -; RV32ZICOND-NEXT: add a0, a5, a0 -; RV32ZICOND-NEXT: add a0, a0, s0 +; RV32ZICOND-NEXT: add a5, a6, a5 +; RV32ZICOND-NEXT: sltu a1, a4, a7 +; RV32ZICOND-NEXT: add a2, a2, a3 +; RV32ZICOND-NEXT: add a0, a0, t6 +; RV32ZICOND-NEXT: sltu a3, t2, t6 ; RV32ZICOND-NEXT: srai a4, a4, 31 +; RV32ZICOND-NEXT: add a1, t0, a1 +; RV32ZICOND-NEXT: add a0, a0, a2 +; RV32ZICOND-NEXT: add a1, a5, a1 +; RV32ZICOND-NEXT: add a0, a0, a3 +; RV32ZICOND-NEXT: add a2, t1, a1 +; RV32ZICOND-NEXT: sltu a1, a1, a5 +; RV32ZICOND-NEXT: add t2, a2, t2 +; RV32ZICOND-NEXT: sltu a3, a2, t1 +; RV32ZICOND-NEXT: add a1, t4, a1 +; RV32ZICOND-NEXT: sltu a2, t2, a2 +; RV32ZICOND-NEXT: add a1, a1, a3 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: add a0, a0, a2 ; RV32ZICOND-NEXT: xor a0, a0, a4 -; RV32ZICOND-NEXT: xor a1, t6, a4 +; RV32ZICOND-NEXT: xor a1, t2, a4 ; RV32ZICOND-NEXT: or a0, a1, a0 ; RV32ZICOND-NEXT: seqz a0, a0 -; RV32ZICOND-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32ZICOND-NEXT: .cfi_restore s0 -; RV32ZICOND-NEXT: addi sp, sp, 16 -; RV32ZICOND-NEXT: .cfi_def_cfa_offset 0 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: smulo.not.i64: @@ -3879,16 +3840,16 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 +; RV32-NEXT: snez a6, a3 ; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: mulhu a5, a0, a2 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: sltu a4, a4, a5 -; RV32-NEXT: snez a5, a3 -; RV32-NEXT: snez a6, a1 -; RV32-NEXT: and a5, a6, a5 +; RV32-NEXT: snez a5, a1 +; RV32-NEXT: and a5, a5, a6 ; RV32-NEXT: mulhu a6, a1, a2 ; RV32-NEXT: snez a6, a6 ; RV32-NEXT: or a5, a5, a6 +; RV32-NEXT: mulhu a6, a0, a2 +; RV32-NEXT: add a4, a6, a4 +; RV32-NEXT: sltu a4, a4, a6 ; RV32-NEXT: mulhu a6, a3, a0 ; RV32-NEXT: snez a6, a6 ; RV32-NEXT: or a5, a5, a6 @@ -3913,16 +3874,16 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 +; RV32ZBA-NEXT: snez a6, a3 ; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: mulhu a5, a0, a2 -; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: sltu a4, a4, a5 -; RV32ZBA-NEXT: snez a5, a3 -; RV32ZBA-NEXT: snez a6, a1 -; RV32ZBA-NEXT: and a5, a6, a5 +; RV32ZBA-NEXT: snez a5, a1 +; RV32ZBA-NEXT: and a5, a5, a6 ; RV32ZBA-NEXT: mulhu a6, a1, a2 ; RV32ZBA-NEXT: snez a6, a6 ; RV32ZBA-NEXT: or a5, a5, a6 +; RV32ZBA-NEXT: mulhu a6, a0, a2 +; RV32ZBA-NEXT: add a4, a6, a4 +; RV32ZBA-NEXT: sltu a4, a4, a6 ; RV32ZBA-NEXT: mulhu a6, a3, a0 ; RV32ZBA-NEXT: snez a6, a6 ; RV32ZBA-NEXT: or a5, a5, a6 @@ -3947,26 +3908,26 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 +; RV32ZICOND-NEXT: snez a6, a3 ; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: mulhu a5, a0, a2 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: sltu a4, a4, a5 -; RV32ZICOND-NEXT: snez a5, a3 -; RV32ZICOND-NEXT: snez a6, a1 -; RV32ZICOND-NEXT: and a5, a6, a5 +; RV32ZICOND-NEXT: snez a5, a1 +; RV32ZICOND-NEXT: and a5, a5, a6 ; RV32ZICOND-NEXT: mulhu a6, a1, a2 ; RV32ZICOND-NEXT: snez a6, a6 ; RV32ZICOND-NEXT: or a5, a5, a6 +; RV32ZICOND-NEXT: mulhu a6, a0, a2 +; RV32ZICOND-NEXT: add a4, a6, a4 +; RV32ZICOND-NEXT: sltu a4, a4, a6 ; RV32ZICOND-NEXT: mulhu a6, a3, a0 ; RV32ZICOND-NEXT: snez a6, a6 ; RV32ZICOND-NEXT: or a5, a5, a6 ; RV32ZICOND-NEXT: or a4, a5, a4 ; 
RV32ZICOND-NEXT: czero.nez a2, a2, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 -; RV32ZICOND-NEXT: or a0, a0, a2 -; RV32ZICOND-NEXT: czero.nez a2, a3, a4 +; RV32ZICOND-NEXT: czero.nez a3, a3, a4 ; RV32ZICOND-NEXT: czero.eqz a1, a1, a4 -; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: or a0, a0, a2 +; RV32ZICOND-NEXT: or a1, a1, a3 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: umulo.select.i64: @@ -3988,20 +3949,20 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: mulhu a5, a0, a2 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: sltu a4, a4, a5 -; RV32-NEXT: snez a5, a3 -; RV32-NEXT: snez a6, a1 -; RV32-NEXT: and a5, a6, a5 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: snez a1, a1 -; RV32-NEXT: or a1, a5, a1 +; RV32-NEXT: mulhu a6, a0, a2 ; RV32-NEXT: mulhu a0, a3, a0 +; RV32-NEXT: snez a3, a3 +; RV32-NEXT: mulhu a2, a1, a2 +; RV32-NEXT: snez a1, a1 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: snez a2, a2 ; RV32-NEXT: snez a0, a0 +; RV32-NEXT: add a4, a6, a4 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: sltu a2, a4, a6 ; RV32-NEXT: or a0, a1, a0 -; RV32-NEXT: or a0, a0, a4 +; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: ret ; @@ -4015,20 +3976,20 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 -; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: mulhu a5, a0, a2 -; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: sltu a4, a4, a5 -; RV32ZBA-NEXT: snez a5, a3 -; RV32ZBA-NEXT: snez a6, a1 -; RV32ZBA-NEXT: and a5, a6, a5 -; RV32ZBA-NEXT: mulhu a1, a1, a2 -; RV32ZBA-NEXT: snez a1, a1 -; RV32ZBA-NEXT: or a1, a5, a1 +; RV32ZBA-NEXT: mulhu a6, a0, a2 ; RV32ZBA-NEXT: mulhu a0, a3, a0 +; RV32ZBA-NEXT: snez a3, a3 +; RV32ZBA-NEXT: mulhu a2, a1, a2 +; RV32ZBA-NEXT: snez a1, a1 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: and a1, a1, a3 +; RV32ZBA-NEXT: snez a2, a2 ; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: add a4, a6, a4 +; RV32ZBA-NEXT: or a1, a1, a2 +; RV32ZBA-NEXT: sltu a2, a4, a6 ; RV32ZBA-NEXT: or a0, a1, a0 -; RV32ZBA-NEXT: or a0, a0, a4 +; RV32ZBA-NEXT: or a0, a0, a2 ; RV32ZBA-NEXT: xori a0, a0, 1 ; RV32ZBA-NEXT: ret ; @@ -4042,20 +4003,20 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: mulhu a5, a0, a2 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: sltu a4, a4, a5 -; RV32ZICOND-NEXT: snez a5, a3 -; RV32ZICOND-NEXT: snez a6, a1 -; RV32ZICOND-NEXT: and a5, a6, a5 -; RV32ZICOND-NEXT: mulhu a1, a1, a2 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: or a1, a5, a1 +; RV32ZICOND-NEXT: mulhu a6, a0, a2 ; RV32ZICOND-NEXT: mulhu a0, a3, a0 +; RV32ZICOND-NEXT: snez a3, a3 +; RV32ZICOND-NEXT: mulhu a2, a1, a2 +; RV32ZICOND-NEXT: snez a1, a1 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: and a1, a1, a3 +; RV32ZICOND-NEXT: snez a2, a2 ; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: add a4, a6, a4 +; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: sltu a2, a4, a6 ; RV32ZICOND-NEXT: or a0, a1, a0 -; RV32ZICOND-NEXT: or a0, a0, a4 +; RV32ZICOND-NEXT: or a0, a0, a2 ; RV32ZICOND-NEXT: xori a0, a0, 1 ; RV32ZICOND-NEXT: ret ; @@ -4168,11 +4129,11 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { ; RV32: # %bb.0: # %entry ; RV32-NEXT: add a4, a1, a3 ; RV32-NEXT: add a2, a0, a2 +; 
RV32-NEXT: xor a3, a1, a3 ; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: add a0, a4, a0 ; RV32-NEXT: xor a0, a1, a0 -; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: not a1, a1 +; RV32-NEXT: not a1, a3 ; RV32-NEXT: and a0, a1, a0 ; RV32-NEXT: bgez a0, .LBB53_2 ; RV32-NEXT: # %bb.1: # %overflow @@ -4199,11 +4160,11 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: add a4, a1, a3 ; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: xor a3, a1, a3 ; RV32ZBA-NEXT: sltu a0, a2, a0 ; RV32ZBA-NEXT: add a0, a4, a0 ; RV32ZBA-NEXT: xor a0, a1, a0 -; RV32ZBA-NEXT: xor a1, a1, a3 -; RV32ZBA-NEXT: not a1, a1 +; RV32ZBA-NEXT: not a1, a3 ; RV32ZBA-NEXT: and a0, a1, a0 ; RV32ZBA-NEXT: bgez a0, .LBB53_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow @@ -4230,11 +4191,11 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: add a4, a1, a3 ; RV32ZICOND-NEXT: add a2, a0, a2 +; RV32ZICOND-NEXT: xor a3, a1, a3 ; RV32ZICOND-NEXT: sltu a0, a2, a0 ; RV32ZICOND-NEXT: add a0, a4, a0 ; RV32ZICOND-NEXT: xor a0, a1, a0 -; RV32ZICOND-NEXT: xor a1, a1, a3 -; RV32ZICOND-NEXT: not a1, a1 +; RV32ZICOND-NEXT: not a1, a3 ; RV32ZICOND-NEXT: and a0, a1, a0 ; RV32ZICOND-NEXT: bgez a0, .LBB53_2 ; RV32ZICOND-NEXT: # %bb.1: # %overflow @@ -4786,13 +4747,13 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: sltu a4, a0, a2 ; RV32ZICOND-NEXT: sub a3, a1, a3 -; RV32ZICOND-NEXT: sub a3, a3, a4 -; RV32ZICOND-NEXT: xor a4, a3, a1 -; RV32ZICOND-NEXT: sltu a1, a1, a3 -; RV32ZICOND-NEXT: czero.eqz a1, a1, a4 ; RV32ZICOND-NEXT: sub a2, a0, a2 +; RV32ZICOND-NEXT: sub a3, a3, a4 ; RV32ZICOND-NEXT: sltu a0, a0, a2 -; RV32ZICOND-NEXT: czero.nez a0, a0, a4 +; RV32ZICOND-NEXT: xor a2, a3, a1 +; RV32ZICOND-NEXT: sltu a1, a1, a3 +; RV32ZICOND-NEXT: czero.eqz a1, a1, a2 +; RV32ZICOND-NEXT: czero.nez a0, a0, a2 ; RV32ZICOND-NEXT: or a0, a0, a1 ; RV32ZICOND-NEXT: beqz a0, .LBB59_2 ; RV32ZICOND-NEXT: # %bb.1: # %overflow @@ -4916,64 +4877,55 @@ continue: define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: smulo.br.i64: ; RV32: # %bb.0: # %entry -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: sltu a5, a4, a5 ; RV32-NEXT: mulhu a6, a1, a2 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: mul a6, a0, a3 -; RV32-NEXT: add a4, a6, a4 -; RV32-NEXT: sltu a6, a4, a6 -; RV32-NEXT: mulhu a7, a0, a3 -; RV32-NEXT: add a6, a7, a6 -; RV32-NEXT: add a6, a5, a6 -; RV32-NEXT: mul a7, a1, a3 -; RV32-NEXT: add t0, a7, a6 -; RV32-NEXT: srai t1, a1, 31 -; RV32-NEXT: mul t2, a2, t1 +; RV32-NEXT: mul a7, a0, a3 +; RV32-NEXT: mulhu t0, a0, a3 +; RV32-NEXT: mul t1, a1, a3 +; RV32-NEXT: srai t2, a1, 31 ; RV32-NEXT: srai t3, a3, 31 -; RV32-NEXT: mul t4, t3, a0 -; RV32-NEXT: add t5, t4, t2 -; RV32-NEXT: add t6, t0, t5 -; RV32-NEXT: sltu s0, t6, t0 -; RV32-NEXT: sltu a7, t0, a7 -; RV32-NEXT: sltu a5, a6, a5 -; RV32-NEXT: mulhu a6, a1, a3 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: add a5, a5, a7 -; RV32-NEXT: mulhu a2, a2, t1 -; RV32-NEXT: add a2, a2, t2 -; RV32-NEXT: mul a3, a3, t1 -; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: mulhu t4, a1, a3 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: mul t5, a2, t2 +; RV32-NEXT: mul t6, t3, a0 +; RV32-NEXT: mulhu a2, a2, t2 +; RV32-NEXT: mul a3, a3, t2 ; RV32-NEXT: mul a1, t3, a1 ; RV32-NEXT: mulhu a0, t3, a0 +; 
RV32-NEXT: sltu a5, a4, a5 +; RV32-NEXT: add a4, a7, a4 +; RV32-NEXT: add t2, t6, t5 +; RV32-NEXT: add a2, a2, t5 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a0, t4 -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: sltu a1, t5, t4 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: add a0, a0, s0 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: sltu a1, a4, a7 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add a0, a0, t6 +; RV32-NEXT: sltu a3, t2, t6 ; RV32-NEXT: srai a4, a4, 31 +; RV32-NEXT: add a1, t0, a1 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add a1, a5, a1 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: add a2, t1, a1 +; RV32-NEXT: sltu a1, a1, a5 +; RV32-NEXT: add t2, a2, t2 +; RV32-NEXT: sltu a3, a2, t1 +; RV32-NEXT: add a1, t4, a1 +; RV32-NEXT: sltu a2, t2, a2 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: xor a0, a0, a4 -; RV32-NEXT: xor a1, t6, a4 +; RV32-NEXT: xor a1, t2, a4 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: beqz a0, .LBB61_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: li a0, 0 -; RV32-NEXT: j .LBB61_3 +; RV32-NEXT: ret ; RV32-NEXT: .LBB61_2: # %continue ; RV32-NEXT: li a0, 1 -; RV32-NEXT: .LBB61_3: # %overflow -; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: smulo.br.i64: @@ -4991,64 +4943,55 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; ; RV32ZBA-LABEL: smulo.br.i64: ; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: addi sp, sp, -16 -; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 -; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBA-NEXT: .cfi_offset s0, -4 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 -; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: sltu a5, a4, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a2 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: mul a6, a0, a3 -; RV32ZBA-NEXT: add a4, a6, a4 -; RV32ZBA-NEXT: sltu a6, a4, a6 -; RV32ZBA-NEXT: mulhu a7, a0, a3 -; RV32ZBA-NEXT: add a6, a7, a6 -; RV32ZBA-NEXT: add a6, a5, a6 -; RV32ZBA-NEXT: mul a7, a1, a3 -; RV32ZBA-NEXT: add t0, a7, a6 -; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: mul t2, a2, t1 +; RV32ZBA-NEXT: mul a7, a0, a3 +; RV32ZBA-NEXT: mulhu t0, a0, a3 +; RV32ZBA-NEXT: mul t1, a1, a3 +; RV32ZBA-NEXT: srai t2, a1, 31 ; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: mul t4, t3, a0 -; RV32ZBA-NEXT: add t5, t4, t2 -; RV32ZBA-NEXT: add t6, t0, t5 -; RV32ZBA-NEXT: sltu s0, t6, t0 -; RV32ZBA-NEXT: sltu a7, t0, a7 -; RV32ZBA-NEXT: sltu a5, a6, a5 -; RV32ZBA-NEXT: mulhu a6, a1, a3 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: add a5, a5, a7 -; RV32ZBA-NEXT: mulhu a2, a2, t1 -; RV32ZBA-NEXT: add a2, a2, t2 -; RV32ZBA-NEXT: mul a3, a3, t1 -; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: mulhu t4, a1, a3 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: mul t5, a2, t2 +; RV32ZBA-NEXT: mul t6, t3, a0 +; RV32ZBA-NEXT: mulhu a2, a2, t2 +; RV32ZBA-NEXT: mul a3, a3, t2 ; RV32ZBA-NEXT: mul a1, t3, a1 ; RV32ZBA-NEXT: mulhu a0, t3, a0 +; RV32ZBA-NEXT: sltu a5, a4, a5 +; RV32ZBA-NEXT: add a4, a7, a4 +; RV32ZBA-NEXT: add t2, t6, t5 +; RV32ZBA-NEXT: add a2, a2, t5 ; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: add a0, a0, t4 -; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: sltu a1, t5, t4 -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: add a0, a5, a0 -; RV32ZBA-NEXT: add a0, a0, s0 +; RV32ZBA-NEXT: add a5, a6, a5 +; RV32ZBA-NEXT: sltu a1, a4, a7 +; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: add a0, a0, t6 +; 
RV32ZBA-NEXT: sltu a3, t2, t6 ; RV32ZBA-NEXT: srai a4, a4, 31 +; RV32ZBA-NEXT: add a1, t0, a1 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: add a1, a5, a1 +; RV32ZBA-NEXT: add a0, a0, a3 +; RV32ZBA-NEXT: add a2, t1, a1 +; RV32ZBA-NEXT: sltu a1, a1, a5 +; RV32ZBA-NEXT: add t2, a2, t2 +; RV32ZBA-NEXT: sltu a3, a2, t1 +; RV32ZBA-NEXT: add a1, t4, a1 +; RV32ZBA-NEXT: sltu a2, t2, a2 +; RV32ZBA-NEXT: add a1, a1, a3 +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: add a0, a0, a2 ; RV32ZBA-NEXT: xor a0, a0, a4 -; RV32ZBA-NEXT: xor a1, t6, a4 +; RV32ZBA-NEXT: xor a1, t2, a4 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: beqz a0, .LBB61_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: li a0, 0 -; RV32ZBA-NEXT: j .LBB61_3 +; RV32ZBA-NEXT: ret ; RV32ZBA-NEXT: .LBB61_2: # %continue ; RV32ZBA-NEXT: li a0, 1 -; RV32ZBA-NEXT: .LBB61_3: # %overflow -; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32ZBA-NEXT: .cfi_restore s0 -; RV32ZBA-NEXT: addi sp, sp, 16 -; RV32ZBA-NEXT: .cfi_def_cfa_offset 0 ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo.br.i64: @@ -5066,64 +5009,55 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; ; RV32ZICOND-LABEL: smulo.br.i64: ; RV32ZICOND: # %bb.0: # %entry -; RV32ZICOND-NEXT: addi sp, sp, -16 -; RV32ZICOND-NEXT: .cfi_def_cfa_offset 16 -; RV32ZICOND-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZICOND-NEXT: .cfi_offset s0, -4 ; RV32ZICOND-NEXT: mulhu a4, a0, a2 ; RV32ZICOND-NEXT: mul a5, a1, a2 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: sltu a5, a4, a5 ; RV32ZICOND-NEXT: mulhu a6, a1, a2 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: mul a6, a0, a3 -; RV32ZICOND-NEXT: add a4, a6, a4 -; RV32ZICOND-NEXT: sltu a6, a4, a6 -; RV32ZICOND-NEXT: mulhu a7, a0, a3 -; RV32ZICOND-NEXT: add a6, a7, a6 -; RV32ZICOND-NEXT: add a6, a5, a6 -; RV32ZICOND-NEXT: mul a7, a1, a3 -; RV32ZICOND-NEXT: add t0, a7, a6 -; RV32ZICOND-NEXT: srai t1, a1, 31 -; RV32ZICOND-NEXT: mul t2, a2, t1 +; RV32ZICOND-NEXT: mul a7, a0, a3 +; RV32ZICOND-NEXT: mulhu t0, a0, a3 +; RV32ZICOND-NEXT: mul t1, a1, a3 +; RV32ZICOND-NEXT: srai t2, a1, 31 ; RV32ZICOND-NEXT: srai t3, a3, 31 -; RV32ZICOND-NEXT: mul t4, t3, a0 -; RV32ZICOND-NEXT: add t5, t4, t2 -; RV32ZICOND-NEXT: add t6, t0, t5 -; RV32ZICOND-NEXT: sltu s0, t6, t0 -; RV32ZICOND-NEXT: sltu a7, t0, a7 -; RV32ZICOND-NEXT: sltu a5, a6, a5 -; RV32ZICOND-NEXT: mulhu a6, a1, a3 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: add a5, a5, a7 -; RV32ZICOND-NEXT: mulhu a2, a2, t1 -; RV32ZICOND-NEXT: add a2, a2, t2 -; RV32ZICOND-NEXT: mul a3, a3, t1 -; RV32ZICOND-NEXT: add a2, a2, a3 +; RV32ZICOND-NEXT: mulhu t4, a1, a3 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: mul t5, a2, t2 +; RV32ZICOND-NEXT: mul t6, t3, a0 +; RV32ZICOND-NEXT: mulhu a2, a2, t2 +; RV32ZICOND-NEXT: mul a3, a3, t2 ; RV32ZICOND-NEXT: mul a1, t3, a1 ; RV32ZICOND-NEXT: mulhu a0, t3, a0 +; RV32ZICOND-NEXT: sltu a5, a4, a5 +; RV32ZICOND-NEXT: add a4, a7, a4 +; RV32ZICOND-NEXT: add t2, t6, t5 +; RV32ZICOND-NEXT: add a2, a2, t5 ; RV32ZICOND-NEXT: add a0, a0, a1 -; RV32ZICOND-NEXT: add a0, a0, t4 -; RV32ZICOND-NEXT: add a0, a0, a2 -; RV32ZICOND-NEXT: sltu a1, t5, t4 -; RV32ZICOND-NEXT: add a0, a0, a1 -; RV32ZICOND-NEXT: add a0, a5, a0 -; RV32ZICOND-NEXT: add a0, a0, s0 +; RV32ZICOND-NEXT: add a5, a6, a5 +; RV32ZICOND-NEXT: sltu a1, a4, a7 +; RV32ZICOND-NEXT: add a2, a2, a3 +; RV32ZICOND-NEXT: add a0, a0, t6 +; RV32ZICOND-NEXT: sltu a3, t2, t6 ; RV32ZICOND-NEXT: srai a4, a4, 31 +; RV32ZICOND-NEXT: add a1, t0, a1 +; RV32ZICOND-NEXT: add a0, a0, a2 +; 
RV32ZICOND-NEXT: add a1, a5, a1 +; RV32ZICOND-NEXT: add a0, a0, a3 +; RV32ZICOND-NEXT: add a2, t1, a1 +; RV32ZICOND-NEXT: sltu a1, a1, a5 +; RV32ZICOND-NEXT: add t2, a2, t2 +; RV32ZICOND-NEXT: sltu a3, a2, t1 +; RV32ZICOND-NEXT: add a1, t4, a1 +; RV32ZICOND-NEXT: sltu a2, t2, a2 +; RV32ZICOND-NEXT: add a1, a1, a3 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: add a0, a0, a2 ; RV32ZICOND-NEXT: xor a0, a0, a4 -; RV32ZICOND-NEXT: xor a1, t6, a4 +; RV32ZICOND-NEXT: xor a1, t2, a4 ; RV32ZICOND-NEXT: or a0, a1, a0 ; RV32ZICOND-NEXT: beqz a0, .LBB61_2 ; RV32ZICOND-NEXT: # %bb.1: # %overflow ; RV32ZICOND-NEXT: li a0, 0 -; RV32ZICOND-NEXT: j .LBB61_3 +; RV32ZICOND-NEXT: ret ; RV32ZICOND-NEXT: .LBB61_2: # %continue ; RV32ZICOND-NEXT: li a0, 1 -; RV32ZICOND-NEXT: .LBB61_3: # %overflow -; RV32ZICOND-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32ZICOND-NEXT: .cfi_restore s0 -; RV32ZICOND-NEXT: addi sp, sp, 16 -; RV32ZICOND-NEXT: .cfi_def_cfa_offset 0 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: smulo.br.i64: @@ -5155,43 +5089,43 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV32-LABEL: smulo2.br.i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: li a2, -13 -; RV32-NEXT: mulhu a3, a0, a2 -; RV32-NEXT: mul a4, a1, a2 -; RV32-NEXT: add a3, a4, a3 -; RV32-NEXT: sltu a4, a3, a4 -; RV32-NEXT: mulhu a5, a1, a2 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: sub a3, a3, a0 -; RV32-NEXT: neg a5, a0 -; RV32-NEXT: sltu a6, a3, a5 -; RV32-NEXT: li a7, -1 -; RV32-NEXT: mulhu t0, a0, a7 -; RV32-NEXT: add a6, t0, a6 -; RV32-NEXT: add a6, a4, a6 -; RV32-NEXT: sub t1, a6, a1 -; RV32-NEXT: srai t2, a1, 31 -; RV32-NEXT: mul t3, t2, a2 -; RV32-NEXT: sub t3, t3, a0 -; RV32-NEXT: add t4, t1, t3 -; RV32-NEXT: sltu t5, t4, t1 -; RV32-NEXT: neg t6, a1 -; RV32-NEXT: sltu t1, t1, t6 -; RV32-NEXT: sltu a4, a6, a4 -; RV32-NEXT: mulhu a6, a1, a7 -; RV32-NEXT: add a4, a6, a4 -; RV32-NEXT: add a4, a4, t1 -; RV32-NEXT: sltu a5, t3, a5 -; RV32-NEXT: mulh a2, t2, a2 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: neg a3, a0 +; RV32-NEXT: li a4, -1 +; RV32-NEXT: srai a5, a1, 31 +; RV32-NEXT: neg a6, a1 +; RV32-NEXT: add a7, a0, a1 +; RV32-NEXT: mulhu t0, a0, a2 +; RV32-NEXT: mul t1, a1, a2 +; RV32-NEXT: mulhu t2, a1, a2 +; RV32-NEXT: mulhu t3, a0, a4 +; RV32-NEXT: mul t4, a5, a2 +; RV32-NEXT: mulhu a4, a1, a4 +; RV32-NEXT: mulh a2, a5, a2 +; RV32-NEXT: add t0, t1, t0 +; RV32-NEXT: sub a5, t4, a0 +; RV32-NEXT: sub a7, t3, a7 +; RV32-NEXT: sltu t1, t0, t1 ; RV32-NEXT: sub a0, t0, a0 -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: add a0, a0, a5 -; RV32-NEXT: add a0, a4, a0 -; RV32-NEXT: add a0, a0, t5 -; RV32-NEXT: srai a3, a3, 31 -; RV32-NEXT: xor a0, a0, a3 -; RV32-NEXT: xor a1, t4, a3 -; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sltu t0, a5, a3 +; RV32-NEXT: add a2, a7, a2 +; RV32-NEXT: add t1, t2, t1 +; RV32-NEXT: sltu a3, a0, a3 +; RV32-NEXT: add a2, a2, t0 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: add a3, t3, a3 +; RV32-NEXT: add a3, t1, a3 +; RV32-NEXT: sub a1, a3, a1 +; RV32-NEXT: sltu a3, a3, t1 +; RV32-NEXT: add a5, a1, a5 +; RV32-NEXT: sltu a6, a1, a6 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: sltu a1, a5, a1 +; RV32-NEXT: add a3, a3, a6 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: xor a1, a1, a0 +; RV32-NEXT: xor a0, a5, a0 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: beqz a0, .LBB62_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: li a0, 0 @@ -5217,43 +5151,43 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV32ZBA-LABEL: smulo2.br.i64: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: li a2, -13 -; RV32ZBA-NEXT: 
mulhu a3, a0, a2 -; RV32ZBA-NEXT: mul a4, a1, a2 -; RV32ZBA-NEXT: add a3, a4, a3 -; RV32ZBA-NEXT: sltu a4, a3, a4 -; RV32ZBA-NEXT: mulhu a5, a1, a2 -; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: sub a3, a3, a0 -; RV32ZBA-NEXT: neg a5, a0 -; RV32ZBA-NEXT: sltu a6, a3, a5 -; RV32ZBA-NEXT: li a7, -1 -; RV32ZBA-NEXT: mulhu t0, a0, a7 -; RV32ZBA-NEXT: add a6, t0, a6 -; RV32ZBA-NEXT: add a6, a4, a6 -; RV32ZBA-NEXT: sub t1, a6, a1 -; RV32ZBA-NEXT: srai t2, a1, 31 -; RV32ZBA-NEXT: mul t3, t2, a2 -; RV32ZBA-NEXT: sub t3, t3, a0 -; RV32ZBA-NEXT: add t4, t1, t3 -; RV32ZBA-NEXT: sltu t5, t4, t1 -; RV32ZBA-NEXT: neg t6, a1 -; RV32ZBA-NEXT: sltu t1, t1, t6 -; RV32ZBA-NEXT: sltu a4, a6, a4 -; RV32ZBA-NEXT: mulhu a6, a1, a7 -; RV32ZBA-NEXT: add a4, a6, a4 -; RV32ZBA-NEXT: add a4, a4, t1 -; RV32ZBA-NEXT: sltu a5, t3, a5 -; RV32ZBA-NEXT: mulh a2, t2, a2 -; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: neg a3, a0 +; RV32ZBA-NEXT: li a4, -1 +; RV32ZBA-NEXT: srai a5, a1, 31 +; RV32ZBA-NEXT: neg a6, a1 +; RV32ZBA-NEXT: add a7, a0, a1 +; RV32ZBA-NEXT: mulhu t0, a0, a2 +; RV32ZBA-NEXT: mul t1, a1, a2 +; RV32ZBA-NEXT: mulhu t2, a1, a2 +; RV32ZBA-NEXT: mulhu t3, a0, a4 +; RV32ZBA-NEXT: mul t4, a5, a2 +; RV32ZBA-NEXT: mulhu a4, a1, a4 +; RV32ZBA-NEXT: mulh a2, a5, a2 +; RV32ZBA-NEXT: add t0, t1, t0 +; RV32ZBA-NEXT: sub a5, t4, a0 +; RV32ZBA-NEXT: sub a7, t3, a7 +; RV32ZBA-NEXT: sltu t1, t0, t1 ; RV32ZBA-NEXT: sub a0, t0, a0 -; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: add a0, a0, a5 -; RV32ZBA-NEXT: add a0, a4, a0 -; RV32ZBA-NEXT: add a0, a0, t5 -; RV32ZBA-NEXT: srai a3, a3, 31 -; RV32ZBA-NEXT: xor a0, a0, a3 -; RV32ZBA-NEXT: xor a1, t4, a3 -; RV32ZBA-NEXT: or a0, a1, a0 +; RV32ZBA-NEXT: sltu t0, a5, a3 +; RV32ZBA-NEXT: add a2, a7, a2 +; RV32ZBA-NEXT: add t1, t2, t1 +; RV32ZBA-NEXT: sltu a3, a0, a3 +; RV32ZBA-NEXT: add a2, a2, t0 +; RV32ZBA-NEXT: srai a0, a0, 31 +; RV32ZBA-NEXT: add a3, t3, a3 +; RV32ZBA-NEXT: add a3, t1, a3 +; RV32ZBA-NEXT: sub a1, a3, a1 +; RV32ZBA-NEXT: sltu a3, a3, t1 +; RV32ZBA-NEXT: add a5, a1, a5 +; RV32ZBA-NEXT: sltu a6, a1, a6 +; RV32ZBA-NEXT: add a3, a4, a3 +; RV32ZBA-NEXT: sltu a1, a5, a1 +; RV32ZBA-NEXT: add a3, a3, a6 +; RV32ZBA-NEXT: add a2, a3, a2 +; RV32ZBA-NEXT: add a1, a2, a1 +; RV32ZBA-NEXT: xor a1, a1, a0 +; RV32ZBA-NEXT: xor a0, a5, a0 +; RV32ZBA-NEXT: or a0, a0, a1 ; RV32ZBA-NEXT: beqz a0, .LBB62_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: li a0, 0 @@ -5279,43 +5213,43 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV32ZICOND-LABEL: smulo2.br.i64: ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: li a2, -13 -; RV32ZICOND-NEXT: mulhu a3, a0, a2 -; RV32ZICOND-NEXT: mul a4, a1, a2 -; RV32ZICOND-NEXT: add a3, a4, a3 -; RV32ZICOND-NEXT: sltu a4, a3, a4 -; RV32ZICOND-NEXT: mulhu a5, a1, a2 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: sub a3, a3, a0 -; RV32ZICOND-NEXT: neg a5, a0 -; RV32ZICOND-NEXT: sltu a6, a3, a5 -; RV32ZICOND-NEXT: li a7, -1 -; RV32ZICOND-NEXT: mulhu t0, a0, a7 -; RV32ZICOND-NEXT: add a6, t0, a6 -; RV32ZICOND-NEXT: add a6, a4, a6 -; RV32ZICOND-NEXT: sub t1, a6, a1 -; RV32ZICOND-NEXT: srai t2, a1, 31 -; RV32ZICOND-NEXT: mul t3, t2, a2 -; RV32ZICOND-NEXT: sub t3, t3, a0 -; RV32ZICOND-NEXT: add t4, t1, t3 -; RV32ZICOND-NEXT: sltu t5, t4, t1 -; RV32ZICOND-NEXT: neg t6, a1 -; RV32ZICOND-NEXT: sltu t1, t1, t6 -; RV32ZICOND-NEXT: sltu a4, a6, a4 -; RV32ZICOND-NEXT: mulhu a6, a1, a7 -; RV32ZICOND-NEXT: add a4, a6, a4 -; RV32ZICOND-NEXT: add a4, a4, t1 -; RV32ZICOND-NEXT: sltu a5, t3, a5 -; RV32ZICOND-NEXT: mulh a2, t2, a2 -; RV32ZICOND-NEXT: 
add a0, a0, a1 +; RV32ZICOND-NEXT: neg a3, a0 +; RV32ZICOND-NEXT: li a4, -1 +; RV32ZICOND-NEXT: srai a5, a1, 31 +; RV32ZICOND-NEXT: neg a6, a1 +; RV32ZICOND-NEXT: add a7, a0, a1 +; RV32ZICOND-NEXT: mulhu t0, a0, a2 +; RV32ZICOND-NEXT: mul t1, a1, a2 +; RV32ZICOND-NEXT: mulhu t2, a1, a2 +; RV32ZICOND-NEXT: mulhu t3, a0, a4 +; RV32ZICOND-NEXT: mul t4, a5, a2 +; RV32ZICOND-NEXT: mulhu a4, a1, a4 +; RV32ZICOND-NEXT: mulh a2, a5, a2 +; RV32ZICOND-NEXT: add t0, t1, t0 +; RV32ZICOND-NEXT: sub a5, t4, a0 +; RV32ZICOND-NEXT: sub a7, t3, a7 +; RV32ZICOND-NEXT: sltu t1, t0, t1 ; RV32ZICOND-NEXT: sub a0, t0, a0 -; RV32ZICOND-NEXT: add a0, a0, a2 -; RV32ZICOND-NEXT: add a0, a0, a5 -; RV32ZICOND-NEXT: add a0, a4, a0 -; RV32ZICOND-NEXT: add a0, a0, t5 -; RV32ZICOND-NEXT: srai a3, a3, 31 -; RV32ZICOND-NEXT: xor a0, a0, a3 -; RV32ZICOND-NEXT: xor a1, t4, a3 -; RV32ZICOND-NEXT: or a0, a1, a0 +; RV32ZICOND-NEXT: sltu t0, a5, a3 +; RV32ZICOND-NEXT: add a2, a7, a2 +; RV32ZICOND-NEXT: add t1, t2, t1 +; RV32ZICOND-NEXT: sltu a3, a0, a3 +; RV32ZICOND-NEXT: add a2, a2, t0 +; RV32ZICOND-NEXT: srai a0, a0, 31 +; RV32ZICOND-NEXT: add a3, t3, a3 +; RV32ZICOND-NEXT: add a3, t1, a3 +; RV32ZICOND-NEXT: sub a1, a3, a1 +; RV32ZICOND-NEXT: sltu a3, a3, t1 +; RV32ZICOND-NEXT: add a5, a1, a5 +; RV32ZICOND-NEXT: sltu a6, a1, a6 +; RV32ZICOND-NEXT: add a3, a4, a3 +; RV32ZICOND-NEXT: sltu a1, a5, a1 +; RV32ZICOND-NEXT: add a3, a3, a6 +; RV32ZICOND-NEXT: add a2, a3, a2 +; RV32ZICOND-NEXT: add a1, a2, a1 +; RV32ZICOND-NEXT: xor a1, a1, a0 +; RV32ZICOND-NEXT: xor a0, a5, a0 +; RV32ZICOND-NEXT: or a0, a0, a1 ; RV32ZICOND-NEXT: beqz a0, .LBB62_2 ; RV32ZICOND-NEXT: # %bb.1: # %overflow ; RV32ZICOND-NEXT: li a0, 0 @@ -5443,20 +5377,20 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: mulhu a5, a0, a2 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: sltu a4, a4, a5 -; RV32-NEXT: snez a5, a3 -; RV32-NEXT: snez a6, a1 -; RV32-NEXT: and a5, a6, a5 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: snez a1, a1 -; RV32-NEXT: or a1, a5, a1 +; RV32-NEXT: mulhu a6, a0, a2 ; RV32-NEXT: mulhu a0, a3, a0 +; RV32-NEXT: snez a3, a3 +; RV32-NEXT: mulhu a2, a1, a2 +; RV32-NEXT: snez a1, a1 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: snez a2, a2 ; RV32-NEXT: snez a0, a0 +; RV32-NEXT: add a4, a6, a4 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: sltu a2, a4, a6 ; RV32-NEXT: or a0, a1, a0 -; RV32-NEXT: or a0, a0, a4 +; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: beqz a0, .LBB64_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: li a0, 0 @@ -5480,20 +5414,20 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 -; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: mulhu a5, a0, a2 -; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: sltu a4, a4, a5 -; RV32ZBA-NEXT: snez a5, a3 -; RV32ZBA-NEXT: snez a6, a1 -; RV32ZBA-NEXT: and a5, a6, a5 -; RV32ZBA-NEXT: mulhu a1, a1, a2 -; RV32ZBA-NEXT: snez a1, a1 -; RV32ZBA-NEXT: or a1, a5, a1 +; RV32ZBA-NEXT: mulhu a6, a0, a2 ; RV32ZBA-NEXT: mulhu a0, a3, a0 +; RV32ZBA-NEXT: snez a3, a3 +; RV32ZBA-NEXT: mulhu a2, a1, a2 +; RV32ZBA-NEXT: snez a1, a1 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: and a1, a1, a3 +; RV32ZBA-NEXT: snez a2, a2 ; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: add a4, a6, a4 +; RV32ZBA-NEXT: or a1, a1, a2 +; RV32ZBA-NEXT: sltu a2, a4, a6 ; RV32ZBA-NEXT: or a0, a1, a0 -; RV32ZBA-NEXT: 
or a0, a0, a4 +; RV32ZBA-NEXT: or a0, a0, a2 ; RV32ZBA-NEXT: beqz a0, .LBB64_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: li a0, 0 @@ -5517,20 +5451,20 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: mulhu a5, a0, a2 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: sltu a4, a4, a5 -; RV32ZICOND-NEXT: snez a5, a3 -; RV32ZICOND-NEXT: snez a6, a1 -; RV32ZICOND-NEXT: and a5, a6, a5 -; RV32ZICOND-NEXT: mulhu a1, a1, a2 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: or a1, a5, a1 +; RV32ZICOND-NEXT: mulhu a6, a0, a2 ; RV32ZICOND-NEXT: mulhu a0, a3, a0 +; RV32ZICOND-NEXT: snez a3, a3 +; RV32ZICOND-NEXT: mulhu a2, a1, a2 +; RV32ZICOND-NEXT: snez a1, a1 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: and a1, a1, a3 +; RV32ZICOND-NEXT: snez a2, a2 ; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: add a4, a6, a4 +; RV32ZICOND-NEXT: or a1, a1, a2 +; RV32ZICOND-NEXT: sltu a2, a4, a6 ; RV32ZICOND-NEXT: or a0, a1, a0 -; RV32ZICOND-NEXT: or a0, a0, a4 +; RV32ZICOND-NEXT: or a0, a0, a2 ; RV32ZICOND-NEXT: beqz a0, .LBB64_2 ; RV32ZICOND-NEXT: # %bb.1: # %overflow ; RV32ZICOND-NEXT: li a0, 0 @@ -5624,13 +5558,13 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) { ; RV32ZICOND-LABEL: umulo2.br.i64: ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: add a2, a0, a0 +; RV32ZICOND-NEXT: add a3, a1, a1 ; RV32ZICOND-NEXT: sltu a0, a2, a0 -; RV32ZICOND-NEXT: add a2, a1, a1 -; RV32ZICOND-NEXT: add a2, a2, a0 -; RV32ZICOND-NEXT: xor a3, a2, a1 -; RV32ZICOND-NEXT: sltu a1, a2, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a1, a3 -; RV32ZICOND-NEXT: czero.nez a0, a0, a3 +; RV32ZICOND-NEXT: add a3, a3, a0 +; RV32ZICOND-NEXT: xor a2, a3, a1 +; RV32ZICOND-NEXT: sltu a1, a3, a1 +; RV32ZICOND-NEXT: czero.eqz a1, a1, a2 +; RV32ZICOND-NEXT: czero.nez a0, a0, a2 ; RV32ZICOND-NEXT: or a0, a0, a1 ; RV32ZICOND-NEXT: beqz a0, .LBB65_2 ; RV32ZICOND-NEXT: # %bb.1: # %overflow diff --git a/llvm/test/CodeGen/RISCV/xtheadmac.ll b/llvm/test/CodeGen/RISCV/xtheadmac.ll index 992c88e3e6268..78d18101979b3 100644 --- a/llvm/test/CodeGen/RISCV/xtheadmac.ll +++ b/llvm/test/CodeGen/RISCV/xtheadmac.ll @@ -39,13 +39,13 @@ define i64 @mula_i64(i64 %a, i64 %b, i64 %c) { ; RV32XTHEADMAC: # %bb.0: ; RV32XTHEADMAC-NEXT: mulhu a6, a2, a4 ; RV32XTHEADMAC-NEXT: th.mula a6, a2, a5 +; RV32XTHEADMAC-NEXT: mv a5, a0 +; RV32XTHEADMAC-NEXT: th.mula a5, a2, a4 ; RV32XTHEADMAC-NEXT: th.mula a6, a3, a4 -; RV32XTHEADMAC-NEXT: mv a3, a0 -; RV32XTHEADMAC-NEXT: th.mula a3, a2, a4 -; RV32XTHEADMAC-NEXT: sltu a0, a3, a0 +; RV32XTHEADMAC-NEXT: sltu a0, a5, a0 ; RV32XTHEADMAC-NEXT: add a0, a1, a0 ; RV32XTHEADMAC-NEXT: add a1, a0, a6 -; RV32XTHEADMAC-NEXT: mv a0, a3 +; RV32XTHEADMAC-NEXT: mv a0, a5 ; RV32XTHEADMAC-NEXT: ret ; ; RV64XTHEADMAC-LABEL: mula_i64: @@ -98,11 +98,11 @@ define i64 @muls_i64(i64 %a, i64 %b, i64 %c) { ; RV32XTHEADMAC: # %bb.0: ; RV32XTHEADMAC-NEXT: mulhu a6, a2, a4 ; RV32XTHEADMAC-NEXT: th.mula a6, a2, a5 -; RV32XTHEADMAC-NEXT: th.mula a6, a3, a4 -; RV32XTHEADMAC-NEXT: mul a3, a2, a4 -; RV32XTHEADMAC-NEXT: sltu a3, a0, a3 +; RV32XTHEADMAC-NEXT: mul a5, a2, a4 +; RV32XTHEADMAC-NEXT: sltu a5, a0, a5 ; RV32XTHEADMAC-NEXT: th.muls a0, a2, a4 -; RV32XTHEADMAC-NEXT: sub a1, a1, a3 +; RV32XTHEADMAC-NEXT: th.mula a6, a3, a4 +; RV32XTHEADMAC-NEXT: sub a1, a1, a5 ; RV32XTHEADMAC-NEXT: sub a1, a1, a6 ; RV32XTHEADMAC-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll 
b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll index 46aa383866e93..e761fcb736a87 100644 --- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll +++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll @@ -1045,8 +1045,8 @@ define void @srd(ptr %a, i64 %b, i64 %c) { ; RV32XTHEADMEMIDX-LABEL: srd: ; RV32XTHEADMEMIDX: # %bb.0: ; RV32XTHEADMEMIDX-NEXT: add a2, a3, a3 -; RV32XTHEADMEMIDX-NEXT: sltu a3, a2, a3 ; RV32XTHEADMEMIDX-NEXT: add a4, a4, a4 +; RV32XTHEADMEMIDX-NEXT: sltu a3, a2, a3 ; RV32XTHEADMEMIDX-NEXT: add a3, a4, a3 ; RV32XTHEADMEMIDX-NEXT: slli a4, a1, 3 ; RV32XTHEADMEMIDX-NEXT: add a4, a0, a4 @@ -1069,8 +1069,8 @@ define void @surd(ptr %a, i32 %b, i64 %c) { ; RV32XTHEADMEMIDX-LABEL: surd: ; RV32XTHEADMEMIDX: # %bb.0: ; RV32XTHEADMEMIDX-NEXT: add a4, a2, a2 -; RV32XTHEADMEMIDX-NEXT: sltu a2, a4, a2 ; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3 +; RV32XTHEADMEMIDX-NEXT: sltu a2, a4, a2 ; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2 ; RV32XTHEADMEMIDX-NEXT: slli a3, a1, 3 ; RV32XTHEADMEMIDX-NEXT: add a3, a0, a3 diff --git a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll index a4f56b6d28409..9a312d9daca8d 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll @@ -39,9 +39,9 @@ define void @foo2(ptr nocapture %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo2: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: mv a3, a2 +; RV32ZDINX-NEXT: addi a0, a0, 2047 ; RV32ZDINX-NEXT: mv a2, a1 ; RV32ZDINX-NEXT: fadd.d a2, a2, a2 -; RV32ZDINX-NEXT: addi a0, a0, 2047 ; RV32ZDINX-NEXT: sw a2, -3(a0) ; RV32ZDINX-NEXT: sw a3, 1(a0) ; RV32ZDINX-NEXT: ret @@ -49,9 +49,9 @@ define void @foo2(ptr nocapture %p, double %d) nounwind { ; RV32ZDINXUALIGNED-LABEL: foo2: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: mv a3, a2 +; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 ; RV32ZDINXUALIGNED-NEXT: mv a2, a1 ; RV32ZDINXUALIGNED-NEXT: fadd.d a2, a2, a2 -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 ; RV32ZDINXUALIGNED-NEXT: sw a2, -3(a0) ; RV32ZDINXUALIGNED-NEXT: sw a3, 1(a0) ; RV32ZDINXUALIGNED-NEXT: ret @@ -183,10 +183,10 @@ entry: define void @foo6(ptr %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo6: ; RV32ZDINX: # %bb.0: # %entry -; RV32ZDINX-NEXT: lui a3, %hi(.LCPI5_0) -; RV32ZDINX-NEXT: lw a4, %lo(.LCPI5_0)(a3) -; RV32ZDINX-NEXT: lw a5, %lo(.LCPI5_0+4)(a3) ; RV32ZDINX-NEXT: mv a3, a2 +; RV32ZDINX-NEXT: lui a2, %hi(.LCPI5_0) +; RV32ZDINX-NEXT: lw a4, %lo(.LCPI5_0)(a2) +; RV32ZDINX-NEXT: lw a5, %lo(.LCPI5_0+4)(a2) ; RV32ZDINX-NEXT: mv a2, a1 ; RV32ZDINX-NEXT: fadd.d a2, a2, a4 ; RV32ZDINX-NEXT: addi a0, a0, 2047 @@ -196,10 +196,10 @@ define void @foo6(ptr %p, double %d) nounwind { ; ; RV32ZDINXUALIGNED-LABEL: foo6: ; RV32ZDINXUALIGNED: # %bb.0: # %entry -; RV32ZDINXUALIGNED-NEXT: lui a3, %hi(.LCPI5_0) -; RV32ZDINXUALIGNED-NEXT: lw a4, %lo(.LCPI5_0)(a3) -; RV32ZDINXUALIGNED-NEXT: lw a5, %lo(.LCPI5_0+4)(a3) ; RV32ZDINXUALIGNED-NEXT: mv a3, a2 +; RV32ZDINXUALIGNED-NEXT: lui a2, %hi(.LCPI5_0) +; RV32ZDINXUALIGNED-NEXT: lw a4, %lo(.LCPI5_0)(a2) +; RV32ZDINXUALIGNED-NEXT: lw a5, %lo(.LCPI5_0+4)(a2) ; RV32ZDINXUALIGNED-NEXT: mv a2, a1 ; RV32ZDINXUALIGNED-NEXT: fadd.d a2, a2, a4 ; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047