From 99b2489b88e48d3f63015a29ba669641f8f06e64 Mon Sep 17 00:00:00 2001
From: Luke Quinn
Date: Mon, 3 Feb 2025 05:54:51 -0800
Subject: [PATCH 1/3] [RISCV] PreTest: RISCVOptWInst Pass for GIsel, Add 4
 tests required to show pre checks

Signed-off-by: Luke Quinn
---
 .../RISCV/GlobalISel/div-by-constant.ll | 811 ++
 .../CodeGen/RISCV/GlobalISel/rotl-rotr.ll | 3412 +++++
 llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll | 962 ++
 ...lar-shift-by-byte-multiple-legalization.ll | 10982 ++++++++++++++++
 4 files changed, 16167 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
 create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
 create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
 create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll

diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
new file mode 100644
index 0000000000000..e3616a79add9f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
@@ -0,0 +1,811 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV32,RV32IM %s
+; RUN: llc -mtriple=riscv32 -mattr=+m,+zba,+zbb \
+; RUN: -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV32,RV32IMZB %s
+; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV64,RV64IM %s
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zba,+zbb \
+; RUN: -verify-machineinstrs -global-isel < %s \
+; RUN: | FileCheck -check-prefixes=RV64,RV64IMZB %s
+
+; Test that there is a single shift after the mul and no addition.
+define i32 @udiv_constant_no_add(i32 %a) nounwind {
+; RV32-LABEL: udiv_constant_no_add:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a1, 838861
+; RV32-NEXT: addi a1, a1, -819
+; RV32-NEXT: mulhu a0, a0, a1
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: ret
+;
+; RV64IM-LABEL: udiv_constant_no_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a0, a0, 32
+; RV64IM-NEXT: lui a1, 205
+; RV64IM-NEXT: srli a0, a0, 32
+; RV64IM-NEXT: addiw a1, a1, -819
+; RV64IM-NEXT: slli a1, a1, 12
+; RV64IM-NEXT: addi a1, a1, -819
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 34
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv_constant_no_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: zext.w a0, a0
+; RV64IMZB-NEXT: lui a1, 838861
+; RV64IMZB-NEXT: addi a1, a1, -819
+; RV64IMZB-NEXT: zext.w a1, a1
+; RV64IMZB-NEXT: mul a0, a0, a1
+; RV64IMZB-NEXT: srli a0, a0, 34
+; RV64IMZB-NEXT: ret
+ %1 = udiv i32 %a, 5
+ ret i32 %1
+}
+
+; This constant requires a sub, srli, add sequence after the mul.
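+; As a sketch of why: the unsigned magic number for 7, ceil(2^35/7),
+; needs 33 bits, so only its low 32 bits 0x24924925 (lui 149797 +
+; addi -1755 below) go into the mulhu, and the result is fixed up as
+; t = mulhu(x, 0x24924925); q = (((x - t) >> 1) + t) >> 2, which is
+; the sub/srli/add/srli sequence checked here.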
+define i32 @udiv_constant_add(i32 %a) nounwind { +; RV32-LABEL: udiv_constant_add: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, 149797 +; RV32-NEXT: addi a1, a1, -1755 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: srli a0, a0, 1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: ret +; +; RV64IM-LABEL: udiv_constant_add: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lui a1, 149797 +; RV64IM-NEXT: slli a2, a0, 32 +; RV64IM-NEXT: addiw a1, a1, -1755 +; RV64IM-NEXT: srli a2, a2, 32 +; RV64IM-NEXT: mul a1, a2, a1 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: srliw a0, a0, 1 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: srliw a0, a0, 2 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: udiv_constant_add: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: lui a1, 149797 +; RV64IMZB-NEXT: addiw a1, a1, -1755 +; RV64IMZB-NEXT: zext.w a2, a0 +; RV64IMZB-NEXT: mul a1, a2, a1 +; RV64IMZB-NEXT: srli a1, a1, 32 +; RV64IMZB-NEXT: sub a0, a0, a1 +; RV64IMZB-NEXT: srliw a0, a0, 1 +; RV64IMZB-NEXT: add a0, a0, a1 +; RV64IMZB-NEXT: srliw a0, a0, 2 +; RV64IMZB-NEXT: ret + %1 = udiv i32 %a, 7 + ret i32 %1 +} + +define i64 @udiv64_constant_no_add(i64 %a) nounwind { +; RV32-LABEL: udiv64_constant_no_add: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 838861 +; RV32-NEXT: mulhu a3, a0, zero +; RV32-NEXT: addi a4, a2, -819 +; RV32-NEXT: addi a2, a2, -820 +; RV32-NEXT: mul a5, a1, a4 +; RV32-NEXT: mul a6, a0, a2 +; RV32-NEXT: mulhu a7, a0, a4 +; RV32-NEXT: mul t0, zero, a4 +; RV32-NEXT: mul t1, a1, a2 +; RV32-NEXT: mulhu t2, a1, a4 +; RV32-NEXT: mulhu a0, a0, a2 +; RV32-NEXT: mulhu a1, a1, a2 +; RV32-NEXT: mul a2, zero, a2 +; RV32-NEXT: mulhu a4, zero, a4 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: add a2, t0, a2 +; RV32-NEXT: add t0, t0, t1 +; RV32-NEXT: add a1, a4, a1 +; RV32-NEXT: sltu a4, a5, a6 +; RV32-NEXT: add a5, a5, a7 +; RV32-NEXT: sltu a6, t0, t1 +; RV32-NEXT: sltiu t1, t0, 0 +; RV32-NEXT: add t0, t0, t2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: sltu a2, a5, a7 +; RV32-NEXT: add a6, a6, t1 +; RV32-NEXT: sltu a5, t0, t2 +; RV32-NEXT: add t0, t0, a0 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: add a2, a4, a2 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: sltu a0, t0, a0 +; RV32-NEXT: add a0, a5, a0 +; RV32-NEXT: add t0, t0, a2 +; RV32-NEXT: sltu a2, t0, a2 +; RV32-NEXT: srli a3, t0, 2 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: slli a0, a1, 30 +; RV32-NEXT: or a0, a3, a0 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv64_constant_no_add: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 1035469 +; RV64-NEXT: addi a1, a1, -819 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -819 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -819 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -819 +; RV64-NEXT: mulhu a0, a0, a1 +; RV64-NEXT: srli a0, a0, 2 +; RV64-NEXT: ret + %1 = udiv i64 %a, 5 + ret i64 %1 +} + +define i64 @udiv64_constant_add(i64 %a) nounwind { +; RV32-LABEL: udiv64_constant_add: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 599186 +; RV32-NEXT: lui a3, 149797 +; RV32-NEXT: mulhu a4, a0, zero +; RV32-NEXT: addi a2, a2, 1171 +; RV32-NEXT: addi a3, a3, -1756 +; RV32-NEXT: mul a5, a1, a2 +; RV32-NEXT: mul a6, a0, a3 +; RV32-NEXT: mulhu a7, a0, a2 +; RV32-NEXT: mul t0, zero, a2 +; RV32-NEXT: mulhu t1, zero, a2 +; RV32-NEXT: mulhu t2, a1, a3 +; RV32-NEXT: add t1, t1, t2 +; RV32-NEXT: mul t2, zero, a3 +; RV32-NEXT: add t2, t0, t2 +; RV32-NEXT: add t1, t2, t1 +; RV32-NEXT: mul t2, a1, a3 +; RV32-NEXT: mulhu a2, 
a1, a2 +; RV32-NEXT: mulhu a3, a0, a3 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: add t0, t0, t2 +; RV32-NEXT: sltu a6, a5, a6 +; RV32-NEXT: add a5, a5, a7 +; RV32-NEXT: sltu t2, t0, t2 +; RV32-NEXT: sltu a5, a5, a7 +; RV32-NEXT: sltiu a7, t0, 0 +; RV32-NEXT: add t0, t0, a2 +; RV32-NEXT: add a7, t2, a7 +; RV32-NEXT: sltu a2, t0, a2 +; RV32-NEXT: add t0, t0, a3 +; RV32-NEXT: add a4, t1, a4 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: add a2, a7, a2 +; RV32-NEXT: sltu a3, t0, a3 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add t0, t0, a5 +; RV32-NEXT: sltu a3, t0, a5 +; RV32-NEXT: sub a5, a0, t0 +; RV32-NEXT: sltu a0, a0, t0 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: srli a5, a5, 1 +; RV32-NEXT: add a2, a4, a2 +; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: slli a0, a1, 31 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: or a0, a5, a0 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a0, a0, t0 +; RV32-NEXT: sltu a2, a0, t0 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: slli a2, a1, 30 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv64_constant_add: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, %hi(.LCPI3_0) +; RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1) +; RV64-NEXT: mulhu a1, a0, a1 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: srli a0, a0, 2 +; RV64-NEXT: ret + %1 = udiv i64 %a, 7 + ret i64 %1 +} + +define i8 @udiv8_constant_no_add(i8 %a) nounwind { +; RV32-LABEL: udiv8_constant_no_add: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 255 +; RV32-NEXT: li a1, 205 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: srli a0, a0, 10 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv8_constant_no_add: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 255 +; RV64-NEXT: li a1, 205 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: srli a0, a0, 10 +; RV64-NEXT: ret + %1 = udiv i8 %a, 5 + ret i8 %1 +} + +define i8 @udiv8_constant_add(i8 %a) nounwind { +; RV32-LABEL: udiv8_constant_add: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 37 +; RV32-NEXT: andi a2, a0, 255 +; RV32-NEXT: mul a1, a2, a1 +; RV32-NEXT: srli a1, a1, 8 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: andi a0, a0, 255 +; RV32-NEXT: srli a0, a0, 1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: andi a0, a0, 255 +; RV32-NEXT: srli a0, a0, 2 +; RV32-NEXT: ret +; +; RV64-LABEL: udiv8_constant_add: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 37 +; RV64-NEXT: andi a2, a0, 255 +; RV64-NEXT: mul a1, a2, a1 +; RV64-NEXT: srli a1, a1, 8 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: andi a0, a0, 255 +; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: andi a0, a0, 255 +; RV64-NEXT: srli a0, a0, 2 +; RV64-NEXT: ret + %1 = udiv i8 %a, 7 + ret i8 %1 +} + +define i16 @udiv16_constant_no_add(i16 %a) nounwind { +; RV32IM-LABEL: udiv16_constant_no_add: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: lui a1, 13 +; RV32IM-NEXT: srli a0, a0, 16 +; RV32IM-NEXT: addi a1, a1, -819 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: srli a0, a0, 18 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: udiv16_constant_no_add: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: zext.h a0, a0 +; RV32IMZB-NEXT: lui a1, 13 +; RV32IMZB-NEXT: addi a1, a1, -819 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: srli a0, a0, 18 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: udiv16_constant_no_add: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: lui a1, 13 +; RV64IM-NEXT: srli a0, a0, 48 +; RV64IM-NEXT: addiw a1, a1, -819 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: srli a0, 
a0, 18
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv16_constant_no_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: zext.h a0, a0
+; RV64IMZB-NEXT: lui a1, 13
+; RV64IMZB-NEXT: addiw a1, a1, -819
+; RV64IMZB-NEXT: mul a0, a0, a1
+; RV64IMZB-NEXT: srli a0, a0, 18
+; RV64IMZB-NEXT: ret
+ %1 = udiv i16 %a, 5
+ ret i16 %1
+}
+
+define i16 @udiv16_constant_add(i16 %a) nounwind {
+; RV32IM-LABEL: udiv16_constant_add:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: lui a1, 2
+; RV32IM-NEXT: lui a2, 16
+; RV32IM-NEXT: addi a1, a1, 1171
+; RV32IM-NEXT: addi a2, a2, -1
+; RV32IM-NEXT: and a3, a0, a2
+; RV32IM-NEXT: mul a1, a3, a1
+; RV32IM-NEXT: srli a1, a1, 16
+; RV32IM-NEXT: sub a0, a0, a1
+; RV32IM-NEXT: and a0, a0, a2
+; RV32IM-NEXT: srli a0, a0, 1
+; RV32IM-NEXT: add a0, a0, a1
+; RV32IM-NEXT: and a0, a0, a2
+; RV32IM-NEXT: srli a0, a0, 2
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: udiv16_constant_add:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: lui a1, 2
+; RV32IMZB-NEXT: addi a1, a1, 1171
+; RV32IMZB-NEXT: zext.h a2, a0
+; RV32IMZB-NEXT: mul a1, a2, a1
+; RV32IMZB-NEXT: srli a1, a1, 16
+; RV32IMZB-NEXT: sub a0, a0, a1
+; RV32IMZB-NEXT: zext.h a0, a0
+; RV32IMZB-NEXT: srli a0, a0, 1
+; RV32IMZB-NEXT: add a0, a0, a1
+; RV32IMZB-NEXT: zext.h a0, a0
+; RV32IMZB-NEXT: srli a0, a0, 2
+; RV32IMZB-NEXT: ret
+;
+; RV64IM-LABEL: udiv16_constant_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: lui a1, 2
+; RV64IM-NEXT: lui a2, 16
+; RV64IM-NEXT: addiw a1, a1, 1171
+; RV64IM-NEXT: addiw a2, a2, -1
+; RV64IM-NEXT: and a3, a0, a2
+; RV64IM-NEXT: mul a1, a3, a1
+; RV64IM-NEXT: srli a1, a1, 16
+; RV64IM-NEXT: sub a0, a0, a1
+; RV64IM-NEXT: and a0, a0, a2
+; RV64IM-NEXT: srli a0, a0, 1
+; RV64IM-NEXT: add a0, a0, a1
+; RV64IM-NEXT: and a0, a0, a2
+; RV64IM-NEXT: srli a0, a0, 2
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv16_constant_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: lui a1, 2
+; RV64IMZB-NEXT: addi a1, a1, 1171
+; RV64IMZB-NEXT: zext.h a2, a0
+; RV64IMZB-NEXT: mul a1, a2, a1
+; RV64IMZB-NEXT: srli a1, a1, 16
+; RV64IMZB-NEXT: sub a0, a0, a1
+; RV64IMZB-NEXT: zext.h a0, a0
+; RV64IMZB-NEXT: srli a0, a0, 1
+; RV64IMZB-NEXT: add a0, a0, a1
+; RV64IMZB-NEXT: zext.h a0, a0
+; RV64IMZB-NEXT: srli a0, a0, 2
+; RV64IMZB-NEXT: ret
+ %1 = udiv i16 %a, 7
+ ret i16 %1
+}
+
+; Test the simplest case: a srli and an add after the mul. No srai.
+define i32 @sdiv_constant_no_srai(i32 %a) nounwind {
+; RV32-LABEL: sdiv_constant_no_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 3
+; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv_constant_no_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 3
+; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i32 %a, 3
+ ret i32 %1
+}
+
+; This constant requires an srai between the mul and the add.
+define i32 @sdiv_constant_srai(i32 %a) nounwind {
+; RV32-LABEL: sdiv_constant_srai:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 5
+; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sdiv_constant_srai:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 5
+; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: ret
+ %1 = sdiv i32 %a, 5
+ ret i32 %1
+}
+
+; This constant requires an add and an srai after the mul.
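+; As a sketch: the signed magic number for 7, 0x92492493 = ceil(2^34/7),
+; is negative as an i32, so the dividend is added back after the mulhs:
+; t = mulhs(x, 0x92492493) + x; q = (t >> 2) + ((unsigned)x >> 31).
+; (GlobalISel does not apply this transform yet, so the checks for these
+; sdiv tests still show a plain div/divw.)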
+define i32 @sdiv_constant_add_srai(i32 %a) nounwind { +; RV32-LABEL: sdiv_constant_add_srai: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 7 +; RV32-NEXT: div a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv_constant_add_srai: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 7 +; RV64-NEXT: divw a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i32 %a, 7 + ret i32 %1 +} + +; This constant requires a sub and an srai after the mul. +define i32 @sdiv_constant_sub_srai(i32 %a) nounwind { +; RV32-LABEL: sdiv_constant_sub_srai: +; RV32: # %bb.0: +; RV32-NEXT: li a1, -7 +; RV32-NEXT: div a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv_constant_sub_srai: +; RV64: # %bb.0: +; RV64-NEXT: li a1, -7 +; RV64-NEXT: divw a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i32 %a, -7 + ret i32 %1 +} + +define i64 @sdiv64_constant_no_srai(i64 %a) nounwind { +; RV32-LABEL: sdiv64_constant_no_srai: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 3 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __divdi3 +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv64_constant_no_srai: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 3 +; RV64-NEXT: div a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i64 %a, 3 + ret i64 %1 +} + +define i64 @sdiv64_constant_srai(i64 %a) nounwind { +; RV32-LABEL: sdiv64_constant_srai: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 5 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __divdi3 +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv64_constant_srai: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 5 +; RV64-NEXT: div a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i64 %a, 5 + ret i64 %1 +} + +define i64 @sdiv64_constant_add_srai(i64 %a) nounwind { +; RV32-LABEL: sdiv64_constant_add_srai: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 15 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __divdi3 +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv64_constant_add_srai: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 15 +; RV64-NEXT: div a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i64 %a, 15 + ret i64 %1 +} + +define i64 @sdiv64_constant_sub_srai(i64 %a) nounwind { +; RV32-LABEL: sdiv64_constant_sub_srai: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, -3 +; RV32-NEXT: li a3, -1 +; RV32-NEXT: call __divdi3 +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sdiv64_constant_sub_srai: +; RV64: # %bb.0: +; RV64-NEXT: li a1, -3 +; RV64-NEXT: div a0, a0, a1 +; RV64-NEXT: ret + %1 = sdiv i64 %a, -3 + ret i64 %1 +} + +define i8 @sdiv8_constant_no_srai(i8 %a) nounwind { +; RV32IM-LABEL: sdiv8_constant_no_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 3 +; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a0, a0, 24 +; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv8_constant_no_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: li a1, 3 +; RV32IMZB-NEXT: sext.b a0, a0 +; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv8_constant_no_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 3 +; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a0, a0, 56 +; RV64IM-NEXT: divw a0, a0, a1 +; 
RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv8_constant_no_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: li a1, 3 +; RV64IMZB-NEXT: sext.b a0, a0 +; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i8 %a, 3 + ret i8 %1 +} + +define i8 @sdiv8_constant_srai(i8 %a) nounwind { +; RV32IM-LABEL: sdiv8_constant_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 5 +; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a0, a0, 24 +; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv8_constant_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: li a1, 5 +; RV32IMZB-NEXT: sext.b a0, a0 +; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv8_constant_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 5 +; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a0, a0, 56 +; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv8_constant_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: li a1, 5 +; RV64IMZB-NEXT: sext.b a0, a0 +; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i8 %a, 5 + ret i8 %1 +} + +define i8 @sdiv8_constant_add_srai(i8 %a) nounwind { +; RV32IM-LABEL: sdiv8_constant_add_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 7 +; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a0, a0, 24 +; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv8_constant_add_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: li a1, 7 +; RV32IMZB-NEXT: sext.b a0, a0 +; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv8_constant_add_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 7 +; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a0, a0, 56 +; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv8_constant_add_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: li a1, 7 +; RV64IMZB-NEXT: sext.b a0, a0 +; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i8 %a, 7 + ret i8 %1 +} + +define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind { +; RV32IM-LABEL: sdiv8_constant_sub_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, -7 +; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a0, a0, 24 +; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv8_constant_sub_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: li a1, -7 +; RV32IMZB-NEXT: sext.b a0, a0 +; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv8_constant_sub_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, -7 +; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a0, a0, 56 +; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv8_constant_sub_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: li a1, -7 +; RV64IMZB-NEXT: sext.b a0, a0 +; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i8 %a, -7 + ret i8 %1 +} + +define i16 @sdiv16_constant_no_srai(i16 %a) nounwind { +; RV32IM-LABEL: sdiv16_constant_no_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 3 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 16 +; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv16_constant_no_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: li a1, 3 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv16_constant_no_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 3 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 48 +; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv16_constant_no_srai: +; RV64IMZB: # %bb.0: +; 
RV64IMZB-NEXT: li a1, 3 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i16 %a, 3 + ret i16 %1 +} + +define i16 @sdiv16_constant_srai(i16 %a) nounwind { +; RV32IM-LABEL: sdiv16_constant_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 5 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 16 +; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv16_constant_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: li a1, 5 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv16_constant_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 5 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 48 +; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv16_constant_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: li a1, 5 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i16 %a, 5 + ret i16 %1 +} + +define i16 @sdiv16_constant_add_srai(i16 %a) nounwind { +; RV32IM-LABEL: sdiv16_constant_add_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 15 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 16 +; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv16_constant_add_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: li a1, 15 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv16_constant_add_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 15 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 48 +; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv16_constant_add_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: li a1, 15 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i16 %a, 15 + ret i16 %1 +} + +define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind { +; RV32IM-LABEL: sdiv16_constant_sub_srai: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, -15 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 16 +; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: sdiv16_constant_sub_srai: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: li a1, -15 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: ret +; +; RV64IM-LABEL: sdiv16_constant_sub_srai: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, -15 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 48 +; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMZB-LABEL: sdiv16_constant_sub_srai: +; RV64IMZB: # %bb.0: +; RV64IMZB-NEXT: li a1, -15 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: ret + %1 = sdiv i16 %a, -15 + ret i16 %1 +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll new file mode 100644 index 0000000000000..46d1661983c6a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll @@ -0,0 +1,3412 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs -global-isel < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: llc -mtriple=riscv64 -verify-machineinstrs -enable-legalize-types-checking -global-isel < %s \ +; RUN: | FileCheck %s -check-prefix=RV64I +; RUN: llc -mtriple=riscv32 -mattr=+zbb -verify-machineinstrs -global-isel < %s \ +; RUN: | FileCheck %s -check-prefix=RV32ZBB +; RUN: llc -mtriple=riscv64 -mattr=+zbb 
-verify-machineinstrs -global-isel < %s \ +; RUN: | FileCheck %s -check-prefix=RV64ZBB +; RUN: llc -mtriple=riscv32 -mattr=+xtheadbb -verify-machineinstrs -global-isel < %s \ +; RUN: | FileCheck %s -check-prefix=RV32XTHEADBB +; RUN: llc -mtriple=riscv64 -mattr=+xtheadbb -verify-machineinstrs -global-isel < %s \ +; RUN: | FileCheck %s -check-prefix=RV64XTHEADBB + +; NOTE: -enable-legalize-types-checking is on one command line due to a previous +; assertion failure on an expensive checks build for @rotr_32_mask_multiple. + +; These IR sequences are idioms for rotates. If rotate instructions are +; supported, they will be turned into ISD::ROTL or ISD::ROTR. + +define i32 @rotl_32(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: rotl_32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a2, a1 +; RV32I-NEXT: sll a1, a0, a1 +; RV32I-NEXT: srl a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_32: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: sllw a1, a0, a1 +; RV64I-NEXT: srlw a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: rol a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: rolw a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_32: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: neg a2, a1 +; RV32XTHEADBB-NEXT: sll a1, a0, a1 +; RV32XTHEADBB-NEXT: srl a0, a0, a2 +; RV32XTHEADBB-NEXT: or a0, a1, a0 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_32: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: sllw a1, a0, a1 +; RV64XTHEADBB-NEXT: srlw a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 +; RV64XTHEADBB-NEXT: ret + %z = sub i32 32, %y + %b = shl i32 %x, %y + %c = lshr i32 %x, %z + %d = or i32 %b, %c + ret i32 %d +} + +define i32 @rotr_32(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: rotr_32: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a2, a1 +; RV32I-NEXT: srl a1, a0, a1 +; RV32I-NEXT: sll a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_32: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: srlw a1, a0, a1 +; RV64I-NEXT: sllw a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: ror a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: rorw a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_32: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: neg a2, a1 +; RV32XTHEADBB-NEXT: srl a1, a0, a1 +; RV32XTHEADBB-NEXT: sll a0, a0, a2 +; RV32XTHEADBB-NEXT: or a0, a1, a0 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_32: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: srlw a1, a0, a1 +; RV64XTHEADBB-NEXT: sllw a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 +; RV64XTHEADBB-NEXT: ret + %z = sub i32 32, %y + %b = lshr i32 %x, %y + %c = shl i32 %x, %z + %d = or i32 %b, %c + ret i32 %d +} + +define i64 @rotl_64(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: rotl_64: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a6, a2, 63 +; RV32I-NEXT: li a4, 32 +; RV32I-NEXT: bltu a6, a4, .LBB2_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: sll a7, a0, a6 +; RV32I-NEXT: j .LBB2_3 +; RV32I-NEXT: .LBB2_2: +; RV32I-NEXT: sll a3, a0, a2 +; RV32I-NEXT: neg a5, a6 +; RV32I-NEXT: srl a5, a0, a5 +; RV32I-NEXT: sll a7, a1, a2 +; RV32I-NEXT: or a7, a5, a7 +; RV32I-NEXT: .LBB2_3: +; RV32I-NEXT: neg a5, a2 +; 
RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: beqz a6, .LBB2_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a2, a7 +; RV32I-NEXT: .LBB2_5: +; RV32I-NEXT: andi a6, a5, 63 +; RV32I-NEXT: bltu a6, a4, .LBB2_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srl a7, a1, a6 +; RV32I-NEXT: bnez a6, .LBB2_8 +; RV32I-NEXT: j .LBB2_9 +; RV32I-NEXT: .LBB2_7: +; RV32I-NEXT: srl a7, a0, a5 +; RV32I-NEXT: neg t0, a6 +; RV32I-NEXT: sll t0, a1, t0 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: beqz a6, .LBB2_9 +; RV32I-NEXT: .LBB2_8: +; RV32I-NEXT: mv a0, a7 +; RV32I-NEXT: .LBB2_9: +; RV32I-NEXT: bltu a6, a4, .LBB2_11 +; RV32I-NEXT: # %bb.10: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: j .LBB2_12 +; RV32I-NEXT: .LBB2_11: +; RV32I-NEXT: srl a1, a1, a5 +; RV32I-NEXT: .LBB2_12: +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_64: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: sll a1, a0, a1 +; RV64I-NEXT: srl a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: andi a6, a2, 63 +; RV32ZBB-NEXT: li a4, 32 +; RV32ZBB-NEXT: bltu a6, a4, .LBB2_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: li a3, 0 +; RV32ZBB-NEXT: sll a7, a0, a6 +; RV32ZBB-NEXT: j .LBB2_3 +; RV32ZBB-NEXT: .LBB2_2: +; RV32ZBB-NEXT: sll a3, a0, a2 +; RV32ZBB-NEXT: neg a5, a6 +; RV32ZBB-NEXT: srl a5, a0, a5 +; RV32ZBB-NEXT: sll a7, a1, a2 +; RV32ZBB-NEXT: or a7, a5, a7 +; RV32ZBB-NEXT: .LBB2_3: +; RV32ZBB-NEXT: neg a5, a2 +; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: beqz a6, .LBB2_5 +; RV32ZBB-NEXT: # %bb.4: +; RV32ZBB-NEXT: mv a2, a7 +; RV32ZBB-NEXT: .LBB2_5: +; RV32ZBB-NEXT: andi a6, a5, 63 +; RV32ZBB-NEXT: bltu a6, a4, .LBB2_7 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: srl a7, a1, a6 +; RV32ZBB-NEXT: bnez a6, .LBB2_8 +; RV32ZBB-NEXT: j .LBB2_9 +; RV32ZBB-NEXT: .LBB2_7: +; RV32ZBB-NEXT: srl a7, a0, a5 +; RV32ZBB-NEXT: neg t0, a6 +; RV32ZBB-NEXT: sll t0, a1, t0 +; RV32ZBB-NEXT: or a7, a7, t0 +; RV32ZBB-NEXT: beqz a6, .LBB2_9 +; RV32ZBB-NEXT: .LBB2_8: +; RV32ZBB-NEXT: mv a0, a7 +; RV32ZBB-NEXT: .LBB2_9: +; RV32ZBB-NEXT: bltu a6, a4, .LBB2_11 +; RV32ZBB-NEXT: # %bb.10: +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: j .LBB2_12 +; RV32ZBB-NEXT: .LBB2_11: +; RV32ZBB-NEXT: srl a1, a1, a5 +; RV32ZBB-NEXT: .LBB2_12: +; RV32ZBB-NEXT: or a0, a3, a0 +; RV32ZBB-NEXT: or a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: rol a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_64: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi a6, a2, 63 +; RV32XTHEADBB-NEXT: li a4, 32 +; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB2_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: li a3, 0 +; RV32XTHEADBB-NEXT: sll a7, a0, a6 +; RV32XTHEADBB-NEXT: j .LBB2_3 +; RV32XTHEADBB-NEXT: .LBB2_2: +; RV32XTHEADBB-NEXT: sll a3, a0, a2 +; RV32XTHEADBB-NEXT: neg a5, a6 +; RV32XTHEADBB-NEXT: srl a5, a0, a5 +; RV32XTHEADBB-NEXT: sll a7, a1, a2 +; RV32XTHEADBB-NEXT: or a7, a5, a7 +; RV32XTHEADBB-NEXT: .LBB2_3: +; RV32XTHEADBB-NEXT: neg a5, a2 +; RV32XTHEADBB-NEXT: mv a2, a1 +; RV32XTHEADBB-NEXT: beqz a6, .LBB2_5 +; RV32XTHEADBB-NEXT: # %bb.4: +; RV32XTHEADBB-NEXT: mv a2, a7 +; RV32XTHEADBB-NEXT: .LBB2_5: +; RV32XTHEADBB-NEXT: andi a6, a5, 63 +; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB2_7 +; RV32XTHEADBB-NEXT: # %bb.6: +; RV32XTHEADBB-NEXT: srl a7, a1, a6 +; RV32XTHEADBB-NEXT: bnez a6, .LBB2_8 +; RV32XTHEADBB-NEXT: j .LBB2_9 +; RV32XTHEADBB-NEXT: .LBB2_7: +; RV32XTHEADBB-NEXT: srl a7, a0, a5 +; RV32XTHEADBB-NEXT: neg t0, a6 +; 
RV32XTHEADBB-NEXT: sll t0, a1, t0 +; RV32XTHEADBB-NEXT: or a7, a7, t0 +; RV32XTHEADBB-NEXT: beqz a6, .LBB2_9 +; RV32XTHEADBB-NEXT: .LBB2_8: +; RV32XTHEADBB-NEXT: mv a0, a7 +; RV32XTHEADBB-NEXT: .LBB2_9: +; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB2_11 +; RV32XTHEADBB-NEXT: # %bb.10: +; RV32XTHEADBB-NEXT: li a1, 0 +; RV32XTHEADBB-NEXT: j .LBB2_12 +; RV32XTHEADBB-NEXT: .LBB2_11: +; RV32XTHEADBB-NEXT: srl a1, a1, a5 +; RV32XTHEADBB-NEXT: .LBB2_12: +; RV32XTHEADBB-NEXT: or a0, a3, a0 +; RV32XTHEADBB-NEXT: or a1, a2, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_64: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: sll a1, a0, a1 +; RV64XTHEADBB-NEXT: srl a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 +; RV64XTHEADBB-NEXT: ret + %z = sub i64 64, %y + %b = shl i64 %x, %y + %c = lshr i64 %x, %z + %d = or i64 %b, %c + ret i64 %d +} + +define i64 @rotr_64(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: rotr_64: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a5, a2, 63 +; RV32I-NEXT: li a4, 32 +; RV32I-NEXT: bltu a5, a4, .LBB3_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a6, a1, a5 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: bnez a5, .LBB3_3 +; RV32I-NEXT: j .LBB3_4 +; RV32I-NEXT: .LBB3_2: +; RV32I-NEXT: srl a3, a0, a2 +; RV32I-NEXT: neg a6, a5 +; RV32I-NEXT: sll a6, a1, a6 +; RV32I-NEXT: or a6, a3, a6 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: beqz a5, .LBB3_4 +; RV32I-NEXT: .LBB3_3: +; RV32I-NEXT: mv a3, a6 +; RV32I-NEXT: .LBB3_4: +; RV32I-NEXT: neg a6, a2 +; RV32I-NEXT: bltu a5, a4, .LBB3_7 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: li a2, 0 +; RV32I-NEXT: andi a5, a6, 63 +; RV32I-NEXT: bgeu a5, a4, .LBB3_8 +; RV32I-NEXT: .LBB3_6: +; RV32I-NEXT: sll a4, a0, a6 +; RV32I-NEXT: neg a7, a5 +; RV32I-NEXT: srl a0, a0, a7 +; RV32I-NEXT: sll a6, a1, a6 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: bnez a5, .LBB3_9 +; RV32I-NEXT: j .LBB3_10 +; RV32I-NEXT: .LBB3_7: +; RV32I-NEXT: srl a2, a1, a2 +; RV32I-NEXT: andi a5, a6, 63 +; RV32I-NEXT: bltu a5, a4, .LBB3_6 +; RV32I-NEXT: .LBB3_8: +; RV32I-NEXT: li a4, 0 +; RV32I-NEXT: sll a0, a0, a5 +; RV32I-NEXT: beqz a5, .LBB3_10 +; RV32I-NEXT: .LBB3_9: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB3_10: +; RV32I-NEXT: or a0, a3, a4 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_64: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: srl a1, a0, a1 +; RV64I-NEXT: sll a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: andi a5, a2, 63 +; RV32ZBB-NEXT: li a4, 32 +; RV32ZBB-NEXT: bltu a5, a4, .LBB3_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: srl a6, a1, a5 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: bnez a5, .LBB3_3 +; RV32ZBB-NEXT: j .LBB3_4 +; RV32ZBB-NEXT: .LBB3_2: +; RV32ZBB-NEXT: srl a3, a0, a2 +; RV32ZBB-NEXT: neg a6, a5 +; RV32ZBB-NEXT: sll a6, a1, a6 +; RV32ZBB-NEXT: or a6, a3, a6 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: beqz a5, .LBB3_4 +; RV32ZBB-NEXT: .LBB3_3: +; RV32ZBB-NEXT: mv a3, a6 +; RV32ZBB-NEXT: .LBB3_4: +; RV32ZBB-NEXT: neg a6, a2 +; RV32ZBB-NEXT: bltu a5, a4, .LBB3_7 +; RV32ZBB-NEXT: # %bb.5: +; RV32ZBB-NEXT: li a2, 0 +; RV32ZBB-NEXT: andi a5, a6, 63 +; RV32ZBB-NEXT: bgeu a5, a4, .LBB3_8 +; RV32ZBB-NEXT: .LBB3_6: +; RV32ZBB-NEXT: sll a4, a0, a6 +; RV32ZBB-NEXT: neg a7, a5 +; RV32ZBB-NEXT: srl a0, a0, a7 +; RV32ZBB-NEXT: sll a6, a1, a6 +; RV32ZBB-NEXT: or a0, a0, a6 +; RV32ZBB-NEXT: bnez a5, .LBB3_9 +; RV32ZBB-NEXT: j .LBB3_10 +; RV32ZBB-NEXT: .LBB3_7: +; RV32ZBB-NEXT: srl a2, a1, a2 +; RV32ZBB-NEXT: andi a5, a6, 63 +; 
RV32ZBB-NEXT: bltu a5, a4, .LBB3_6 +; RV32ZBB-NEXT: .LBB3_8: +; RV32ZBB-NEXT: li a4, 0 +; RV32ZBB-NEXT: sll a0, a0, a5 +; RV32ZBB-NEXT: beqz a5, .LBB3_10 +; RV32ZBB-NEXT: .LBB3_9: +; RV32ZBB-NEXT: mv a1, a0 +; RV32ZBB-NEXT: .LBB3_10: +; RV32ZBB-NEXT: or a0, a3, a4 +; RV32ZBB-NEXT: or a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: ror a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_64: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi a5, a2, 63 +; RV32XTHEADBB-NEXT: li a4, 32 +; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB3_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: srl a6, a1, a5 +; RV32XTHEADBB-NEXT: mv a3, a0 +; RV32XTHEADBB-NEXT: bnez a5, .LBB3_3 +; RV32XTHEADBB-NEXT: j .LBB3_4 +; RV32XTHEADBB-NEXT: .LBB3_2: +; RV32XTHEADBB-NEXT: srl a3, a0, a2 +; RV32XTHEADBB-NEXT: neg a6, a5 +; RV32XTHEADBB-NEXT: sll a6, a1, a6 +; RV32XTHEADBB-NEXT: or a6, a3, a6 +; RV32XTHEADBB-NEXT: mv a3, a0 +; RV32XTHEADBB-NEXT: beqz a5, .LBB3_4 +; RV32XTHEADBB-NEXT: .LBB3_3: +; RV32XTHEADBB-NEXT: mv a3, a6 +; RV32XTHEADBB-NEXT: .LBB3_4: +; RV32XTHEADBB-NEXT: neg a6, a2 +; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB3_7 +; RV32XTHEADBB-NEXT: # %bb.5: +; RV32XTHEADBB-NEXT: li a2, 0 +; RV32XTHEADBB-NEXT: andi a5, a6, 63 +; RV32XTHEADBB-NEXT: bgeu a5, a4, .LBB3_8 +; RV32XTHEADBB-NEXT: .LBB3_6: +; RV32XTHEADBB-NEXT: sll a4, a0, a6 +; RV32XTHEADBB-NEXT: neg a7, a5 +; RV32XTHEADBB-NEXT: srl a0, a0, a7 +; RV32XTHEADBB-NEXT: sll a6, a1, a6 +; RV32XTHEADBB-NEXT: or a0, a0, a6 +; RV32XTHEADBB-NEXT: bnez a5, .LBB3_9 +; RV32XTHEADBB-NEXT: j .LBB3_10 +; RV32XTHEADBB-NEXT: .LBB3_7: +; RV32XTHEADBB-NEXT: srl a2, a1, a2 +; RV32XTHEADBB-NEXT: andi a5, a6, 63 +; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB3_6 +; RV32XTHEADBB-NEXT: .LBB3_8: +; RV32XTHEADBB-NEXT: li a4, 0 +; RV32XTHEADBB-NEXT: sll a0, a0, a5 +; RV32XTHEADBB-NEXT: beqz a5, .LBB3_10 +; RV32XTHEADBB-NEXT: .LBB3_9: +; RV32XTHEADBB-NEXT: mv a1, a0 +; RV32XTHEADBB-NEXT: .LBB3_10: +; RV32XTHEADBB-NEXT: or a0, a3, a4 +; RV32XTHEADBB-NEXT: or a1, a2, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_64: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: srl a1, a0, a1 +; RV64XTHEADBB-NEXT: sll a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 +; RV64XTHEADBB-NEXT: ret + %z = sub i64 64, %y + %b = lshr i64 %x, %y + %c = shl i64 %x, %z + %d = or i64 %b, %c + ret i64 %d +} + +define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: rotl_32_mask: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a2, a1 +; RV32I-NEXT: sll a1, a0, a1 +; RV32I-NEXT: srl a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_32_mask: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: sllw a1, a0, a1 +; RV64I-NEXT: srlw a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_32_mask: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a2, a1 +; RV32ZBB-NEXT: sll a1, a0, a1 +; RV32ZBB-NEXT: srl a0, a0, a2 +; RV32ZBB-NEXT: or a0, a1, a0 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_32_mask: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a2, a1 +; RV64ZBB-NEXT: sllw a1, a0, a1 +; RV64ZBB-NEXT: srlw a0, a0, a2 +; RV64ZBB-NEXT: or a0, a1, a0 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_32_mask: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: neg a2, a1 +; RV32XTHEADBB-NEXT: sll a1, a0, a1 +; RV32XTHEADBB-NEXT: srl a0, a0, a2 +; RV32XTHEADBB-NEXT: or a0, a1, a0 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_32_mask: +; RV64XTHEADBB: # %bb.0: +; 
RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: sllw a1, a0, a1 +; RV64XTHEADBB-NEXT: srlw a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 +; RV64XTHEADBB-NEXT: ret + %z = sub i32 0, %y + %and = and i32 %z, 31 + %b = shl i32 %x, %y + %c = lshr i32 %x, %and + %d = or i32 %b, %c + ret i32 %d +} + +define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: rotl_32_mask_and_63_and_31: +; RV32I: # %bb.0: +; RV32I-NEXT: sll a2, a0, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_32_mask_and_63_and_31: +; RV64I: # %bb.0: +; RV64I-NEXT: sllw a2, a0, a1 +; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: srlw a0, a0, a1 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_32_mask_and_63_and_31: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: sll a2, a0, a1 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: srl a0, a0, a1 +; RV32ZBB-NEXT: or a0, a2, a0 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_32_mask_and_63_and_31: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sllw a2, a0, a1 +; RV64ZBB-NEXT: neg a1, a1 +; RV64ZBB-NEXT: srlw a0, a0, a1 +; RV64ZBB-NEXT: or a0, a2, a0 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_32_mask_and_63_and_31: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: sll a2, a0, a1 +; RV32XTHEADBB-NEXT: neg a1, a1 +; RV32XTHEADBB-NEXT: srl a0, a0, a1 +; RV32XTHEADBB-NEXT: or a0, a2, a0 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: sllw a2, a0, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 +; RV64XTHEADBB-NEXT: srlw a0, a0, a1 +; RV64XTHEADBB-NEXT: or a0, a2, a0 +; RV64XTHEADBB-NEXT: ret + %a = and i32 %y, 63 + %b = shl i32 %x, %a + %c = sub i32 0, %y + %d = and i32 %c, 31 + %e = lshr i32 %x, %d + %f = or i32 %b, %e + ret i32 %f +} + +define i32 @rotl_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: rotl_32_mask_or_64_or_32: +; RV32I: # %bb.0: +; RV32I-NEXT: ori a1, a1, 64 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_32_mask_or_64_or_32: +; RV64I: # %bb.0: +; RV64I-NEXT: ori a1, a1, 64 +; RV64I-NEXT: sllw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_32_mask_or_64_or_32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: ori a1, a1, 64 +; RV32ZBB-NEXT: sll a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_32_mask_or_64_or_32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: ori a1, a1, 64 +; RV64ZBB-NEXT: sllw a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_32_mask_or_64_or_32: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: ori a1, a1, 64 +; RV32XTHEADBB-NEXT: sll a0, a0, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_32_mask_or_64_or_32: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: ori a1, a1, 64 +; RV64XTHEADBB-NEXT: sllw a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %a = or i32 %y, 64 + %b = shl i32 %x, %a + %c = sub i32 0, %y + %d = or i32 %c, 32 + %e = lshr i32 %x, %d + %f = or i32 %b, %e + ret i32 %f +} + +define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: rotr_32_mask: +; RV32I: # %bb.0: +; RV32I-NEXT: neg a2, a1 +; RV32I-NEXT: srl a1, a0, a1 +; RV32I-NEXT: sll a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_32_mask: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: srlw a1, a0, a1 +; RV64I-NEXT: sllw a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_32_mask: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: neg a2, a1 +; RV32ZBB-NEXT: srl a1, a0, a1 +; 
RV32ZBB-NEXT: sll a0, a0, a2 +; RV32ZBB-NEXT: or a0, a1, a0 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_32_mask: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a2, a1 +; RV64ZBB-NEXT: srlw a1, a0, a1 +; RV64ZBB-NEXT: sllw a0, a0, a2 +; RV64ZBB-NEXT: or a0, a1, a0 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_32_mask: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: neg a2, a1 +; RV32XTHEADBB-NEXT: srl a1, a0, a1 +; RV32XTHEADBB-NEXT: sll a0, a0, a2 +; RV32XTHEADBB-NEXT: or a0, a1, a0 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_32_mask: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: srlw a1, a0, a1 +; RV64XTHEADBB-NEXT: sllw a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 +; RV64XTHEADBB-NEXT: ret + %z = sub i32 0, %y + %and = and i32 %z, 31 + %b = lshr i32 %x, %y + %c = shl i32 %x, %and + %d = or i32 %b, %c + ret i32 %d +} + +define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: rotr_32_mask_and_63_and_31: +; RV32I: # %bb.0: +; RV32I-NEXT: srl a2, a0, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_32_mask_and_63_and_31: +; RV64I: # %bb.0: +; RV64I-NEXT: srlw a2, a0, a1 +; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: sllw a0, a0, a1 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_32_mask_and_63_and_31: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: srl a2, a0, a1 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sll a0, a0, a1 +; RV32ZBB-NEXT: or a0, a2, a0 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_32_mask_and_63_and_31: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: srlw a2, a0, a1 +; RV64ZBB-NEXT: neg a1, a1 +; RV64ZBB-NEXT: sllw a0, a0, a1 +; RV64ZBB-NEXT: or a0, a2, a0 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_32_mask_and_63_and_31: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: srl a2, a0, a1 +; RV32XTHEADBB-NEXT: neg a1, a1 +; RV32XTHEADBB-NEXT: sll a0, a0, a1 +; RV32XTHEADBB-NEXT: or a0, a2, a0 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: srlw a2, a0, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 +; RV64XTHEADBB-NEXT: sllw a0, a0, a1 +; RV64XTHEADBB-NEXT: or a0, a2, a0 +; RV64XTHEADBB-NEXT: ret + %a = and i32 %y, 63 + %b = lshr i32 %x, %a + %c = sub i32 0, %y + %d = and i32 %c, 31 + %e = shl i32 %x, %d + %f = or i32 %b, %e + ret i32 %f +} + +define i32 @rotr_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: rotr_32_mask_or_64_or_32: +; RV32I: # %bb.0: +; RV32I-NEXT: ori a1, a1, 64 +; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_32_mask_or_64_or_32: +; RV64I: # %bb.0: +; RV64I-NEXT: ori a1, a1, 64 +; RV64I-NEXT: srlw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_32_mask_or_64_or_32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: ori a1, a1, 64 +; RV32ZBB-NEXT: srl a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_32_mask_or_64_or_32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: ori a1, a1, 64 +; RV64ZBB-NEXT: srlw a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_32_mask_or_64_or_32: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: ori a1, a1, 64 +; RV32XTHEADBB-NEXT: srl a0, a0, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_32_mask_or_64_or_32: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: ori a1, a1, 64 +; RV64XTHEADBB-NEXT: srlw a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %a = or i32 %y, 64 + %b = lshr i32 %x, %a + %c = sub i32 0, %y + %d = or i32 %c, 32 + %e = shl i32 %x, %d + 
%f = or i32 %b, %e + ret i32 %f +} + +define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: rotl_64_mask: +; RV32I: # %bb.0: +; RV32I-NEXT: li a5, 32 +; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: bltu a2, a5, .LBB10_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: sll t0, a0, a2 +; RV32I-NEXT: j .LBB10_3 +; RV32I-NEXT: .LBB10_2: +; RV32I-NEXT: sll a3, a0, a2 +; RV32I-NEXT: neg a6, a2 +; RV32I-NEXT: srl a6, a0, a6 +; RV32I-NEXT: sll a7, a1, a2 +; RV32I-NEXT: or t0, a6, a7 +; RV32I-NEXT: .LBB10_3: +; RV32I-NEXT: andi a7, a4, 63 +; RV32I-NEXT: mv a6, a1 +; RV32I-NEXT: beqz a2, .LBB10_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a6, t0 +; RV32I-NEXT: .LBB10_5: +; RV32I-NEXT: bltu a7, a5, .LBB10_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srl a2, a1, a7 +; RV32I-NEXT: bnez a7, .LBB10_8 +; RV32I-NEXT: j .LBB10_9 +; RV32I-NEXT: .LBB10_7: +; RV32I-NEXT: srl a2, a0, a4 +; RV32I-NEXT: neg t0, a7 +; RV32I-NEXT: sll t0, a1, t0 +; RV32I-NEXT: or a2, a2, t0 +; RV32I-NEXT: beqz a7, .LBB10_9 +; RV32I-NEXT: .LBB10_8: +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: .LBB10_9: +; RV32I-NEXT: bltu a7, a5, .LBB10_11 +; RV32I-NEXT: # %bb.10: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: j .LBB10_12 +; RV32I-NEXT: .LBB10_11: +; RV32I-NEXT: srl a1, a1, a4 +; RV32I-NEXT: .LBB10_12: +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a1, a6, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_64_mask: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: sll a1, a0, a1 +; RV64I-NEXT: srl a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_64_mask: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: li a5, 32 +; RV32ZBB-NEXT: neg a4, a2 +; RV32ZBB-NEXT: bltu a2, a5, .LBB10_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: li a3, 0 +; RV32ZBB-NEXT: sll t0, a0, a2 +; RV32ZBB-NEXT: j .LBB10_3 +; RV32ZBB-NEXT: .LBB10_2: +; RV32ZBB-NEXT: sll a3, a0, a2 +; RV32ZBB-NEXT: neg a6, a2 +; RV32ZBB-NEXT: srl a6, a0, a6 +; RV32ZBB-NEXT: sll a7, a1, a2 +; RV32ZBB-NEXT: or t0, a6, a7 +; RV32ZBB-NEXT: .LBB10_3: +; RV32ZBB-NEXT: andi a7, a4, 63 +; RV32ZBB-NEXT: mv a6, a1 +; RV32ZBB-NEXT: beqz a2, .LBB10_5 +; RV32ZBB-NEXT: # %bb.4: +; RV32ZBB-NEXT: mv a6, t0 +; RV32ZBB-NEXT: .LBB10_5: +; RV32ZBB-NEXT: bltu a7, a5, .LBB10_7 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: srl a2, a1, a7 +; RV32ZBB-NEXT: bnez a7, .LBB10_8 +; RV32ZBB-NEXT: j .LBB10_9 +; RV32ZBB-NEXT: .LBB10_7: +; RV32ZBB-NEXT: srl a2, a0, a4 +; RV32ZBB-NEXT: neg t0, a7 +; RV32ZBB-NEXT: sll t0, a1, t0 +; RV32ZBB-NEXT: or a2, a2, t0 +; RV32ZBB-NEXT: beqz a7, .LBB10_9 +; RV32ZBB-NEXT: .LBB10_8: +; RV32ZBB-NEXT: mv a0, a2 +; RV32ZBB-NEXT: .LBB10_9: +; RV32ZBB-NEXT: bltu a7, a5, .LBB10_11 +; RV32ZBB-NEXT: # %bb.10: +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: j .LBB10_12 +; RV32ZBB-NEXT: .LBB10_11: +; RV32ZBB-NEXT: srl a1, a1, a4 +; RV32ZBB-NEXT: .LBB10_12: +; RV32ZBB-NEXT: or a0, a3, a0 +; RV32ZBB-NEXT: or a1, a6, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_64_mask: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a2, a1 +; RV64ZBB-NEXT: sll a1, a0, a1 +; RV64ZBB-NEXT: srl a0, a0, a2 +; RV64ZBB-NEXT: or a0, a1, a0 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_64_mask: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: li a5, 32 +; RV32XTHEADBB-NEXT: neg a4, a2 +; RV32XTHEADBB-NEXT: bltu a2, a5, .LBB10_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: li a3, 0 +; RV32XTHEADBB-NEXT: sll t0, a0, a2 +; RV32XTHEADBB-NEXT: j .LBB10_3 +; RV32XTHEADBB-NEXT: .LBB10_2: +; RV32XTHEADBB-NEXT: sll a3, a0, a2 +; RV32XTHEADBB-NEXT: neg a6, a2 +; RV32XTHEADBB-NEXT: srl a6, a0, 
a6 +; RV32XTHEADBB-NEXT: sll a7, a1, a2 +; RV32XTHEADBB-NEXT: or t0, a6, a7 +; RV32XTHEADBB-NEXT: .LBB10_3: +; RV32XTHEADBB-NEXT: andi a7, a4, 63 +; RV32XTHEADBB-NEXT: mv a6, a1 +; RV32XTHEADBB-NEXT: beqz a2, .LBB10_5 +; RV32XTHEADBB-NEXT: # %bb.4: +; RV32XTHEADBB-NEXT: mv a6, t0 +; RV32XTHEADBB-NEXT: .LBB10_5: +; RV32XTHEADBB-NEXT: bltu a7, a5, .LBB10_7 +; RV32XTHEADBB-NEXT: # %bb.6: +; RV32XTHEADBB-NEXT: srl a2, a1, a7 +; RV32XTHEADBB-NEXT: bnez a7, .LBB10_8 +; RV32XTHEADBB-NEXT: j .LBB10_9 +; RV32XTHEADBB-NEXT: .LBB10_7: +; RV32XTHEADBB-NEXT: srl a2, a0, a4 +; RV32XTHEADBB-NEXT: neg t0, a7 +; RV32XTHEADBB-NEXT: sll t0, a1, t0 +; RV32XTHEADBB-NEXT: or a2, a2, t0 +; RV32XTHEADBB-NEXT: beqz a7, .LBB10_9 +; RV32XTHEADBB-NEXT: .LBB10_8: +; RV32XTHEADBB-NEXT: mv a0, a2 +; RV32XTHEADBB-NEXT: .LBB10_9: +; RV32XTHEADBB-NEXT: bltu a7, a5, .LBB10_11 +; RV32XTHEADBB-NEXT: # %bb.10: +; RV32XTHEADBB-NEXT: li a1, 0 +; RV32XTHEADBB-NEXT: j .LBB10_12 +; RV32XTHEADBB-NEXT: .LBB10_11: +; RV32XTHEADBB-NEXT: srl a1, a1, a4 +; RV32XTHEADBB-NEXT: .LBB10_12: +; RV32XTHEADBB-NEXT: or a0, a3, a0 +; RV32XTHEADBB-NEXT: or a1, a6, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_64_mask: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: sll a1, a0, a1 +; RV64XTHEADBB-NEXT: srl a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 +; RV64XTHEADBB-NEXT: ret + %z = sub i64 0, %y + %and = and i64 %z, 63 + %b = shl i64 %x, %y + %c = lshr i64 %x, %and + %d = or i64 %b, %c + ret i64 %d +} + +define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: rotl_64_mask_and_127_and_63: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a6, a2, 127 +; RV32I-NEXT: li a4, 32 +; RV32I-NEXT: bltu a6, a4, .LBB11_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: sll a7, a0, a6 +; RV32I-NEXT: mv a5, a1 +; RV32I-NEXT: bnez a6, .LBB11_3 +; RV32I-NEXT: j .LBB11_4 +; RV32I-NEXT: .LBB11_2: +; RV32I-NEXT: sll a3, a0, a2 +; RV32I-NEXT: neg a5, a6 +; RV32I-NEXT: srl a5, a0, a5 +; RV32I-NEXT: sll a7, a1, a2 +; RV32I-NEXT: or a7, a5, a7 +; RV32I-NEXT: mv a5, a1 +; RV32I-NEXT: beqz a6, .LBB11_4 +; RV32I-NEXT: .LBB11_3: +; RV32I-NEXT: mv a5, a7 +; RV32I-NEXT: .LBB11_4: +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: andi a6, a2, 63 +; RV32I-NEXT: bltu a6, a4, .LBB11_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: srl a7, a1, a6 +; RV32I-NEXT: bnez a6, .LBB11_7 +; RV32I-NEXT: j .LBB11_8 +; RV32I-NEXT: .LBB11_6: +; RV32I-NEXT: srl a7, a0, a2 +; RV32I-NEXT: neg t0, a6 +; RV32I-NEXT: sll t0, a1, t0 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: beqz a6, .LBB11_8 +; RV32I-NEXT: .LBB11_7: +; RV32I-NEXT: mv a0, a7 +; RV32I-NEXT: .LBB11_8: +; RV32I-NEXT: bltu a6, a4, .LBB11_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: j .LBB11_11 +; RV32I-NEXT: .LBB11_10: +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: .LBB11_11: +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_64_mask_and_127_and_63: +; RV64I: # %bb.0: +; RV64I-NEXT: sll a2, a0, a1 +; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: srl a0, a0, a1 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_64_mask_and_127_and_63: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: andi a6, a2, 127 +; RV32ZBB-NEXT: li a4, 32 +; RV32ZBB-NEXT: bltu a6, a4, .LBB11_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: li a3, 0 +; RV32ZBB-NEXT: sll a7, a0, a6 +; RV32ZBB-NEXT: mv a5, a1 +; RV32ZBB-NEXT: bnez a6, .LBB11_3 +; RV32ZBB-NEXT: j .LBB11_4 +; RV32ZBB-NEXT: .LBB11_2: +; RV32ZBB-NEXT: sll a3, a0, a2 +; 
RV32ZBB-NEXT: neg a5, a6 +; RV32ZBB-NEXT: srl a5, a0, a5 +; RV32ZBB-NEXT: sll a7, a1, a2 +; RV32ZBB-NEXT: or a7, a5, a7 +; RV32ZBB-NEXT: mv a5, a1 +; RV32ZBB-NEXT: beqz a6, .LBB11_4 +; RV32ZBB-NEXT: .LBB11_3: +; RV32ZBB-NEXT: mv a5, a7 +; RV32ZBB-NEXT: .LBB11_4: +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: andi a6, a2, 63 +; RV32ZBB-NEXT: bltu a6, a4, .LBB11_6 +; RV32ZBB-NEXT: # %bb.5: +; RV32ZBB-NEXT: srl a7, a1, a6 +; RV32ZBB-NEXT: bnez a6, .LBB11_7 +; RV32ZBB-NEXT: j .LBB11_8 +; RV32ZBB-NEXT: .LBB11_6: +; RV32ZBB-NEXT: srl a7, a0, a2 +; RV32ZBB-NEXT: neg t0, a6 +; RV32ZBB-NEXT: sll t0, a1, t0 +; RV32ZBB-NEXT: or a7, a7, t0 +; RV32ZBB-NEXT: beqz a6, .LBB11_8 +; RV32ZBB-NEXT: .LBB11_7: +; RV32ZBB-NEXT: mv a0, a7 +; RV32ZBB-NEXT: .LBB11_8: +; RV32ZBB-NEXT: bltu a6, a4, .LBB11_10 +; RV32ZBB-NEXT: # %bb.9: +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: j .LBB11_11 +; RV32ZBB-NEXT: .LBB11_10: +; RV32ZBB-NEXT: srl a1, a1, a2 +; RV32ZBB-NEXT: .LBB11_11: +; RV32ZBB-NEXT: or a0, a3, a0 +; RV32ZBB-NEXT: or a1, a5, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_64_mask_and_127_and_63: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sll a2, a0, a1 +; RV64ZBB-NEXT: neg a1, a1 +; RV64ZBB-NEXT: srl a0, a0, a1 +; RV64ZBB-NEXT: or a0, a2, a0 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_64_mask_and_127_and_63: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi a6, a2, 127 +; RV32XTHEADBB-NEXT: li a4, 32 +; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB11_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: li a3, 0 +; RV32XTHEADBB-NEXT: sll a7, a0, a6 +; RV32XTHEADBB-NEXT: mv a5, a1 +; RV32XTHEADBB-NEXT: bnez a6, .LBB11_3 +; RV32XTHEADBB-NEXT: j .LBB11_4 +; RV32XTHEADBB-NEXT: .LBB11_2: +; RV32XTHEADBB-NEXT: sll a3, a0, a2 +; RV32XTHEADBB-NEXT: neg a5, a6 +; RV32XTHEADBB-NEXT: srl a5, a0, a5 +; RV32XTHEADBB-NEXT: sll a7, a1, a2 +; RV32XTHEADBB-NEXT: or a7, a5, a7 +; RV32XTHEADBB-NEXT: mv a5, a1 +; RV32XTHEADBB-NEXT: beqz a6, .LBB11_4 +; RV32XTHEADBB-NEXT: .LBB11_3: +; RV32XTHEADBB-NEXT: mv a5, a7 +; RV32XTHEADBB-NEXT: .LBB11_4: +; RV32XTHEADBB-NEXT: neg a2, a2 +; RV32XTHEADBB-NEXT: andi a6, a2, 63 +; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB11_6 +; RV32XTHEADBB-NEXT: # %bb.5: +; RV32XTHEADBB-NEXT: srl a7, a1, a6 +; RV32XTHEADBB-NEXT: bnez a6, .LBB11_7 +; RV32XTHEADBB-NEXT: j .LBB11_8 +; RV32XTHEADBB-NEXT: .LBB11_6: +; RV32XTHEADBB-NEXT: srl a7, a0, a2 +; RV32XTHEADBB-NEXT: neg t0, a6 +; RV32XTHEADBB-NEXT: sll t0, a1, t0 +; RV32XTHEADBB-NEXT: or a7, a7, t0 +; RV32XTHEADBB-NEXT: beqz a6, .LBB11_8 +; RV32XTHEADBB-NEXT: .LBB11_7: +; RV32XTHEADBB-NEXT: mv a0, a7 +; RV32XTHEADBB-NEXT: .LBB11_8: +; RV32XTHEADBB-NEXT: bltu a6, a4, .LBB11_10 +; RV32XTHEADBB-NEXT: # %bb.9: +; RV32XTHEADBB-NEXT: li a1, 0 +; RV32XTHEADBB-NEXT: j .LBB11_11 +; RV32XTHEADBB-NEXT: .LBB11_10: +; RV32XTHEADBB-NEXT: srl a1, a1, a2 +; RV32XTHEADBB-NEXT: .LBB11_11: +; RV32XTHEADBB-NEXT: or a0, a3, a0 +; RV32XTHEADBB-NEXT: or a1, a5, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: sll a2, a0, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 +; RV64XTHEADBB-NEXT: srl a0, a0, a1 +; RV64XTHEADBB-NEXT: or a0, a2, a0 +; RV64XTHEADBB-NEXT: ret + %a = and i64 %y, 127 + %b = shl i64 %x, %a + %c = sub i64 0, %y + %d = and i64 %c, 63 + %e = lshr i64 %x, %d + %f = or i64 %b, %e + ret i64 %f +} + +define i64 @rotl_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: rotl_64_mask_or_128_or_64: +; RV32I: # %bb.0: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: ori a2, a2, 128 +; RV32I-NEXT: li a0, 32 
+; RV32I-NEXT: bltu a2, a0, .LBB12_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: sll a3, a3, a2 +; RV32I-NEXT: bnez a2, .LBB12_3 +; RV32I-NEXT: j .LBB12_4 +; RV32I-NEXT: .LBB12_2: +; RV32I-NEXT: sll a0, a3, a2 +; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: srl a3, a3, a4 +; RV32I-NEXT: sll a4, a1, a2 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: beqz a2, .LBB12_4 +; RV32I-NEXT: .LBB12_3: +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: .LBB12_4: +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_64_mask_or_128_or_64: +; RV64I: # %bb.0: +; RV64I-NEXT: ori a1, a1, 128 +; RV64I-NEXT: sll a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_64_mask_or_128_or_64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: ori a2, a2, 128 +; RV32ZBB-NEXT: li a0, 32 +; RV32ZBB-NEXT: bltu a2, a0, .LBB12_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: li a0, 0 +; RV32ZBB-NEXT: sll a3, a3, a2 +; RV32ZBB-NEXT: bnez a2, .LBB12_3 +; RV32ZBB-NEXT: j .LBB12_4 +; RV32ZBB-NEXT: .LBB12_2: +; RV32ZBB-NEXT: sll a0, a3, a2 +; RV32ZBB-NEXT: neg a4, a2 +; RV32ZBB-NEXT: srl a3, a3, a4 +; RV32ZBB-NEXT: sll a4, a1, a2 +; RV32ZBB-NEXT: or a3, a3, a4 +; RV32ZBB-NEXT: beqz a2, .LBB12_4 +; RV32ZBB-NEXT: .LBB12_3: +; RV32ZBB-NEXT: mv a1, a3 +; RV32ZBB-NEXT: .LBB12_4: +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_64_mask_or_128_or_64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: ori a1, a1, 128 +; RV64ZBB-NEXT: sll a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_64_mask_or_128_or_64: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: mv a3, a0 +; RV32XTHEADBB-NEXT: ori a2, a2, 128 +; RV32XTHEADBB-NEXT: li a0, 32 +; RV32XTHEADBB-NEXT: bltu a2, a0, .LBB12_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: li a0, 0 +; RV32XTHEADBB-NEXT: sll a3, a3, a2 +; RV32XTHEADBB-NEXT: bnez a2, .LBB12_3 +; RV32XTHEADBB-NEXT: j .LBB12_4 +; RV32XTHEADBB-NEXT: .LBB12_2: +; RV32XTHEADBB-NEXT: sll a0, a3, a2 +; RV32XTHEADBB-NEXT: neg a4, a2 +; RV32XTHEADBB-NEXT: srl a3, a3, a4 +; RV32XTHEADBB-NEXT: sll a4, a1, a2 +; RV32XTHEADBB-NEXT: or a3, a3, a4 +; RV32XTHEADBB-NEXT: beqz a2, .LBB12_4 +; RV32XTHEADBB-NEXT: .LBB12_3: +; RV32XTHEADBB-NEXT: mv a1, a3 +; RV32XTHEADBB-NEXT: .LBB12_4: +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_64_mask_or_128_or_64: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: ori a1, a1, 128 +; RV64XTHEADBB-NEXT: sll a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %a = or i64 %y, 128 + %b = shl i64 %x, %a + %c = sub i64 0, %y + %d = or i64 %c, 64 + %e = lshr i64 %x, %d + %f = or i64 %b, %e + ret i64 %f +} + +define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: rotr_64_mask: +; RV32I: # %bb.0: +; RV32I-NEXT: li a4, 32 +; RV32I-NEXT: bltu a2, a4, .LBB13_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a5, a1, a2 +; RV32I-NEXT: j .LBB13_3 +; RV32I-NEXT: .LBB13_2: +; RV32I-NEXT: srl a3, a0, a2 +; RV32I-NEXT: neg a5, a2 +; RV32I-NEXT: sll a5, a1, a5 +; RV32I-NEXT: or a5, a3, a5 +; RV32I-NEXT: .LBB13_3: +; RV32I-NEXT: neg a6, a2 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: beqz a2, .LBB13_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: .LBB13_5: +; RV32I-NEXT: andi a5, a6, 63 +; RV32I-NEXT: bltu a2, a4, .LBB13_8 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: li a2, 0 +; RV32I-NEXT: bgeu a5, a4, .LBB13_9 +; RV32I-NEXT: .LBB13_7: +; RV32I-NEXT: sll a4, a0, a6 +; RV32I-NEXT: neg a7, a5 +; RV32I-NEXT: srl a0, a0, a7 +; RV32I-NEXT: sll a6, a1, a6 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: bnez a5, .LBB13_10 +; RV32I-NEXT: j .LBB13_11 +; RV32I-NEXT: .LBB13_8: +; RV32I-NEXT: srl a2, a1, a2 +; RV32I-NEXT: 
bltu a5, a4, .LBB13_7 +; RV32I-NEXT: .LBB13_9: +; RV32I-NEXT: li a4, 0 +; RV32I-NEXT: sll a0, a0, a5 +; RV32I-NEXT: beqz a5, .LBB13_11 +; RV32I-NEXT: .LBB13_10: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB13_11: +; RV32I-NEXT: or a0, a3, a4 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_64_mask: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: srl a1, a0, a1 +; RV64I-NEXT: sll a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_64_mask: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: li a4, 32 +; RV32ZBB-NEXT: bltu a2, a4, .LBB13_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: srl a5, a1, a2 +; RV32ZBB-NEXT: j .LBB13_3 +; RV32ZBB-NEXT: .LBB13_2: +; RV32ZBB-NEXT: srl a3, a0, a2 +; RV32ZBB-NEXT: neg a5, a2 +; RV32ZBB-NEXT: sll a5, a1, a5 +; RV32ZBB-NEXT: or a5, a3, a5 +; RV32ZBB-NEXT: .LBB13_3: +; RV32ZBB-NEXT: neg a6, a2 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: beqz a2, .LBB13_5 +; RV32ZBB-NEXT: # %bb.4: +; RV32ZBB-NEXT: mv a3, a5 +; RV32ZBB-NEXT: .LBB13_5: +; RV32ZBB-NEXT: andi a5, a6, 63 +; RV32ZBB-NEXT: bltu a2, a4, .LBB13_8 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: li a2, 0 +; RV32ZBB-NEXT: bgeu a5, a4, .LBB13_9 +; RV32ZBB-NEXT: .LBB13_7: +; RV32ZBB-NEXT: sll a4, a0, a6 +; RV32ZBB-NEXT: neg a7, a5 +; RV32ZBB-NEXT: srl a0, a0, a7 +; RV32ZBB-NEXT: sll a6, a1, a6 +; RV32ZBB-NEXT: or a0, a0, a6 +; RV32ZBB-NEXT: bnez a5, .LBB13_10 +; RV32ZBB-NEXT: j .LBB13_11 +; RV32ZBB-NEXT: .LBB13_8: +; RV32ZBB-NEXT: srl a2, a1, a2 +; RV32ZBB-NEXT: bltu a5, a4, .LBB13_7 +; RV32ZBB-NEXT: .LBB13_9: +; RV32ZBB-NEXT: li a4, 0 +; RV32ZBB-NEXT: sll a0, a0, a5 +; RV32ZBB-NEXT: beqz a5, .LBB13_11 +; RV32ZBB-NEXT: .LBB13_10: +; RV32ZBB-NEXT: mv a1, a0 +; RV32ZBB-NEXT: .LBB13_11: +; RV32ZBB-NEXT: or a0, a3, a4 +; RV32ZBB-NEXT: or a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_64_mask: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: neg a2, a1 +; RV64ZBB-NEXT: srl a1, a0, a1 +; RV64ZBB-NEXT: sll a0, a0, a2 +; RV64ZBB-NEXT: or a0, a1, a0 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_64_mask: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: li a4, 32 +; RV32XTHEADBB-NEXT: bltu a2, a4, .LBB13_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: srl a5, a1, a2 +; RV32XTHEADBB-NEXT: j .LBB13_3 +; RV32XTHEADBB-NEXT: .LBB13_2: +; RV32XTHEADBB-NEXT: srl a3, a0, a2 +; RV32XTHEADBB-NEXT: neg a5, a2 +; RV32XTHEADBB-NEXT: sll a5, a1, a5 +; RV32XTHEADBB-NEXT: or a5, a3, a5 +; RV32XTHEADBB-NEXT: .LBB13_3: +; RV32XTHEADBB-NEXT: neg a6, a2 +; RV32XTHEADBB-NEXT: mv a3, a0 +; RV32XTHEADBB-NEXT: beqz a2, .LBB13_5 +; RV32XTHEADBB-NEXT: # %bb.4: +; RV32XTHEADBB-NEXT: mv a3, a5 +; RV32XTHEADBB-NEXT: .LBB13_5: +; RV32XTHEADBB-NEXT: andi a5, a6, 63 +; RV32XTHEADBB-NEXT: bltu a2, a4, .LBB13_8 +; RV32XTHEADBB-NEXT: # %bb.6: +; RV32XTHEADBB-NEXT: li a2, 0 +; RV32XTHEADBB-NEXT: bgeu a5, a4, .LBB13_9 +; RV32XTHEADBB-NEXT: .LBB13_7: +; RV32XTHEADBB-NEXT: sll a4, a0, a6 +; RV32XTHEADBB-NEXT: neg a7, a5 +; RV32XTHEADBB-NEXT: srl a0, a0, a7 +; RV32XTHEADBB-NEXT: sll a6, a1, a6 +; RV32XTHEADBB-NEXT: or a0, a0, a6 +; RV32XTHEADBB-NEXT: bnez a5, .LBB13_10 +; RV32XTHEADBB-NEXT: j .LBB13_11 +; RV32XTHEADBB-NEXT: .LBB13_8: +; RV32XTHEADBB-NEXT: srl a2, a1, a2 +; RV32XTHEADBB-NEXT: bltu a5, a4, .LBB13_7 +; RV32XTHEADBB-NEXT: .LBB13_9: +; RV32XTHEADBB-NEXT: li a4, 0 +; RV32XTHEADBB-NEXT: sll a0, a0, a5 +; RV32XTHEADBB-NEXT: beqz a5, .LBB13_11 +; RV32XTHEADBB-NEXT: .LBB13_10: +; RV32XTHEADBB-NEXT: mv a1, a0 +; RV32XTHEADBB-NEXT: .LBB13_11: +; RV32XTHEADBB-NEXT: or a0, a3, a4 +; 
RV32XTHEADBB-NEXT: or a1, a2, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_64_mask: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: srl a1, a0, a1 +; RV64XTHEADBB-NEXT: sll a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 +; RV64XTHEADBB-NEXT: ret + %z = sub i64 0, %y + %and = and i64 %z, 63 + %b = lshr i64 %x, %y + %c = shl i64 %x, %and + %d = or i64 %b, %c + ret i64 %d +} + +define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: rotr_64_mask_and_127_and_63: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a4, a2, 127 +; RV32I-NEXT: li a5, 32 +; RV32I-NEXT: bltu a4, a5, .LBB14_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a6, a1, a4 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: bnez a4, .LBB14_3 +; RV32I-NEXT: j .LBB14_4 +; RV32I-NEXT: .LBB14_2: +; RV32I-NEXT: srl a3, a0, a2 +; RV32I-NEXT: neg a6, a4 +; RV32I-NEXT: sll a6, a1, a6 +; RV32I-NEXT: or a6, a3, a6 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: beqz a4, .LBB14_4 +; RV32I-NEXT: .LBB14_3: +; RV32I-NEXT: mv a3, a6 +; RV32I-NEXT: .LBB14_4: +; RV32I-NEXT: bltu a4, a5, .LBB14_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: li a4, 0 +; RV32I-NEXT: j .LBB14_7 +; RV32I-NEXT: .LBB14_6: +; RV32I-NEXT: srl a4, a1, a2 +; RV32I-NEXT: .LBB14_7: +; RV32I-NEXT: neg a7, a2 +; RV32I-NEXT: andi a6, a7, 63 +; RV32I-NEXT: bltu a6, a5, .LBB14_9 +; RV32I-NEXT: # %bb.8: +; RV32I-NEXT: li a2, 0 +; RV32I-NEXT: sll a0, a0, a6 +; RV32I-NEXT: bnez a6, .LBB14_10 +; RV32I-NEXT: j .LBB14_11 +; RV32I-NEXT: .LBB14_9: +; RV32I-NEXT: sll a2, a0, a7 +; RV32I-NEXT: neg a5, a6 +; RV32I-NEXT: srl a0, a0, a5 +; RV32I-NEXT: sll a5, a1, a7 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: beqz a6, .LBB14_11 +; RV32I-NEXT: .LBB14_10: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB14_11: +; RV32I-NEXT: or a0, a3, a2 +; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_64_mask_and_127_and_63: +; RV64I: # %bb.0: +; RV64I-NEXT: srl a2, a0, a1 +; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: sll a0, a0, a1 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_64_mask_and_127_and_63: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: andi a4, a2, 127 +; RV32ZBB-NEXT: li a5, 32 +; RV32ZBB-NEXT: bltu a4, a5, .LBB14_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: srl a6, a1, a4 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: bnez a4, .LBB14_3 +; RV32ZBB-NEXT: j .LBB14_4 +; RV32ZBB-NEXT: .LBB14_2: +; RV32ZBB-NEXT: srl a3, a0, a2 +; RV32ZBB-NEXT: neg a6, a4 +; RV32ZBB-NEXT: sll a6, a1, a6 +; RV32ZBB-NEXT: or a6, a3, a6 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: beqz a4, .LBB14_4 +; RV32ZBB-NEXT: .LBB14_3: +; RV32ZBB-NEXT: mv a3, a6 +; RV32ZBB-NEXT: .LBB14_4: +; RV32ZBB-NEXT: bltu a4, a5, .LBB14_6 +; RV32ZBB-NEXT: # %bb.5: +; RV32ZBB-NEXT: li a4, 0 +; RV32ZBB-NEXT: j .LBB14_7 +; RV32ZBB-NEXT: .LBB14_6: +; RV32ZBB-NEXT: srl a4, a1, a2 +; RV32ZBB-NEXT: .LBB14_7: +; RV32ZBB-NEXT: neg a7, a2 +; RV32ZBB-NEXT: andi a6, a7, 63 +; RV32ZBB-NEXT: bltu a6, a5, .LBB14_9 +; RV32ZBB-NEXT: # %bb.8: +; RV32ZBB-NEXT: li a2, 0 +; RV32ZBB-NEXT: sll a0, a0, a6 +; RV32ZBB-NEXT: bnez a6, .LBB14_10 +; RV32ZBB-NEXT: j .LBB14_11 +; RV32ZBB-NEXT: .LBB14_9: +; RV32ZBB-NEXT: sll a2, a0, a7 +; RV32ZBB-NEXT: neg a5, a6 +; RV32ZBB-NEXT: srl a0, a0, a5 +; RV32ZBB-NEXT: sll a5, a1, a7 +; RV32ZBB-NEXT: or a0, a0, a5 +; RV32ZBB-NEXT: beqz a6, .LBB14_11 +; RV32ZBB-NEXT: .LBB14_10: +; RV32ZBB-NEXT: mv a1, a0 +; RV32ZBB-NEXT: .LBB14_11: +; RV32ZBB-NEXT: or a0, a3, a2 +; RV32ZBB-NEXT: or a1, a4, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_64_mask_and_127_and_63: +; 
RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: srl a2, a0, a1 +; RV64ZBB-NEXT: neg a1, a1 +; RV64ZBB-NEXT: sll a0, a0, a1 +; RV64ZBB-NEXT: or a0, a2, a0 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_64_mask_and_127_and_63: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi a4, a2, 127 +; RV32XTHEADBB-NEXT: li a5, 32 +; RV32XTHEADBB-NEXT: bltu a4, a5, .LBB14_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: srl a6, a1, a4 +; RV32XTHEADBB-NEXT: mv a3, a0 +; RV32XTHEADBB-NEXT: bnez a4, .LBB14_3 +; RV32XTHEADBB-NEXT: j .LBB14_4 +; RV32XTHEADBB-NEXT: .LBB14_2: +; RV32XTHEADBB-NEXT: srl a3, a0, a2 +; RV32XTHEADBB-NEXT: neg a6, a4 +; RV32XTHEADBB-NEXT: sll a6, a1, a6 +; RV32XTHEADBB-NEXT: or a6, a3, a6 +; RV32XTHEADBB-NEXT: mv a3, a0 +; RV32XTHEADBB-NEXT: beqz a4, .LBB14_4 +; RV32XTHEADBB-NEXT: .LBB14_3: +; RV32XTHEADBB-NEXT: mv a3, a6 +; RV32XTHEADBB-NEXT: .LBB14_4: +; RV32XTHEADBB-NEXT: bltu a4, a5, .LBB14_6 +; RV32XTHEADBB-NEXT: # %bb.5: +; RV32XTHEADBB-NEXT: li a4, 0 +; RV32XTHEADBB-NEXT: j .LBB14_7 +; RV32XTHEADBB-NEXT: .LBB14_6: +; RV32XTHEADBB-NEXT: srl a4, a1, a2 +; RV32XTHEADBB-NEXT: .LBB14_7: +; RV32XTHEADBB-NEXT: neg a7, a2 +; RV32XTHEADBB-NEXT: andi a6, a7, 63 +; RV32XTHEADBB-NEXT: bltu a6, a5, .LBB14_9 +; RV32XTHEADBB-NEXT: # %bb.8: +; RV32XTHEADBB-NEXT: li a2, 0 +; RV32XTHEADBB-NEXT: sll a0, a0, a6 +; RV32XTHEADBB-NEXT: bnez a6, .LBB14_10 +; RV32XTHEADBB-NEXT: j .LBB14_11 +; RV32XTHEADBB-NEXT: .LBB14_9: +; RV32XTHEADBB-NEXT: sll a2, a0, a7 +; RV32XTHEADBB-NEXT: neg a5, a6 +; RV32XTHEADBB-NEXT: srl a0, a0, a5 +; RV32XTHEADBB-NEXT: sll a5, a1, a7 +; RV32XTHEADBB-NEXT: or a0, a0, a5 +; RV32XTHEADBB-NEXT: beqz a6, .LBB14_11 +; RV32XTHEADBB-NEXT: .LBB14_10: +; RV32XTHEADBB-NEXT: mv a1, a0 +; RV32XTHEADBB-NEXT: .LBB14_11: +; RV32XTHEADBB-NEXT: or a0, a3, a2 +; RV32XTHEADBB-NEXT: or a1, a4, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: srl a2, a0, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 +; RV64XTHEADBB-NEXT: sll a0, a0, a1 +; RV64XTHEADBB-NEXT: or a0, a2, a0 +; RV64XTHEADBB-NEXT: ret + %a = and i64 %y, 127 + %b = lshr i64 %x, %a + %c = sub i64 0, %y + %d = and i64 %c, 63 + %e = shl i64 %x, %d + %f = or i64 %b, %e + ret i64 %f +} + +define i64 @rotr_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: rotr_64_mask_or_128_or_64: +; RV32I: # %bb.0: +; RV32I-NEXT: ori a2, a2, 128 +; RV32I-NEXT: li a3, 32 +; RV32I-NEXT: bltu a2, a3, .LBB15_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a4, a1, a2 +; RV32I-NEXT: bnez a2, .LBB15_3 +; RV32I-NEXT: j .LBB15_4 +; RV32I-NEXT: .LBB15_2: +; RV32I-NEXT: srl a4, a0, a2 +; RV32I-NEXT: neg a5, a2 +; RV32I-NEXT: sll a5, a1, a5 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: beqz a2, .LBB15_4 +; RV32I-NEXT: .LBB15_3: +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: .LBB15_4: +; RV32I-NEXT: bltu a2, a3, .LBB15_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB15_6: +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_64_mask_or_128_or_64: +; RV64I: # %bb.0: +; RV64I-NEXT: ori a1, a1, 128 +; RV64I-NEXT: srl a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_64_mask_or_128_or_64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: ori a2, a2, 128 +; RV32ZBB-NEXT: li a3, 32 +; RV32ZBB-NEXT: bltu a2, a3, .LBB15_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: srl a4, a1, a2 +; RV32ZBB-NEXT: bnez a2, .LBB15_3 +; RV32ZBB-NEXT: j .LBB15_4 +; RV32ZBB-NEXT: .LBB15_2: +; RV32ZBB-NEXT: srl a4, a0, a2 +; RV32ZBB-NEXT: neg a5, a2 +; RV32ZBB-NEXT: sll 
a5, a1, a5 +; RV32ZBB-NEXT: or a4, a4, a5 +; RV32ZBB-NEXT: beqz a2, .LBB15_4 +; RV32ZBB-NEXT: .LBB15_3: +; RV32ZBB-NEXT: mv a0, a4 +; RV32ZBB-NEXT: .LBB15_4: +; RV32ZBB-NEXT: bltu a2, a3, .LBB15_6 +; RV32ZBB-NEXT: # %bb.5: +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: ret +; RV32ZBB-NEXT: .LBB15_6: +; RV32ZBB-NEXT: srl a1, a1, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_64_mask_or_128_or_64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: ori a1, a1, 128 +; RV64ZBB-NEXT: srl a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_64_mask_or_128_or_64: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: ori a2, a2, 128 +; RV32XTHEADBB-NEXT: li a3, 32 +; RV32XTHEADBB-NEXT: bltu a2, a3, .LBB15_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: srl a4, a1, a2 +; RV32XTHEADBB-NEXT: bnez a2, .LBB15_3 +; RV32XTHEADBB-NEXT: j .LBB15_4 +; RV32XTHEADBB-NEXT: .LBB15_2: +; RV32XTHEADBB-NEXT: srl a4, a0, a2 +; RV32XTHEADBB-NEXT: neg a5, a2 +; RV32XTHEADBB-NEXT: sll a5, a1, a5 +; RV32XTHEADBB-NEXT: or a4, a4, a5 +; RV32XTHEADBB-NEXT: beqz a2, .LBB15_4 +; RV32XTHEADBB-NEXT: .LBB15_3: +; RV32XTHEADBB-NEXT: mv a0, a4 +; RV32XTHEADBB-NEXT: .LBB15_4: +; RV32XTHEADBB-NEXT: bltu a2, a3, .LBB15_6 +; RV32XTHEADBB-NEXT: # %bb.5: +; RV32XTHEADBB-NEXT: li a1, 0 +; RV32XTHEADBB-NEXT: ret +; RV32XTHEADBB-NEXT: .LBB15_6: +; RV32XTHEADBB-NEXT: srl a1, a1, a2 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_64_mask_or_128_or_64: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: ori a1, a1, 128 +; RV64XTHEADBB-NEXT: srl a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %a = or i64 %y, 128 + %b = lshr i64 %x, %a + %c = sub i64 0, %y + %d = or i64 %c, 64 + %e = shl i64 %x, %d + %f = or i64 %b, %e + ret i64 %f +} + +; Test that we're able to remove a mask on the rotate amount that has more than +; one use. 
+define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind { +; RV32I-LABEL: rotl_32_mask_shared: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a3, a2, 31 +; RV32I-NEXT: sll a4, a0, a2 +; RV32I-NEXT: neg a3, a3 +; RV32I-NEXT: srl a0, a0, a3 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: sll a1, a1, a2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_32_mask_shared: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 31 +; RV64I-NEXT: sllw a4, a0, a2 +; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: srlw a0, a0, a3 +; RV64I-NEXT: or a0, a4, a0 +; RV64I-NEXT: sllw a1, a1, a2 +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_32_mask_shared: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: rol a0, a0, a2 +; RV32ZBB-NEXT: sll a1, a1, a2 +; RV32ZBB-NEXT: add a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_32_mask_shared: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: rolw a0, a0, a2 +; RV64ZBB-NEXT: sllw a1, a1, a2 +; RV64ZBB-NEXT: addw a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_32_mask_shared: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi a3, a2, 31 +; RV32XTHEADBB-NEXT: sll a4, a0, a2 +; RV32XTHEADBB-NEXT: neg a3, a3 +; RV32XTHEADBB-NEXT: srl a0, a0, a3 +; RV32XTHEADBB-NEXT: or a0, a4, a0 +; RV32XTHEADBB-NEXT: sll a1, a1, a2 +; RV32XTHEADBB-NEXT: add a0, a0, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_32_mask_shared: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: andi a3, a2, 31 +; RV64XTHEADBB-NEXT: sllw a4, a0, a2 +; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: srlw a0, a0, a3 +; RV64XTHEADBB-NEXT: or a0, a4, a0 +; RV64XTHEADBB-NEXT: sllw a1, a1, a2 +; RV64XTHEADBB-NEXT: addw a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %maskedamt = and i32 %amt, 31 + %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %maskedamt) + %2 = shl i32 %b, %maskedamt + %3 = add i32 %1, %2 + ret i32 %3 +} +declare i32 @llvm.fshl.i32(i32, i32, i32) + +define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 signext %amt) nounwind { +; RV32I-LABEL: rotl_64_mask_shared: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a5, a4, 63 +; RV32I-NEXT: li t0, 32 +; RV32I-NEXT: neg a7, a5 +; RV32I-NEXT: bltu a5, t0, .LBB17_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: sll t3, a0, a5 +; RV32I-NEXT: j .LBB17_3 +; RV32I-NEXT: .LBB17_2: +; RV32I-NEXT: sll a6, a0, a4 +; RV32I-NEXT: srl t1, a0, a7 +; RV32I-NEXT: sll t2, a1, a4 +; RV32I-NEXT: or t3, t1, t2 +; RV32I-NEXT: .LBB17_3: +; RV32I-NEXT: neg t2, a5 +; RV32I-NEXT: mv t1, a1 +; RV32I-NEXT: beqz a5, .LBB17_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv t1, t3 +; RV32I-NEXT: .LBB17_5: +; RV32I-NEXT: andi t3, t2, 63 +; RV32I-NEXT: bltu t3, t0, .LBB17_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srl t4, a1, t3 +; RV32I-NEXT: bnez t3, .LBB17_8 +; RV32I-NEXT: j .LBB17_9 +; RV32I-NEXT: .LBB17_7: +; RV32I-NEXT: srl t4, a0, t2 +; RV32I-NEXT: neg t5, t3 +; RV32I-NEXT: sll t5, a1, t5 +; RV32I-NEXT: or t4, t4, t5 +; RV32I-NEXT: beqz t3, .LBB17_9 +; RV32I-NEXT: .LBB17_8: +; RV32I-NEXT: mv a0, t4 +; RV32I-NEXT: .LBB17_9: +; RV32I-NEXT: bltu t3, t0, .LBB17_12 +; RV32I-NEXT: # %bb.10: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: bgeu a5, t0, .LBB17_13 +; RV32I-NEXT: .LBB17_11: +; RV32I-NEXT: sll t0, a2, a4 +; RV32I-NEXT: srl a2, a2, a7 +; RV32I-NEXT: sll a4, a3, a4 +; RV32I-NEXT: or a2, a2, a4 +; RV32I-NEXT: j .LBB17_14 +; RV32I-NEXT: .LBB17_12: +; RV32I-NEXT: srl a1, a1, t2 +; RV32I-NEXT: bltu a5, t0, .LBB17_11 +; RV32I-NEXT: .LBB17_13: +; RV32I-NEXT: li t0, 0 +; RV32I-NEXT: 
sll a2, a2, a5 +; RV32I-NEXT: .LBB17_14: +; RV32I-NEXT: or a0, a6, a0 +; RV32I-NEXT: or a1, t1, a1 +; RV32I-NEXT: beqz a5, .LBB17_16 +; RV32I-NEXT: # %bb.15: +; RV32I-NEXT: mv a3, a2 +; RV32I-NEXT: .LBB17_16: +; RV32I-NEXT: add a0, a0, t0 +; RV32I-NEXT: sltu a2, a0, t0 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_64_mask_shared: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 63 +; RV64I-NEXT: sll a4, a0, a2 +; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: srl a0, a0, a3 +; RV64I-NEXT: or a0, a4, a0 +; RV64I-NEXT: sll a1, a1, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_64_mask_shared: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: andi a5, a4, 63 +; RV32ZBB-NEXT: li t0, 32 +; RV32ZBB-NEXT: neg a7, a5 +; RV32ZBB-NEXT: bltu a5, t0, .LBB17_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: li a6, 0 +; RV32ZBB-NEXT: sll t3, a0, a5 +; RV32ZBB-NEXT: j .LBB17_3 +; RV32ZBB-NEXT: .LBB17_2: +; RV32ZBB-NEXT: sll a6, a0, a4 +; RV32ZBB-NEXT: srl t1, a0, a7 +; RV32ZBB-NEXT: sll t2, a1, a4 +; RV32ZBB-NEXT: or t3, t1, t2 +; RV32ZBB-NEXT: .LBB17_3: +; RV32ZBB-NEXT: neg t2, a5 +; RV32ZBB-NEXT: mv t1, a1 +; RV32ZBB-NEXT: beqz a5, .LBB17_5 +; RV32ZBB-NEXT: # %bb.4: +; RV32ZBB-NEXT: mv t1, t3 +; RV32ZBB-NEXT: .LBB17_5: +; RV32ZBB-NEXT: andi t3, t2, 63 +; RV32ZBB-NEXT: bltu t3, t0, .LBB17_7 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: srl t4, a1, t3 +; RV32ZBB-NEXT: bnez t3, .LBB17_8 +; RV32ZBB-NEXT: j .LBB17_9 +; RV32ZBB-NEXT: .LBB17_7: +; RV32ZBB-NEXT: srl t4, a0, t2 +; RV32ZBB-NEXT: neg t5, t3 +; RV32ZBB-NEXT: sll t5, a1, t5 +; RV32ZBB-NEXT: or t4, t4, t5 +; RV32ZBB-NEXT: beqz t3, .LBB17_9 +; RV32ZBB-NEXT: .LBB17_8: +; RV32ZBB-NEXT: mv a0, t4 +; RV32ZBB-NEXT: .LBB17_9: +; RV32ZBB-NEXT: bltu t3, t0, .LBB17_12 +; RV32ZBB-NEXT: # %bb.10: +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: bgeu a5, t0, .LBB17_13 +; RV32ZBB-NEXT: .LBB17_11: +; RV32ZBB-NEXT: sll t0, a2, a4 +; RV32ZBB-NEXT: srl a2, a2, a7 +; RV32ZBB-NEXT: sll a4, a3, a4 +; RV32ZBB-NEXT: or a2, a2, a4 +; RV32ZBB-NEXT: j .LBB17_14 +; RV32ZBB-NEXT: .LBB17_12: +; RV32ZBB-NEXT: srl a1, a1, t2 +; RV32ZBB-NEXT: bltu a5, t0, .LBB17_11 +; RV32ZBB-NEXT: .LBB17_13: +; RV32ZBB-NEXT: li t0, 0 +; RV32ZBB-NEXT: sll a2, a2, a5 +; RV32ZBB-NEXT: .LBB17_14: +; RV32ZBB-NEXT: or a0, a6, a0 +; RV32ZBB-NEXT: or a1, t1, a1 +; RV32ZBB-NEXT: beqz a5, .LBB17_16 +; RV32ZBB-NEXT: # %bb.15: +; RV32ZBB-NEXT: mv a3, a2 +; RV32ZBB-NEXT: .LBB17_16: +; RV32ZBB-NEXT: add a0, a0, t0 +; RV32ZBB-NEXT: sltu a2, a0, t0 +; RV32ZBB-NEXT: add a1, a1, a3 +; RV32ZBB-NEXT: add a1, a1, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_64_mask_shared: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: rol a0, a0, a2 +; RV64ZBB-NEXT: sll a1, a1, a2 +; RV64ZBB-NEXT: add a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_64_mask_shared: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi a5, a4, 63 +; RV32XTHEADBB-NEXT: li t0, 32 +; RV32XTHEADBB-NEXT: neg a7, a5 +; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB17_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: li a6, 0 +; RV32XTHEADBB-NEXT: sll t3, a0, a5 +; RV32XTHEADBB-NEXT: j .LBB17_3 +; RV32XTHEADBB-NEXT: .LBB17_2: +; RV32XTHEADBB-NEXT: sll a6, a0, a4 +; RV32XTHEADBB-NEXT: srl t1, a0, a7 +; RV32XTHEADBB-NEXT: sll t2, a1, a4 +; RV32XTHEADBB-NEXT: or t3, t1, t2 +; RV32XTHEADBB-NEXT: .LBB17_3: +; RV32XTHEADBB-NEXT: neg t2, a5 +; RV32XTHEADBB-NEXT: mv t1, a1 +; RV32XTHEADBB-NEXT: beqz a5, .LBB17_5 +; RV32XTHEADBB-NEXT: # %bb.4: +; RV32XTHEADBB-NEXT: mv t1, t3 +; RV32XTHEADBB-NEXT: .LBB17_5: +; 
RV32XTHEADBB-NEXT: andi t3, t2, 63 +; RV32XTHEADBB-NEXT: bltu t3, t0, .LBB17_7 +; RV32XTHEADBB-NEXT: # %bb.6: +; RV32XTHEADBB-NEXT: srl t4, a1, t3 +; RV32XTHEADBB-NEXT: bnez t3, .LBB17_8 +; RV32XTHEADBB-NEXT: j .LBB17_9 +; RV32XTHEADBB-NEXT: .LBB17_7: +; RV32XTHEADBB-NEXT: srl t4, a0, t2 +; RV32XTHEADBB-NEXT: neg t5, t3 +; RV32XTHEADBB-NEXT: sll t5, a1, t5 +; RV32XTHEADBB-NEXT: or t4, t4, t5 +; RV32XTHEADBB-NEXT: beqz t3, .LBB17_9 +; RV32XTHEADBB-NEXT: .LBB17_8: +; RV32XTHEADBB-NEXT: mv a0, t4 +; RV32XTHEADBB-NEXT: .LBB17_9: +; RV32XTHEADBB-NEXT: bltu t3, t0, .LBB17_12 +; RV32XTHEADBB-NEXT: # %bb.10: +; RV32XTHEADBB-NEXT: li a1, 0 +; RV32XTHEADBB-NEXT: bgeu a5, t0, .LBB17_13 +; RV32XTHEADBB-NEXT: .LBB17_11: +; RV32XTHEADBB-NEXT: sll t0, a2, a4 +; RV32XTHEADBB-NEXT: srl a2, a2, a7 +; RV32XTHEADBB-NEXT: sll a4, a3, a4 +; RV32XTHEADBB-NEXT: or a2, a2, a4 +; RV32XTHEADBB-NEXT: j .LBB17_14 +; RV32XTHEADBB-NEXT: .LBB17_12: +; RV32XTHEADBB-NEXT: srl a1, a1, t2 +; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB17_11 +; RV32XTHEADBB-NEXT: .LBB17_13: +; RV32XTHEADBB-NEXT: li t0, 0 +; RV32XTHEADBB-NEXT: sll a2, a2, a5 +; RV32XTHEADBB-NEXT: .LBB17_14: +; RV32XTHEADBB-NEXT: or a0, a6, a0 +; RV32XTHEADBB-NEXT: or a1, t1, a1 +; RV32XTHEADBB-NEXT: beqz a5, .LBB17_16 +; RV32XTHEADBB-NEXT: # %bb.15: +; RV32XTHEADBB-NEXT: mv a3, a2 +; RV32XTHEADBB-NEXT: .LBB17_16: +; RV32XTHEADBB-NEXT: add a0, a0, t0 +; RV32XTHEADBB-NEXT: sltu a2, a0, t0 +; RV32XTHEADBB-NEXT: add a1, a1, a3 +; RV32XTHEADBB-NEXT: add a1, a1, a2 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_64_mask_shared: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: andi a3, a2, 63 +; RV64XTHEADBB-NEXT: sll a4, a0, a2 +; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: srl a0, a0, a3 +; RV64XTHEADBB-NEXT: or a0, a4, a0 +; RV64XTHEADBB-NEXT: sll a1, a1, a2 +; RV64XTHEADBB-NEXT: add a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %maskedamt = and i64 %amt, 63 + %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %maskedamt) + %2 = shl i64 %b, %maskedamt + %3 = add i64 %1, %2 + ret i64 %3 +} +declare i64 @llvm.fshl.i64(i64, i64, i64) + +define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind { +; RV32I-LABEL: rotr_32_mask_shared: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a3, a2, 31 +; RV32I-NEXT: srl a4, a0, a2 +; RV32I-NEXT: neg a3, a3 +; RV32I-NEXT: sll a0, a0, a3 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: sll a1, a1, a2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_32_mask_shared: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 31 +; RV64I-NEXT: srlw a4, a0, a2 +; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: sllw a0, a0, a3 +; RV64I-NEXT: or a0, a4, a0 +; RV64I-NEXT: sllw a1, a1, a2 +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_32_mask_shared: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: ror a0, a0, a2 +; RV32ZBB-NEXT: sll a1, a1, a2 +; RV32ZBB-NEXT: add a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_32_mask_shared: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: rorw a0, a0, a2 +; RV64ZBB-NEXT: sllw a1, a1, a2 +; RV64ZBB-NEXT: addw a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_32_mask_shared: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi a3, a2, 31 +; RV32XTHEADBB-NEXT: srl a4, a0, a2 +; RV32XTHEADBB-NEXT: neg a3, a3 +; RV32XTHEADBB-NEXT: sll a0, a0, a3 +; RV32XTHEADBB-NEXT: or a0, a4, a0 +; RV32XTHEADBB-NEXT: sll a1, a1, a2 +; RV32XTHEADBB-NEXT: add a0, a0, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_32_mask_shared: +; RV64XTHEADBB: # 
%bb.0: +; RV64XTHEADBB-NEXT: andi a3, a2, 31 +; RV64XTHEADBB-NEXT: srlw a4, a0, a2 +; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: sllw a0, a0, a3 +; RV64XTHEADBB-NEXT: or a0, a4, a0 +; RV64XTHEADBB-NEXT: sllw a1, a1, a2 +; RV64XTHEADBB-NEXT: addw a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %maskedamt = and i32 %amt, 31 + %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %maskedamt) + %2 = shl i32 %b, %maskedamt + %3 = add i32 %1, %2 + ret i32 %3 +} +declare i32 @llvm.fshr.i32(i32, i32, i32) + +define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 signext %amt) nounwind { +; RV32I-LABEL: rotr_64_mask_shared: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a5, a4, 63 +; RV32I-NEXT: li t0, 32 +; RV32I-NEXT: neg a6, a5 +; RV32I-NEXT: bltu a5, t0, .LBB19_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl t1, a1, a5 +; RV32I-NEXT: mv a7, a0 +; RV32I-NEXT: bnez a5, .LBB19_3 +; RV32I-NEXT: j .LBB19_4 +; RV32I-NEXT: .LBB19_2: +; RV32I-NEXT: srl a7, a0, a4 +; RV32I-NEXT: sll t1, a1, a6 +; RV32I-NEXT: or t1, a7, t1 +; RV32I-NEXT: mv a7, a0 +; RV32I-NEXT: beqz a5, .LBB19_4 +; RV32I-NEXT: .LBB19_3: +; RV32I-NEXT: mv a7, t1 +; RV32I-NEXT: .LBB19_4: +; RV32I-NEXT: neg t4, a5 +; RV32I-NEXT: bltu a5, t0, .LBB19_7 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: li t1, 0 +; RV32I-NEXT: andi t3, t4, 63 +; RV32I-NEXT: bgeu t3, t0, .LBB19_8 +; RV32I-NEXT: .LBB19_6: +; RV32I-NEXT: sll t2, a0, t4 +; RV32I-NEXT: neg t5, t3 +; RV32I-NEXT: srl a0, a0, t5 +; RV32I-NEXT: sll t4, a1, t4 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: bnez t3, .LBB19_9 +; RV32I-NEXT: j .LBB19_10 +; RV32I-NEXT: .LBB19_7: +; RV32I-NEXT: srl t1, a1, a4 +; RV32I-NEXT: andi t3, t4, 63 +; RV32I-NEXT: bltu t3, t0, .LBB19_6 +; RV32I-NEXT: .LBB19_8: +; RV32I-NEXT: li t2, 0 +; RV32I-NEXT: sll a0, a0, t3 +; RV32I-NEXT: beqz t3, .LBB19_10 +; RV32I-NEXT: .LBB19_9: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB19_10: +; RV32I-NEXT: bltu a5, t0, .LBB19_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li t0, 0 +; RV32I-NEXT: sll a0, a2, a5 +; RV32I-NEXT: j .LBB19_13 +; RV32I-NEXT: .LBB19_12: +; RV32I-NEXT: sll t0, a2, a4 +; RV32I-NEXT: srl a0, a2, a6 +; RV32I-NEXT: sll a2, a3, a4 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: .LBB19_13: +; RV32I-NEXT: or a2, a7, t2 +; RV32I-NEXT: or a1, t1, a1 +; RV32I-NEXT: beqz a5, .LBB19_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: .LBB19_15: +; RV32I-NEXT: add a0, a2, t0 +; RV32I-NEXT: sltu a2, a0, t0 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_64_mask_shared: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 63 +; RV64I-NEXT: srl a4, a0, a2 +; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: sll a0, a0, a3 +; RV64I-NEXT: or a0, a4, a0 +; RV64I-NEXT: sll a1, a1, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_64_mask_shared: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: andi a5, a4, 63 +; RV32ZBB-NEXT: li t0, 32 +; RV32ZBB-NEXT: neg a6, a5 +; RV32ZBB-NEXT: bltu a5, t0, .LBB19_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: srl t1, a1, a5 +; RV32ZBB-NEXT: mv a7, a0 +; RV32ZBB-NEXT: bnez a5, .LBB19_3 +; RV32ZBB-NEXT: j .LBB19_4 +; RV32ZBB-NEXT: .LBB19_2: +; RV32ZBB-NEXT: srl a7, a0, a4 +; RV32ZBB-NEXT: sll t1, a1, a6 +; RV32ZBB-NEXT: or t1, a7, t1 +; RV32ZBB-NEXT: mv a7, a0 +; RV32ZBB-NEXT: beqz a5, .LBB19_4 +; RV32ZBB-NEXT: .LBB19_3: +; RV32ZBB-NEXT: mv a7, t1 +; RV32ZBB-NEXT: .LBB19_4: +; RV32ZBB-NEXT: neg t4, a5 +; RV32ZBB-NEXT: bltu a5, t0, .LBB19_7 +; RV32ZBB-NEXT: # %bb.5: +; RV32ZBB-NEXT: li t1, 0 +; RV32ZBB-NEXT: andi t3, t4, 63 +; 
RV32ZBB-NEXT: bgeu t3, t0, .LBB19_8 +; RV32ZBB-NEXT: .LBB19_6: +; RV32ZBB-NEXT: sll t2, a0, t4 +; RV32ZBB-NEXT: neg t5, t3 +; RV32ZBB-NEXT: srl a0, a0, t5 +; RV32ZBB-NEXT: sll t4, a1, t4 +; RV32ZBB-NEXT: or a0, a0, t4 +; RV32ZBB-NEXT: bnez t3, .LBB19_9 +; RV32ZBB-NEXT: j .LBB19_10 +; RV32ZBB-NEXT: .LBB19_7: +; RV32ZBB-NEXT: srl t1, a1, a4 +; RV32ZBB-NEXT: andi t3, t4, 63 +; RV32ZBB-NEXT: bltu t3, t0, .LBB19_6 +; RV32ZBB-NEXT: .LBB19_8: +; RV32ZBB-NEXT: li t2, 0 +; RV32ZBB-NEXT: sll a0, a0, t3 +; RV32ZBB-NEXT: beqz t3, .LBB19_10 +; RV32ZBB-NEXT: .LBB19_9: +; RV32ZBB-NEXT: mv a1, a0 +; RV32ZBB-NEXT: .LBB19_10: +; RV32ZBB-NEXT: bltu a5, t0, .LBB19_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: li t0, 0 +; RV32ZBB-NEXT: sll a0, a2, a5 +; RV32ZBB-NEXT: j .LBB19_13 +; RV32ZBB-NEXT: .LBB19_12: +; RV32ZBB-NEXT: sll t0, a2, a4 +; RV32ZBB-NEXT: srl a0, a2, a6 +; RV32ZBB-NEXT: sll a2, a3, a4 +; RV32ZBB-NEXT: or a0, a0, a2 +; RV32ZBB-NEXT: .LBB19_13: +; RV32ZBB-NEXT: or a2, a7, t2 +; RV32ZBB-NEXT: or a1, t1, a1 +; RV32ZBB-NEXT: beqz a5, .LBB19_15 +; RV32ZBB-NEXT: # %bb.14: +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: .LBB19_15: +; RV32ZBB-NEXT: add a0, a2, t0 +; RV32ZBB-NEXT: sltu a2, a0, t0 +; RV32ZBB-NEXT: add a1, a1, a3 +; RV32ZBB-NEXT: add a1, a1, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_64_mask_shared: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: ror a0, a0, a2 +; RV64ZBB-NEXT: sll a1, a1, a2 +; RV64ZBB-NEXT: add a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_64_mask_shared: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi a5, a4, 63 +; RV32XTHEADBB-NEXT: li t0, 32 +; RV32XTHEADBB-NEXT: neg a6, a5 +; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB19_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: srl t1, a1, a5 +; RV32XTHEADBB-NEXT: mv a7, a0 +; RV32XTHEADBB-NEXT: bnez a5, .LBB19_3 +; RV32XTHEADBB-NEXT: j .LBB19_4 +; RV32XTHEADBB-NEXT: .LBB19_2: +; RV32XTHEADBB-NEXT: srl a7, a0, a4 +; RV32XTHEADBB-NEXT: sll t1, a1, a6 +; RV32XTHEADBB-NEXT: or t1, a7, t1 +; RV32XTHEADBB-NEXT: mv a7, a0 +; RV32XTHEADBB-NEXT: beqz a5, .LBB19_4 +; RV32XTHEADBB-NEXT: .LBB19_3: +; RV32XTHEADBB-NEXT: mv a7, t1 +; RV32XTHEADBB-NEXT: .LBB19_4: +; RV32XTHEADBB-NEXT: neg t4, a5 +; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB19_7 +; RV32XTHEADBB-NEXT: # %bb.5: +; RV32XTHEADBB-NEXT: li t1, 0 +; RV32XTHEADBB-NEXT: andi t3, t4, 63 +; RV32XTHEADBB-NEXT: bgeu t3, t0, .LBB19_8 +; RV32XTHEADBB-NEXT: .LBB19_6: +; RV32XTHEADBB-NEXT: sll t2, a0, t4 +; RV32XTHEADBB-NEXT: neg t5, t3 +; RV32XTHEADBB-NEXT: srl a0, a0, t5 +; RV32XTHEADBB-NEXT: sll t4, a1, t4 +; RV32XTHEADBB-NEXT: or a0, a0, t4 +; RV32XTHEADBB-NEXT: bnez t3, .LBB19_9 +; RV32XTHEADBB-NEXT: j .LBB19_10 +; RV32XTHEADBB-NEXT: .LBB19_7: +; RV32XTHEADBB-NEXT: srl t1, a1, a4 +; RV32XTHEADBB-NEXT: andi t3, t4, 63 +; RV32XTHEADBB-NEXT: bltu t3, t0, .LBB19_6 +; RV32XTHEADBB-NEXT: .LBB19_8: +; RV32XTHEADBB-NEXT: li t2, 0 +; RV32XTHEADBB-NEXT: sll a0, a0, t3 +; RV32XTHEADBB-NEXT: beqz t3, .LBB19_10 +; RV32XTHEADBB-NEXT: .LBB19_9: +; RV32XTHEADBB-NEXT: mv a1, a0 +; RV32XTHEADBB-NEXT: .LBB19_10: +; RV32XTHEADBB-NEXT: bltu a5, t0, .LBB19_12 +; RV32XTHEADBB-NEXT: # %bb.11: +; RV32XTHEADBB-NEXT: li t0, 0 +; RV32XTHEADBB-NEXT: sll a0, a2, a5 +; RV32XTHEADBB-NEXT: j .LBB19_13 +; RV32XTHEADBB-NEXT: .LBB19_12: +; RV32XTHEADBB-NEXT: sll t0, a2, a4 +; RV32XTHEADBB-NEXT: srl a0, a2, a6 +; RV32XTHEADBB-NEXT: sll a2, a3, a4 +; RV32XTHEADBB-NEXT: or a0, a0, a2 +; RV32XTHEADBB-NEXT: .LBB19_13: +; RV32XTHEADBB-NEXT: or a2, a7, t2 +; RV32XTHEADBB-NEXT: or a1, t1, a1 +; RV32XTHEADBB-NEXT: beqz 
a5, .LBB19_15 +; RV32XTHEADBB-NEXT: # %bb.14: +; RV32XTHEADBB-NEXT: mv a3, a0 +; RV32XTHEADBB-NEXT: .LBB19_15: +; RV32XTHEADBB-NEXT: add a0, a2, t0 +; RV32XTHEADBB-NEXT: sltu a2, a0, t0 +; RV32XTHEADBB-NEXT: add a1, a1, a3 +; RV32XTHEADBB-NEXT: add a1, a1, a2 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_64_mask_shared: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: andi a3, a2, 63 +; RV64XTHEADBB-NEXT: srl a4, a0, a2 +; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: sll a0, a0, a3 +; RV64XTHEADBB-NEXT: or a0, a4, a0 +; RV64XTHEADBB-NEXT: sll a1, a1, a2 +; RV64XTHEADBB-NEXT: add a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %maskedamt = and i64 %amt, 63 + %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %maskedamt) + %2 = shl i64 %b, %maskedamt + %3 = add i64 %1, %2 + ret i64 %3 +} +declare i64 @llvm.fshr.i64(i64, i64, i64) + +define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind { +; RV32I-LABEL: rotl_32_mask_multiple: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a3, a2, 31 +; RV32I-NEXT: sll a4, a0, a2 +; RV32I-NEXT: sll a2, a1, a2 +; RV32I-NEXT: neg a3, a3 +; RV32I-NEXT: srl a0, a0, a3 +; RV32I-NEXT: srl a1, a1, a3 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_32_mask_multiple: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 31 +; RV64I-NEXT: sllw a4, a0, a2 +; RV64I-NEXT: sllw a2, a1, a2 +; RV64I-NEXT: neg a5, a3 +; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: srlw a0, a0, a5 +; RV64I-NEXT: srlw a1, a1, a3 +; RV64I-NEXT: or a0, a4, a0 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_32_mask_multiple: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: rol a0, a0, a2 +; RV32ZBB-NEXT: rol a1, a1, a2 +; RV32ZBB-NEXT: add a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_32_mask_multiple: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: rolw a0, a0, a2 +; RV64ZBB-NEXT: rolw a1, a1, a2 +; RV64ZBB-NEXT: addw a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_32_mask_multiple: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi a3, a2, 31 +; RV32XTHEADBB-NEXT: sll a4, a0, a2 +; RV32XTHEADBB-NEXT: sll a2, a1, a2 +; RV32XTHEADBB-NEXT: neg a3, a3 +; RV32XTHEADBB-NEXT: srl a0, a0, a3 +; RV32XTHEADBB-NEXT: srl a1, a1, a3 +; RV32XTHEADBB-NEXT: or a0, a4, a0 +; RV32XTHEADBB-NEXT: or a1, a2, a1 +; RV32XTHEADBB-NEXT: add a0, a0, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_32_mask_multiple: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: andi a3, a2, 31 +; RV64XTHEADBB-NEXT: sllw a4, a0, a2 +; RV64XTHEADBB-NEXT: sllw a2, a1, a2 +; RV64XTHEADBB-NEXT: neg a5, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: srlw a0, a0, a5 +; RV64XTHEADBB-NEXT: srlw a1, a1, a3 +; RV64XTHEADBB-NEXT: or a0, a4, a0 +; RV64XTHEADBB-NEXT: or a1, a2, a1 +; RV64XTHEADBB-NEXT: addw a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %maskedamt = and i32 %amt, 31 + %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %maskedamt) + %2 = tail call i32 @llvm.fshl.i32(i32 %b, i32 %b, i32 %maskedamt) + %3 = add i32 %1, %2 + ret i32 %3 +} + +define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { +; RV32I-LABEL: rotl_64_mask_multiple: +; RV32I: # %bb.0: +; RV32I-NEXT: andi t1, a4, 63 +; RV32I-NEXT: li a5, 32 +; RV32I-NEXT: neg t3, t1 +; RV32I-NEXT: bltu t1, a5, .LBB21_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: sll t2, a0, t1 +; RV32I-NEXT: j .LBB21_3 +; RV32I-NEXT: .LBB21_2: +; RV32I-NEXT: sll a6, a0, a4 
+; RV32I-NEXT: srl a7, a0, t3 +; RV32I-NEXT: sll t0, a1, a4 +; RV32I-NEXT: or t2, a7, t0 +; RV32I-NEXT: .LBB21_3: +; RV32I-NEXT: neg a7, t1 +; RV32I-NEXT: mv t0, a1 +; RV32I-NEXT: beqz t1, .LBB21_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv t0, t2 +; RV32I-NEXT: .LBB21_5: +; RV32I-NEXT: andi t2, a7, 63 +; RV32I-NEXT: neg t4, t2 +; RV32I-NEXT: bltu t2, a5, .LBB21_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srl t5, a1, t2 +; RV32I-NEXT: bnez t2, .LBB21_8 +; RV32I-NEXT: j .LBB21_9 +; RV32I-NEXT: .LBB21_7: +; RV32I-NEXT: srl t5, a0, a7 +; RV32I-NEXT: sll t6, a1, t4 +; RV32I-NEXT: or t5, t5, t6 +; RV32I-NEXT: beqz t2, .LBB21_9 +; RV32I-NEXT: .LBB21_8: +; RV32I-NEXT: mv a0, t5 +; RV32I-NEXT: .LBB21_9: +; RV32I-NEXT: bltu t2, a5, .LBB21_12 +; RV32I-NEXT: # %bb.10: +; RV32I-NEXT: li t5, 0 +; RV32I-NEXT: bgeu t1, a5, .LBB21_13 +; RV32I-NEXT: .LBB21_11: +; RV32I-NEXT: sll a1, a2, a4 +; RV32I-NEXT: srl t3, a2, t3 +; RV32I-NEXT: sll a4, a3, a4 +; RV32I-NEXT: or t3, t3, a4 +; RV32I-NEXT: mv a4, a3 +; RV32I-NEXT: bnez t1, .LBB21_14 +; RV32I-NEXT: j .LBB21_15 +; RV32I-NEXT: .LBB21_12: +; RV32I-NEXT: srl t5, a1, a7 +; RV32I-NEXT: bltu t1, a5, .LBB21_11 +; RV32I-NEXT: .LBB21_13: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: sll t3, a2, t1 +; RV32I-NEXT: mv a4, a3 +; RV32I-NEXT: beqz t1, .LBB21_15 +; RV32I-NEXT: .LBB21_14: +; RV32I-NEXT: mv a4, t3 +; RV32I-NEXT: .LBB21_15: +; RV32I-NEXT: bltu t2, a5, .LBB21_17 +; RV32I-NEXT: # %bb.16: +; RV32I-NEXT: srl t1, a3, t2 +; RV32I-NEXT: bnez t2, .LBB21_18 +; RV32I-NEXT: j .LBB21_19 +; RV32I-NEXT: .LBB21_17: +; RV32I-NEXT: srl t1, a2, a7 +; RV32I-NEXT: sll t3, a3, t4 +; RV32I-NEXT: or t1, t1, t3 +; RV32I-NEXT: beqz t2, .LBB21_19 +; RV32I-NEXT: .LBB21_18: +; RV32I-NEXT: mv a2, t1 +; RV32I-NEXT: .LBB21_19: +; RV32I-NEXT: or a0, a6, a0 +; RV32I-NEXT: or a6, t0, t5 +; RV32I-NEXT: bltu t2, a5, .LBB21_21 +; RV32I-NEXT: # %bb.20: +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: j .LBB21_22 +; RV32I-NEXT: .LBB21_21: +; RV32I-NEXT: srl a3, a3, a7 +; RV32I-NEXT: .LBB21_22: +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: sltu a1, a0, a1 +; RV32I-NEXT: add a3, a6, a3 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_64_mask_multiple: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 63 +; RV64I-NEXT: sll a4, a0, a2 +; RV64I-NEXT: sll a2, a1, a2 +; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: srl a0, a0, a3 +; RV64I-NEXT: srl a1, a1, a3 +; RV64I-NEXT: or a0, a4, a0 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_64_mask_multiple: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: andi t1, a4, 63 +; RV32ZBB-NEXT: li a5, 32 +; RV32ZBB-NEXT: neg t3, t1 +; RV32ZBB-NEXT: bltu t1, a5, .LBB21_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: li a6, 0 +; RV32ZBB-NEXT: sll t2, a0, t1 +; RV32ZBB-NEXT: j .LBB21_3 +; RV32ZBB-NEXT: .LBB21_2: +; RV32ZBB-NEXT: sll a6, a0, a4 +; RV32ZBB-NEXT: srl a7, a0, t3 +; RV32ZBB-NEXT: sll t0, a1, a4 +; RV32ZBB-NEXT: or t2, a7, t0 +; RV32ZBB-NEXT: .LBB21_3: +; RV32ZBB-NEXT: neg a7, t1 +; RV32ZBB-NEXT: mv t0, a1 +; RV32ZBB-NEXT: beqz t1, .LBB21_5 +; RV32ZBB-NEXT: # %bb.4: +; RV32ZBB-NEXT: mv t0, t2 +; RV32ZBB-NEXT: .LBB21_5: +; RV32ZBB-NEXT: andi t2, a7, 63 +; RV32ZBB-NEXT: neg t4, t2 +; RV32ZBB-NEXT: bltu t2, a5, .LBB21_7 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: srl t5, a1, t2 +; RV32ZBB-NEXT: bnez t2, .LBB21_8 +; RV32ZBB-NEXT: j .LBB21_9 +; RV32ZBB-NEXT: .LBB21_7: +; RV32ZBB-NEXT: srl t5, a0, a7 +; RV32ZBB-NEXT: sll t6, a1, t4 +; RV32ZBB-NEXT: or t5, t5, t6 +; 
RV32ZBB-NEXT: beqz t2, .LBB21_9 +; RV32ZBB-NEXT: .LBB21_8: +; RV32ZBB-NEXT: mv a0, t5 +; RV32ZBB-NEXT: .LBB21_9: +; RV32ZBB-NEXT: bltu t2, a5, .LBB21_12 +; RV32ZBB-NEXT: # %bb.10: +; RV32ZBB-NEXT: li t5, 0 +; RV32ZBB-NEXT: bgeu t1, a5, .LBB21_13 +; RV32ZBB-NEXT: .LBB21_11: +; RV32ZBB-NEXT: sll a1, a2, a4 +; RV32ZBB-NEXT: srl t3, a2, t3 +; RV32ZBB-NEXT: sll a4, a3, a4 +; RV32ZBB-NEXT: or t3, t3, a4 +; RV32ZBB-NEXT: mv a4, a3 +; RV32ZBB-NEXT: bnez t1, .LBB21_14 +; RV32ZBB-NEXT: j .LBB21_15 +; RV32ZBB-NEXT: .LBB21_12: +; RV32ZBB-NEXT: srl t5, a1, a7 +; RV32ZBB-NEXT: bltu t1, a5, .LBB21_11 +; RV32ZBB-NEXT: .LBB21_13: +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: sll t3, a2, t1 +; RV32ZBB-NEXT: mv a4, a3 +; RV32ZBB-NEXT: beqz t1, .LBB21_15 +; RV32ZBB-NEXT: .LBB21_14: +; RV32ZBB-NEXT: mv a4, t3 +; RV32ZBB-NEXT: .LBB21_15: +; RV32ZBB-NEXT: bltu t2, a5, .LBB21_17 +; RV32ZBB-NEXT: # %bb.16: +; RV32ZBB-NEXT: srl t1, a3, t2 +; RV32ZBB-NEXT: bnez t2, .LBB21_18 +; RV32ZBB-NEXT: j .LBB21_19 +; RV32ZBB-NEXT: .LBB21_17: +; RV32ZBB-NEXT: srl t1, a2, a7 +; RV32ZBB-NEXT: sll t3, a3, t4 +; RV32ZBB-NEXT: or t1, t1, t3 +; RV32ZBB-NEXT: beqz t2, .LBB21_19 +; RV32ZBB-NEXT: .LBB21_18: +; RV32ZBB-NEXT: mv a2, t1 +; RV32ZBB-NEXT: .LBB21_19: +; RV32ZBB-NEXT: or a0, a6, a0 +; RV32ZBB-NEXT: or a6, t0, t5 +; RV32ZBB-NEXT: bltu t2, a5, .LBB21_21 +; RV32ZBB-NEXT: # %bb.20: +; RV32ZBB-NEXT: li a3, 0 +; RV32ZBB-NEXT: j .LBB21_22 +; RV32ZBB-NEXT: .LBB21_21: +; RV32ZBB-NEXT: srl a3, a3, a7 +; RV32ZBB-NEXT: .LBB21_22: +; RV32ZBB-NEXT: or a1, a1, a2 +; RV32ZBB-NEXT: or a3, a4, a3 +; RV32ZBB-NEXT: add a0, a0, a1 +; RV32ZBB-NEXT: sltu a1, a0, a1 +; RV32ZBB-NEXT: add a3, a6, a3 +; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_64_mask_multiple: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: rol a0, a0, a2 +; RV64ZBB-NEXT: rol a1, a1, a2 +; RV64ZBB-NEXT: add a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_64_mask_multiple: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi t1, a4, 63 +; RV32XTHEADBB-NEXT: li a5, 32 +; RV32XTHEADBB-NEXT: neg t3, t1 +; RV32XTHEADBB-NEXT: bltu t1, a5, .LBB21_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: li a6, 0 +; RV32XTHEADBB-NEXT: sll t2, a0, t1 +; RV32XTHEADBB-NEXT: j .LBB21_3 +; RV32XTHEADBB-NEXT: .LBB21_2: +; RV32XTHEADBB-NEXT: sll a6, a0, a4 +; RV32XTHEADBB-NEXT: srl a7, a0, t3 +; RV32XTHEADBB-NEXT: sll t0, a1, a4 +; RV32XTHEADBB-NEXT: or t2, a7, t0 +; RV32XTHEADBB-NEXT: .LBB21_3: +; RV32XTHEADBB-NEXT: neg a7, t1 +; RV32XTHEADBB-NEXT: mv t0, a1 +; RV32XTHEADBB-NEXT: beqz t1, .LBB21_5 +; RV32XTHEADBB-NEXT: # %bb.4: +; RV32XTHEADBB-NEXT: mv t0, t2 +; RV32XTHEADBB-NEXT: .LBB21_5: +; RV32XTHEADBB-NEXT: andi t2, a7, 63 +; RV32XTHEADBB-NEXT: neg t4, t2 +; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_7 +; RV32XTHEADBB-NEXT: # %bb.6: +; RV32XTHEADBB-NEXT: srl t5, a1, t2 +; RV32XTHEADBB-NEXT: bnez t2, .LBB21_8 +; RV32XTHEADBB-NEXT: j .LBB21_9 +; RV32XTHEADBB-NEXT: .LBB21_7: +; RV32XTHEADBB-NEXT: srl t5, a0, a7 +; RV32XTHEADBB-NEXT: sll t6, a1, t4 +; RV32XTHEADBB-NEXT: or t5, t5, t6 +; RV32XTHEADBB-NEXT: beqz t2, .LBB21_9 +; RV32XTHEADBB-NEXT: .LBB21_8: +; RV32XTHEADBB-NEXT: mv a0, t5 +; RV32XTHEADBB-NEXT: .LBB21_9: +; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_12 +; RV32XTHEADBB-NEXT: # %bb.10: +; RV32XTHEADBB-NEXT: li t5, 0 +; RV32XTHEADBB-NEXT: bgeu t1, a5, .LBB21_13 +; RV32XTHEADBB-NEXT: .LBB21_11: +; RV32XTHEADBB-NEXT: sll a1, a2, a4 +; RV32XTHEADBB-NEXT: srl t3, a2, t3 +; RV32XTHEADBB-NEXT: sll a4, a3, a4 +; RV32XTHEADBB-NEXT: or t3, t3, a4 +; RV32XTHEADBB-NEXT: mv 
a4, a3 +; RV32XTHEADBB-NEXT: bnez t1, .LBB21_14 +; RV32XTHEADBB-NEXT: j .LBB21_15 +; RV32XTHEADBB-NEXT: .LBB21_12: +; RV32XTHEADBB-NEXT: srl t5, a1, a7 +; RV32XTHEADBB-NEXT: bltu t1, a5, .LBB21_11 +; RV32XTHEADBB-NEXT: .LBB21_13: +; RV32XTHEADBB-NEXT: li a1, 0 +; RV32XTHEADBB-NEXT: sll t3, a2, t1 +; RV32XTHEADBB-NEXT: mv a4, a3 +; RV32XTHEADBB-NEXT: beqz t1, .LBB21_15 +; RV32XTHEADBB-NEXT: .LBB21_14: +; RV32XTHEADBB-NEXT: mv a4, t3 +; RV32XTHEADBB-NEXT: .LBB21_15: +; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_17 +; RV32XTHEADBB-NEXT: # %bb.16: +; RV32XTHEADBB-NEXT: srl t1, a3, t2 +; RV32XTHEADBB-NEXT: bnez t2, .LBB21_18 +; RV32XTHEADBB-NEXT: j .LBB21_19 +; RV32XTHEADBB-NEXT: .LBB21_17: +; RV32XTHEADBB-NEXT: srl t1, a2, a7 +; RV32XTHEADBB-NEXT: sll t3, a3, t4 +; RV32XTHEADBB-NEXT: or t1, t1, t3 +; RV32XTHEADBB-NEXT: beqz t2, .LBB21_19 +; RV32XTHEADBB-NEXT: .LBB21_18: +; RV32XTHEADBB-NEXT: mv a2, t1 +; RV32XTHEADBB-NEXT: .LBB21_19: +; RV32XTHEADBB-NEXT: or a0, a6, a0 +; RV32XTHEADBB-NEXT: or a6, t0, t5 +; RV32XTHEADBB-NEXT: bltu t2, a5, .LBB21_21 +; RV32XTHEADBB-NEXT: # %bb.20: +; RV32XTHEADBB-NEXT: li a3, 0 +; RV32XTHEADBB-NEXT: j .LBB21_22 +; RV32XTHEADBB-NEXT: .LBB21_21: +; RV32XTHEADBB-NEXT: srl a3, a3, a7 +; RV32XTHEADBB-NEXT: .LBB21_22: +; RV32XTHEADBB-NEXT: or a1, a1, a2 +; RV32XTHEADBB-NEXT: or a3, a4, a3 +; RV32XTHEADBB-NEXT: add a0, a0, a1 +; RV32XTHEADBB-NEXT: sltu a1, a0, a1 +; RV32XTHEADBB-NEXT: add a3, a6, a3 +; RV32XTHEADBB-NEXT: add a1, a3, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_64_mask_multiple: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: andi a3, a2, 63 +; RV64XTHEADBB-NEXT: sll a4, a0, a2 +; RV64XTHEADBB-NEXT: sll a2, a1, a2 +; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: srl a0, a0, a3 +; RV64XTHEADBB-NEXT: srl a1, a1, a3 +; RV64XTHEADBB-NEXT: or a0, a4, a0 +; RV64XTHEADBB-NEXT: or a1, a2, a1 +; RV64XTHEADBB-NEXT: add a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %maskedamt = and i64 %amt, 63 + %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %maskedamt) + %2 = tail call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 %maskedamt) + %3 = add i64 %1, %2 + ret i64 %3 +} + +define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind { +; RV32I-LABEL: rotr_32_mask_multiple: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a3, a2, 31 +; RV32I-NEXT: srl a4, a0, a2 +; RV32I-NEXT: srl a2, a1, a2 +; RV32I-NEXT: neg a3, a3 +; RV32I-NEXT: sll a0, a0, a3 +; RV32I-NEXT: sll a1, a1, a3 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_32_mask_multiple: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 31 +; RV64I-NEXT: srlw a4, a0, a2 +; RV64I-NEXT: srlw a2, a1, a2 +; RV64I-NEXT: neg a5, a3 +; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: sllw a0, a0, a5 +; RV64I-NEXT: sllw a1, a1, a3 +; RV64I-NEXT: or a0, a4, a0 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_32_mask_multiple: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: ror a0, a0, a2 +; RV32ZBB-NEXT: ror a1, a1, a2 +; RV32ZBB-NEXT: add a0, a0, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_32_mask_multiple: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: rorw a0, a0, a2 +; RV64ZBB-NEXT: rorw a1, a1, a2 +; RV64ZBB-NEXT: addw a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_32_mask_multiple: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi a3, a2, 31 +; RV32XTHEADBB-NEXT: srl a4, a0, a2 +; RV32XTHEADBB-NEXT: srl a2, a1, a2 +; RV32XTHEADBB-NEXT: neg a3, a3 +; 
RV32XTHEADBB-NEXT: sll a0, a0, a3 +; RV32XTHEADBB-NEXT: sll a1, a1, a3 +; RV32XTHEADBB-NEXT: or a0, a4, a0 +; RV32XTHEADBB-NEXT: or a1, a2, a1 +; RV32XTHEADBB-NEXT: add a0, a0, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_32_mask_multiple: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: andi a3, a2, 31 +; RV64XTHEADBB-NEXT: srlw a4, a0, a2 +; RV64XTHEADBB-NEXT: srlw a2, a1, a2 +; RV64XTHEADBB-NEXT: neg a5, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: sllw a0, a0, a5 +; RV64XTHEADBB-NEXT: sllw a1, a1, a3 +; RV64XTHEADBB-NEXT: or a0, a4, a0 +; RV64XTHEADBB-NEXT: or a1, a2, a1 +; RV64XTHEADBB-NEXT: addw a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %maskedamt = and i32 %amt, 31 + %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %maskedamt) + %2 = tail call i32 @llvm.fshr.i32(i32 %b, i32 %b, i32 %maskedamt) + %3 = add i32 %1, %2 + ret i32 %3 +} + +define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { +; RV32I-LABEL: rotr_64_mask_multiple: +; RV32I: # %bb.0: +; RV32I-NEXT: andi t0, a4, 63 +; RV32I-NEXT: li a6, 32 +; RV32I-NEXT: neg t4, t0 +; RV32I-NEXT: bltu t0, a6, .LBB23_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a7, a1, t0 +; RV32I-NEXT: mv a5, a0 +; RV32I-NEXT: bnez t0, .LBB23_3 +; RV32I-NEXT: j .LBB23_4 +; RV32I-NEXT: .LBB23_2: +; RV32I-NEXT: srl a5, a0, a4 +; RV32I-NEXT: sll a7, a1, t4 +; RV32I-NEXT: or a7, a5, a7 +; RV32I-NEXT: mv a5, a0 +; RV32I-NEXT: beqz t0, .LBB23_4 +; RV32I-NEXT: .LBB23_3: +; RV32I-NEXT: mv a5, a7 +; RV32I-NEXT: .LBB23_4: +; RV32I-NEXT: neg t2, t0 +; RV32I-NEXT: bltu t0, a6, .LBB23_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: j .LBB23_7 +; RV32I-NEXT: .LBB23_6: +; RV32I-NEXT: srl a7, a1, a4 +; RV32I-NEXT: .LBB23_7: +; RV32I-NEXT: andi t1, t2, 63 +; RV32I-NEXT: neg t5, t1 +; RV32I-NEXT: bltu t1, a6, .LBB23_9 +; RV32I-NEXT: # %bb.8: +; RV32I-NEXT: li t3, 0 +; RV32I-NEXT: sll a0, a0, t1 +; RV32I-NEXT: bnez t1, .LBB23_10 +; RV32I-NEXT: j .LBB23_11 +; RV32I-NEXT: .LBB23_9: +; RV32I-NEXT: sll t3, a0, t2 +; RV32I-NEXT: srl a0, a0, t5 +; RV32I-NEXT: sll t6, a1, t2 +; RV32I-NEXT: or a0, a0, t6 +; RV32I-NEXT: beqz t1, .LBB23_11 +; RV32I-NEXT: .LBB23_10: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB23_11: +; RV32I-NEXT: bltu t0, a6, .LBB23_13 +; RV32I-NEXT: # %bb.12: +; RV32I-NEXT: srl t4, a3, t0 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: bnez t0, .LBB23_14 +; RV32I-NEXT: j .LBB23_15 +; RV32I-NEXT: .LBB23_13: +; RV32I-NEXT: srl a0, a2, a4 +; RV32I-NEXT: sll t4, a3, t4 +; RV32I-NEXT: or t4, a0, t4 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: beqz t0, .LBB23_15 +; RV32I-NEXT: .LBB23_14: +; RV32I-NEXT: mv a0, t4 +; RV32I-NEXT: .LBB23_15: +; RV32I-NEXT: bltu t0, a6, .LBB23_18 +; RV32I-NEXT: # %bb.16: +; RV32I-NEXT: li a4, 0 +; RV32I-NEXT: bgeu t1, a6, .LBB23_19 +; RV32I-NEXT: .LBB23_17: +; RV32I-NEXT: sll a6, a2, t2 +; RV32I-NEXT: srl a2, a2, t5 +; RV32I-NEXT: sll t0, a3, t2 +; RV32I-NEXT: or a2, a2, t0 +; RV32I-NEXT: j .LBB23_20 +; RV32I-NEXT: .LBB23_18: +; RV32I-NEXT: srl a4, a3, a4 +; RV32I-NEXT: bltu t1, a6, .LBB23_17 +; RV32I-NEXT: .LBB23_19: +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: sll a2, a2, t1 +; RV32I-NEXT: .LBB23_20: +; RV32I-NEXT: or a5, a5, t3 +; RV32I-NEXT: or a1, a7, a1 +; RV32I-NEXT: beqz t1, .LBB23_22 +; RV32I-NEXT: # %bb.21: +; RV32I-NEXT: mv a3, a2 +; RV32I-NEXT: .LBB23_22: +; RV32I-NEXT: or a2, a0, a6 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: add a0, a5, a2 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_64_mask_multiple: 
+; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 63 +; RV64I-NEXT: srl a4, a0, a2 +; RV64I-NEXT: srl a2, a1, a2 +; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: sll a0, a0, a3 +; RV64I-NEXT: sll a1, a1, a3 +; RV64I-NEXT: or a0, a4, a0 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_64_mask_multiple: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: andi t0, a4, 63 +; RV32ZBB-NEXT: li a6, 32 +; RV32ZBB-NEXT: neg t4, t0 +; RV32ZBB-NEXT: bltu t0, a6, .LBB23_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: srl a7, a1, t0 +; RV32ZBB-NEXT: mv a5, a0 +; RV32ZBB-NEXT: bnez t0, .LBB23_3 +; RV32ZBB-NEXT: j .LBB23_4 +; RV32ZBB-NEXT: .LBB23_2: +; RV32ZBB-NEXT: srl a5, a0, a4 +; RV32ZBB-NEXT: sll a7, a1, t4 +; RV32ZBB-NEXT: or a7, a5, a7 +; RV32ZBB-NEXT: mv a5, a0 +; RV32ZBB-NEXT: beqz t0, .LBB23_4 +; RV32ZBB-NEXT: .LBB23_3: +; RV32ZBB-NEXT: mv a5, a7 +; RV32ZBB-NEXT: .LBB23_4: +; RV32ZBB-NEXT: neg t2, t0 +; RV32ZBB-NEXT: bltu t0, a6, .LBB23_6 +; RV32ZBB-NEXT: # %bb.5: +; RV32ZBB-NEXT: li a7, 0 +; RV32ZBB-NEXT: j .LBB23_7 +; RV32ZBB-NEXT: .LBB23_6: +; RV32ZBB-NEXT: srl a7, a1, a4 +; RV32ZBB-NEXT: .LBB23_7: +; RV32ZBB-NEXT: andi t1, t2, 63 +; RV32ZBB-NEXT: neg t5, t1 +; RV32ZBB-NEXT: bltu t1, a6, .LBB23_9 +; RV32ZBB-NEXT: # %bb.8: +; RV32ZBB-NEXT: li t3, 0 +; RV32ZBB-NEXT: sll a0, a0, t1 +; RV32ZBB-NEXT: bnez t1, .LBB23_10 +; RV32ZBB-NEXT: j .LBB23_11 +; RV32ZBB-NEXT: .LBB23_9: +; RV32ZBB-NEXT: sll t3, a0, t2 +; RV32ZBB-NEXT: srl a0, a0, t5 +; RV32ZBB-NEXT: sll t6, a1, t2 +; RV32ZBB-NEXT: or a0, a0, t6 +; RV32ZBB-NEXT: beqz t1, .LBB23_11 +; RV32ZBB-NEXT: .LBB23_10: +; RV32ZBB-NEXT: mv a1, a0 +; RV32ZBB-NEXT: .LBB23_11: +; RV32ZBB-NEXT: bltu t0, a6, .LBB23_13 +; RV32ZBB-NEXT: # %bb.12: +; RV32ZBB-NEXT: srl t4, a3, t0 +; RV32ZBB-NEXT: mv a0, a2 +; RV32ZBB-NEXT: bnez t0, .LBB23_14 +; RV32ZBB-NEXT: j .LBB23_15 +; RV32ZBB-NEXT: .LBB23_13: +; RV32ZBB-NEXT: srl a0, a2, a4 +; RV32ZBB-NEXT: sll t4, a3, t4 +; RV32ZBB-NEXT: or t4, a0, t4 +; RV32ZBB-NEXT: mv a0, a2 +; RV32ZBB-NEXT: beqz t0, .LBB23_15 +; RV32ZBB-NEXT: .LBB23_14: +; RV32ZBB-NEXT: mv a0, t4 +; RV32ZBB-NEXT: .LBB23_15: +; RV32ZBB-NEXT: bltu t0, a6, .LBB23_18 +; RV32ZBB-NEXT: # %bb.16: +; RV32ZBB-NEXT: li a4, 0 +; RV32ZBB-NEXT: bgeu t1, a6, .LBB23_19 +; RV32ZBB-NEXT: .LBB23_17: +; RV32ZBB-NEXT: sll a6, a2, t2 +; RV32ZBB-NEXT: srl a2, a2, t5 +; RV32ZBB-NEXT: sll t0, a3, t2 +; RV32ZBB-NEXT: or a2, a2, t0 +; RV32ZBB-NEXT: j .LBB23_20 +; RV32ZBB-NEXT: .LBB23_18: +; RV32ZBB-NEXT: srl a4, a3, a4 +; RV32ZBB-NEXT: bltu t1, a6, .LBB23_17 +; RV32ZBB-NEXT: .LBB23_19: +; RV32ZBB-NEXT: li a6, 0 +; RV32ZBB-NEXT: sll a2, a2, t1 +; RV32ZBB-NEXT: .LBB23_20: +; RV32ZBB-NEXT: or a5, a5, t3 +; RV32ZBB-NEXT: or a1, a7, a1 +; RV32ZBB-NEXT: beqz t1, .LBB23_22 +; RV32ZBB-NEXT: # %bb.21: +; RV32ZBB-NEXT: mv a3, a2 +; RV32ZBB-NEXT: .LBB23_22: +; RV32ZBB-NEXT: or a2, a0, a6 +; RV32ZBB-NEXT: or a3, a4, a3 +; RV32ZBB-NEXT: add a0, a5, a2 +; RV32ZBB-NEXT: sltu a2, a0, a2 +; RV32ZBB-NEXT: add a1, a1, a3 +; RV32ZBB-NEXT: add a1, a1, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_64_mask_multiple: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: ror a0, a0, a2 +; RV64ZBB-NEXT: ror a1, a1, a2 +; RV64ZBB-NEXT: add a0, a0, a1 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_64_mask_multiple: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: andi t0, a4, 63 +; RV32XTHEADBB-NEXT: li a6, 32 +; RV32XTHEADBB-NEXT: neg t4, t0 +; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: srl a7, a1, t0 +; RV32XTHEADBB-NEXT: mv a5, a0 +; 
RV32XTHEADBB-NEXT: bnez t0, .LBB23_3 +; RV32XTHEADBB-NEXT: j .LBB23_4 +; RV32XTHEADBB-NEXT: .LBB23_2: +; RV32XTHEADBB-NEXT: srl a5, a0, a4 +; RV32XTHEADBB-NEXT: sll a7, a1, t4 +; RV32XTHEADBB-NEXT: or a7, a5, a7 +; RV32XTHEADBB-NEXT: mv a5, a0 +; RV32XTHEADBB-NEXT: beqz t0, .LBB23_4 +; RV32XTHEADBB-NEXT: .LBB23_3: +; RV32XTHEADBB-NEXT: mv a5, a7 +; RV32XTHEADBB-NEXT: .LBB23_4: +; RV32XTHEADBB-NEXT: neg t2, t0 +; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_6 +; RV32XTHEADBB-NEXT: # %bb.5: +; RV32XTHEADBB-NEXT: li a7, 0 +; RV32XTHEADBB-NEXT: j .LBB23_7 +; RV32XTHEADBB-NEXT: .LBB23_6: +; RV32XTHEADBB-NEXT: srl a7, a1, a4 +; RV32XTHEADBB-NEXT: .LBB23_7: +; RV32XTHEADBB-NEXT: andi t1, t2, 63 +; RV32XTHEADBB-NEXT: neg t5, t1 +; RV32XTHEADBB-NEXT: bltu t1, a6, .LBB23_9 +; RV32XTHEADBB-NEXT: # %bb.8: +; RV32XTHEADBB-NEXT: li t3, 0 +; RV32XTHEADBB-NEXT: sll a0, a0, t1 +; RV32XTHEADBB-NEXT: bnez t1, .LBB23_10 +; RV32XTHEADBB-NEXT: j .LBB23_11 +; RV32XTHEADBB-NEXT: .LBB23_9: +; RV32XTHEADBB-NEXT: sll t3, a0, t2 +; RV32XTHEADBB-NEXT: srl a0, a0, t5 +; RV32XTHEADBB-NEXT: sll t6, a1, t2 +; RV32XTHEADBB-NEXT: or a0, a0, t6 +; RV32XTHEADBB-NEXT: beqz t1, .LBB23_11 +; RV32XTHEADBB-NEXT: .LBB23_10: +; RV32XTHEADBB-NEXT: mv a1, a0 +; RV32XTHEADBB-NEXT: .LBB23_11: +; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_13 +; RV32XTHEADBB-NEXT: # %bb.12: +; RV32XTHEADBB-NEXT: srl t4, a3, t0 +; RV32XTHEADBB-NEXT: mv a0, a2 +; RV32XTHEADBB-NEXT: bnez t0, .LBB23_14 +; RV32XTHEADBB-NEXT: j .LBB23_15 +; RV32XTHEADBB-NEXT: .LBB23_13: +; RV32XTHEADBB-NEXT: srl a0, a2, a4 +; RV32XTHEADBB-NEXT: sll t4, a3, t4 +; RV32XTHEADBB-NEXT: or t4, a0, t4 +; RV32XTHEADBB-NEXT: mv a0, a2 +; RV32XTHEADBB-NEXT: beqz t0, .LBB23_15 +; RV32XTHEADBB-NEXT: .LBB23_14: +; RV32XTHEADBB-NEXT: mv a0, t4 +; RV32XTHEADBB-NEXT: .LBB23_15: +; RV32XTHEADBB-NEXT: bltu t0, a6, .LBB23_18 +; RV32XTHEADBB-NEXT: # %bb.16: +; RV32XTHEADBB-NEXT: li a4, 0 +; RV32XTHEADBB-NEXT: bgeu t1, a6, .LBB23_19 +; RV32XTHEADBB-NEXT: .LBB23_17: +; RV32XTHEADBB-NEXT: sll a6, a2, t2 +; RV32XTHEADBB-NEXT: srl a2, a2, t5 +; RV32XTHEADBB-NEXT: sll t0, a3, t2 +; RV32XTHEADBB-NEXT: or a2, a2, t0 +; RV32XTHEADBB-NEXT: j .LBB23_20 +; RV32XTHEADBB-NEXT: .LBB23_18: +; RV32XTHEADBB-NEXT: srl a4, a3, a4 +; RV32XTHEADBB-NEXT: bltu t1, a6, .LBB23_17 +; RV32XTHEADBB-NEXT: .LBB23_19: +; RV32XTHEADBB-NEXT: li a6, 0 +; RV32XTHEADBB-NEXT: sll a2, a2, t1 +; RV32XTHEADBB-NEXT: .LBB23_20: +; RV32XTHEADBB-NEXT: or a5, a5, t3 +; RV32XTHEADBB-NEXT: or a1, a7, a1 +; RV32XTHEADBB-NEXT: beqz t1, .LBB23_22 +; RV32XTHEADBB-NEXT: # %bb.21: +; RV32XTHEADBB-NEXT: mv a3, a2 +; RV32XTHEADBB-NEXT: .LBB23_22: +; RV32XTHEADBB-NEXT: or a2, a0, a6 +; RV32XTHEADBB-NEXT: or a3, a4, a3 +; RV32XTHEADBB-NEXT: add a0, a5, a2 +; RV32XTHEADBB-NEXT: sltu a2, a0, a2 +; RV32XTHEADBB-NEXT: add a1, a1, a3 +; RV32XTHEADBB-NEXT: add a1, a1, a2 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_64_mask_multiple: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: andi a3, a2, 63 +; RV64XTHEADBB-NEXT: srl a4, a0, a2 +; RV64XTHEADBB-NEXT: srl a2, a1, a2 +; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: sll a0, a0, a3 +; RV64XTHEADBB-NEXT: sll a1, a1, a3 +; RV64XTHEADBB-NEXT: or a0, a4, a0 +; RV64XTHEADBB-NEXT: or a1, a2, a1 +; RV64XTHEADBB-NEXT: add a0, a0, a1 +; RV64XTHEADBB-NEXT: ret + %maskedamt = and i64 %amt, 63 + %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %maskedamt) + %2 = tail call i64 @llvm.fshr.i64(i64 %b, i64 %b, i64 %maskedamt) + %3 = add i64 %1, %2 + ret i64 %3 +} + +define i64 @rotl_64_zext(i64 %x, i32 %y) 
nounwind { +; RV32I-LABEL: rotl_64_zext: +; RV32I: # %bb.0: +; RV32I-NEXT: li a6, 64 +; RV32I-NEXT: li a4, 32 +; RV32I-NEXT: neg a5, a2 +; RV32I-NEXT: srl a7, a0, a5 +; RV32I-NEXT: bltu a2, a4, .LBB24_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: sll t1, a0, a2 +; RV32I-NEXT: j .LBB24_3 +; RV32I-NEXT: .LBB24_2: +; RV32I-NEXT: sll a3, a0, a2 +; RV32I-NEXT: sll t0, a1, a2 +; RV32I-NEXT: or t1, a7, t0 +; RV32I-NEXT: .LBB24_3: +; RV32I-NEXT: sub t0, a6, a2 +; RV32I-NEXT: mv a6, a1 +; RV32I-NEXT: beqz a2, .LBB24_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: .LBB24_5: +; RV32I-NEXT: bltu t0, a4, .LBB24_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srl a2, a1, t0 +; RV32I-NEXT: bnez t0, .LBB24_8 +; RV32I-NEXT: j .LBB24_9 +; RV32I-NEXT: .LBB24_7: +; RV32I-NEXT: neg a2, t0 +; RV32I-NEXT: sll a2, a1, a2 +; RV32I-NEXT: or a2, a7, a2 +; RV32I-NEXT: beqz t0, .LBB24_9 +; RV32I-NEXT: .LBB24_8: +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: .LBB24_9: +; RV32I-NEXT: bltu t0, a4, .LBB24_11 +; RV32I-NEXT: # %bb.10: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: j .LBB24_12 +; RV32I-NEXT: .LBB24_11: +; RV32I-NEXT: srl a1, a1, a5 +; RV32I-NEXT: .LBB24_12: +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a1, a6, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotl_64_zext: +; RV64I: # %bb.0: +; RV64I-NEXT: li a2, 64 +; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: sll a1, a0, a1 +; RV64I-NEXT: srl a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotl_64_zext: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: li a6, 64 +; RV32ZBB-NEXT: li a4, 32 +; RV32ZBB-NEXT: neg a5, a2 +; RV32ZBB-NEXT: srl a7, a0, a5 +; RV32ZBB-NEXT: bltu a2, a4, .LBB24_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: li a3, 0 +; RV32ZBB-NEXT: sll t1, a0, a2 +; RV32ZBB-NEXT: j .LBB24_3 +; RV32ZBB-NEXT: .LBB24_2: +; RV32ZBB-NEXT: sll a3, a0, a2 +; RV32ZBB-NEXT: sll t0, a1, a2 +; RV32ZBB-NEXT: or t1, a7, t0 +; RV32ZBB-NEXT: .LBB24_3: +; RV32ZBB-NEXT: sub t0, a6, a2 +; RV32ZBB-NEXT: mv a6, a1 +; RV32ZBB-NEXT: beqz a2, .LBB24_5 +; RV32ZBB-NEXT: # %bb.4: +; RV32ZBB-NEXT: mv a6, t1 +; RV32ZBB-NEXT: .LBB24_5: +; RV32ZBB-NEXT: bltu t0, a4, .LBB24_7 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: srl a2, a1, t0 +; RV32ZBB-NEXT: bnez t0, .LBB24_8 +; RV32ZBB-NEXT: j .LBB24_9 +; RV32ZBB-NEXT: .LBB24_7: +; RV32ZBB-NEXT: neg a2, t0 +; RV32ZBB-NEXT: sll a2, a1, a2 +; RV32ZBB-NEXT: or a2, a7, a2 +; RV32ZBB-NEXT: beqz t0, .LBB24_9 +; RV32ZBB-NEXT: .LBB24_8: +; RV32ZBB-NEXT: mv a0, a2 +; RV32ZBB-NEXT: .LBB24_9: +; RV32ZBB-NEXT: bltu t0, a4, .LBB24_11 +; RV32ZBB-NEXT: # %bb.10: +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: j .LBB24_12 +; RV32ZBB-NEXT: .LBB24_11: +; RV32ZBB-NEXT: srl a1, a1, a5 +; RV32ZBB-NEXT: .LBB24_12: +; RV32ZBB-NEXT: or a0, a3, a0 +; RV32ZBB-NEXT: or a1, a6, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotl_64_zext: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: li a2, 64 +; RV64ZBB-NEXT: sub a2, a2, a1 +; RV64ZBB-NEXT: sll a1, a0, a1 +; RV64ZBB-NEXT: srl a0, a0, a2 +; RV64ZBB-NEXT: or a0, a1, a0 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotl_64_zext: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: li a6, 64 +; RV32XTHEADBB-NEXT: li a4, 32 +; RV32XTHEADBB-NEXT: neg a5, a2 +; RV32XTHEADBB-NEXT: srl a7, a0, a5 +; RV32XTHEADBB-NEXT: bltu a2, a4, .LBB24_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: li a3, 0 +; RV32XTHEADBB-NEXT: sll t1, a0, a2 +; RV32XTHEADBB-NEXT: j .LBB24_3 +; RV32XTHEADBB-NEXT: .LBB24_2: +; RV32XTHEADBB-NEXT: sll a3, a0, a2 +; RV32XTHEADBB-NEXT: sll t0, a1, a2 +; RV32XTHEADBB-NEXT: or t1, a7, t0 +; 
RV32XTHEADBB-NEXT: .LBB24_3: +; RV32XTHEADBB-NEXT: sub t0, a6, a2 +; RV32XTHEADBB-NEXT: mv a6, a1 +; RV32XTHEADBB-NEXT: beqz a2, .LBB24_5 +; RV32XTHEADBB-NEXT: # %bb.4: +; RV32XTHEADBB-NEXT: mv a6, t1 +; RV32XTHEADBB-NEXT: .LBB24_5: +; RV32XTHEADBB-NEXT: bltu t0, a4, .LBB24_7 +; RV32XTHEADBB-NEXT: # %bb.6: +; RV32XTHEADBB-NEXT: srl a2, a1, t0 +; RV32XTHEADBB-NEXT: bnez t0, .LBB24_8 +; RV32XTHEADBB-NEXT: j .LBB24_9 +; RV32XTHEADBB-NEXT: .LBB24_7: +; RV32XTHEADBB-NEXT: neg a2, t0 +; RV32XTHEADBB-NEXT: sll a2, a1, a2 +; RV32XTHEADBB-NEXT: or a2, a7, a2 +; RV32XTHEADBB-NEXT: beqz t0, .LBB24_9 +; RV32XTHEADBB-NEXT: .LBB24_8: +; RV32XTHEADBB-NEXT: mv a0, a2 +; RV32XTHEADBB-NEXT: .LBB24_9: +; RV32XTHEADBB-NEXT: bltu t0, a4, .LBB24_11 +; RV32XTHEADBB-NEXT: # %bb.10: +; RV32XTHEADBB-NEXT: li a1, 0 +; RV32XTHEADBB-NEXT: j .LBB24_12 +; RV32XTHEADBB-NEXT: .LBB24_11: +; RV32XTHEADBB-NEXT: srl a1, a1, a5 +; RV32XTHEADBB-NEXT: .LBB24_12: +; RV32XTHEADBB-NEXT: or a0, a3, a0 +; RV32XTHEADBB-NEXT: or a1, a6, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotl_64_zext: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: li a2, 64 +; RV64XTHEADBB-NEXT: sub a2, a2, a1 +; RV64XTHEADBB-NEXT: sll a1, a0, a1 +; RV64XTHEADBB-NEXT: srl a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 +; RV64XTHEADBB-NEXT: ret + %z = sub i32 64, %y + %zext = zext i32 %z to i64 + %zexty = zext i32 %y to i64 + %b = shl i64 %x, %zexty + %c = lshr i64 %x, %zext + %d = or i64 %b, %c + ret i64 %d +} + +define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { +; RV32I-LABEL: rotr_64_zext: +; RV32I: # %bb.0: +; RV32I-NEXT: li a5, 32 +; RV32I-NEXT: neg a6, a2 +; RV32I-NEXT: sll a4, a1, a6 +; RV32I-NEXT: bltu a2, a5, .LBB25_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a7, a1, a2 +; RV32I-NEXT: j .LBB25_3 +; RV32I-NEXT: .LBB25_2: +; RV32I-NEXT: srl a3, a0, a2 +; RV32I-NEXT: or a7, a3, a4 +; RV32I-NEXT: .LBB25_3: +; RV32I-NEXT: li t0, 64 +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: beqz a2, .LBB25_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: .LBB25_5: +; RV32I-NEXT: sub a7, t0, a2 +; RV32I-NEXT: bltu a2, a5, .LBB25_8 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: li a2, 0 +; RV32I-NEXT: bgeu a7, a5, .LBB25_9 +; RV32I-NEXT: .LBB25_7: +; RV32I-NEXT: sll a5, a0, a6 +; RV32I-NEXT: neg a6, a7 +; RV32I-NEXT: srl a0, a0, a6 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: bnez a7, .LBB25_10 +; RV32I-NEXT: j .LBB25_11 +; RV32I-NEXT: .LBB25_8: +; RV32I-NEXT: srl a2, a1, a2 +; RV32I-NEXT: bltu a7, a5, .LBB25_7 +; RV32I-NEXT: .LBB25_9: +; RV32I-NEXT: li a5, 0 +; RV32I-NEXT: sll a0, a0, a7 +; RV32I-NEXT: beqz a7, .LBB25_11 +; RV32I-NEXT: .LBB25_10: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB25_11: +; RV32I-NEXT: or a0, a3, a5 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: rotr_64_zext: +; RV64I: # %bb.0: +; RV64I-NEXT: li a2, 64 +; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: srl a1, a0, a1 +; RV64I-NEXT: sll a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: rotr_64_zext: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: li a5, 32 +; RV32ZBB-NEXT: neg a6, a2 +; RV32ZBB-NEXT: sll a4, a1, a6 +; RV32ZBB-NEXT: bltu a2, a5, .LBB25_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: srl a7, a1, a2 +; RV32ZBB-NEXT: j .LBB25_3 +; RV32ZBB-NEXT: .LBB25_2: +; RV32ZBB-NEXT: srl a3, a0, a2 +; RV32ZBB-NEXT: or a7, a3, a4 +; RV32ZBB-NEXT: .LBB25_3: +; RV32ZBB-NEXT: li t0, 64 +; RV32ZBB-NEXT: mv a3, a0 +; RV32ZBB-NEXT: beqz a2, .LBB25_5 +; RV32ZBB-NEXT: # %bb.4: +; RV32ZBB-NEXT: mv a3, a7 +; RV32ZBB-NEXT: .LBB25_5: +; RV32ZBB-NEXT: sub a7, t0, 
a2 +; RV32ZBB-NEXT: bltu a2, a5, .LBB25_8 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: li a2, 0 +; RV32ZBB-NEXT: bgeu a7, a5, .LBB25_9 +; RV32ZBB-NEXT: .LBB25_7: +; RV32ZBB-NEXT: sll a5, a0, a6 +; RV32ZBB-NEXT: neg a6, a7 +; RV32ZBB-NEXT: srl a0, a0, a6 +; RV32ZBB-NEXT: or a0, a0, a4 +; RV32ZBB-NEXT: bnez a7, .LBB25_10 +; RV32ZBB-NEXT: j .LBB25_11 +; RV32ZBB-NEXT: .LBB25_8: +; RV32ZBB-NEXT: srl a2, a1, a2 +; RV32ZBB-NEXT: bltu a7, a5, .LBB25_7 +; RV32ZBB-NEXT: .LBB25_9: +; RV32ZBB-NEXT: li a5, 0 +; RV32ZBB-NEXT: sll a0, a0, a7 +; RV32ZBB-NEXT: beqz a7, .LBB25_11 +; RV32ZBB-NEXT: .LBB25_10: +; RV32ZBB-NEXT: mv a1, a0 +; RV32ZBB-NEXT: .LBB25_11: +; RV32ZBB-NEXT: or a0, a3, a5 +; RV32ZBB-NEXT: or a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: rotr_64_zext: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: li a2, 64 +; RV64ZBB-NEXT: sub a2, a2, a1 +; RV64ZBB-NEXT: srl a1, a0, a1 +; RV64ZBB-NEXT: sll a0, a0, a2 +; RV64ZBB-NEXT: or a0, a1, a0 +; RV64ZBB-NEXT: ret +; +; RV32XTHEADBB-LABEL: rotr_64_zext: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: li a5, 32 +; RV32XTHEADBB-NEXT: neg a6, a2 +; RV32XTHEADBB-NEXT: sll a4, a1, a6 +; RV32XTHEADBB-NEXT: bltu a2, a5, .LBB25_2 +; RV32XTHEADBB-NEXT: # %bb.1: +; RV32XTHEADBB-NEXT: srl a7, a1, a2 +; RV32XTHEADBB-NEXT: j .LBB25_3 +; RV32XTHEADBB-NEXT: .LBB25_2: +; RV32XTHEADBB-NEXT: srl a3, a0, a2 +; RV32XTHEADBB-NEXT: or a7, a3, a4 +; RV32XTHEADBB-NEXT: .LBB25_3: +; RV32XTHEADBB-NEXT: li t0, 64 +; RV32XTHEADBB-NEXT: mv a3, a0 +; RV32XTHEADBB-NEXT: beqz a2, .LBB25_5 +; RV32XTHEADBB-NEXT: # %bb.4: +; RV32XTHEADBB-NEXT: mv a3, a7 +; RV32XTHEADBB-NEXT: .LBB25_5: +; RV32XTHEADBB-NEXT: sub a7, t0, a2 +; RV32XTHEADBB-NEXT: bltu a2, a5, .LBB25_8 +; RV32XTHEADBB-NEXT: # %bb.6: +; RV32XTHEADBB-NEXT: li a2, 0 +; RV32XTHEADBB-NEXT: bgeu a7, a5, .LBB25_9 +; RV32XTHEADBB-NEXT: .LBB25_7: +; RV32XTHEADBB-NEXT: sll a5, a0, a6 +; RV32XTHEADBB-NEXT: neg a6, a7 +; RV32XTHEADBB-NEXT: srl a0, a0, a6 +; RV32XTHEADBB-NEXT: or a0, a0, a4 +; RV32XTHEADBB-NEXT: bnez a7, .LBB25_10 +; RV32XTHEADBB-NEXT: j .LBB25_11 +; RV32XTHEADBB-NEXT: .LBB25_8: +; RV32XTHEADBB-NEXT: srl a2, a1, a2 +; RV32XTHEADBB-NEXT: bltu a7, a5, .LBB25_7 +; RV32XTHEADBB-NEXT: .LBB25_9: +; RV32XTHEADBB-NEXT: li a5, 0 +; RV32XTHEADBB-NEXT: sll a0, a0, a7 +; RV32XTHEADBB-NEXT: beqz a7, .LBB25_11 +; RV32XTHEADBB-NEXT: .LBB25_10: +; RV32XTHEADBB-NEXT: mv a1, a0 +; RV32XTHEADBB-NEXT: .LBB25_11: +; RV32XTHEADBB-NEXT: or a0, a3, a5 +; RV32XTHEADBB-NEXT: or a1, a2, a1 +; RV32XTHEADBB-NEXT: ret +; +; RV64XTHEADBB-LABEL: rotr_64_zext: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: li a2, 64 +; RV64XTHEADBB-NEXT: sub a2, a2, a1 +; RV64XTHEADBB-NEXT: srl a1, a0, a1 +; RV64XTHEADBB-NEXT: sll a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 +; RV64XTHEADBB-NEXT: ret + %z = sub i32 64, %y + %zext = zext i32 %z to i64 + %zexty = zext i32 %y to i64 + %b = lshr i64 %x, %zexty + %c = shl i64 %x, %zext + %d = or i64 %b, %c + ret i64 %d +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll new file mode 100644 index 0000000000000..71a5ecc77a1b0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll @@ -0,0 +1,962 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs -global-isel < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: llc -mtriple=riscv64 -verify-machineinstrs -global-isel < %s \ +; RUN: | FileCheck %s -check-prefix=RV64I + +; Basic shift support is tested as part of ALU.ll. 
This file ensures that +; shifts which may not be supported natively are lowered properly. + +declare i64 @llvm.fshr.i64(i64, i64, i64) +declare i128 @llvm.fshr.i128(i128, i128, i128) + +define i64 @lshr64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: lshr64: +; RV32I: # %bb.0: +; RV32I-NEXT: li a3, 32 +; RV32I-NEXT: bltu a2, a3, .LBB0_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a4, a1, a2 +; RV32I-NEXT: bnez a2, .LBB0_3 +; RV32I-NEXT: j .LBB0_4 +; RV32I-NEXT: .LBB0_2: +; RV32I-NEXT: srl a4, a0, a2 +; RV32I-NEXT: neg a5, a2 +; RV32I-NEXT: sll a5, a1, a5 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: beqz a2, .LBB0_4 +; RV32I-NEXT: .LBB0_3: +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: .LBB0_4: +; RV32I-NEXT: bltu a2, a3, .LBB0_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB0_6: +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: lshr64: +; RV64I: # %bb.0: +; RV64I-NEXT: srl a0, a0, a1 +; RV64I-NEXT: ret + %1 = lshr i64 %a, %b + ret i64 %1 +} + +define i64 @lshr64_minsize(i64 %a, i64 %b) minsize nounwind { +; RV32I-LABEL: lshr64_minsize: +; RV32I: # %bb.0: +; RV32I-NEXT: li a3, 32 +; RV32I-NEXT: bltu a2, a3, .LBB1_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a4, a1, a2 +; RV32I-NEXT: bnez a2, .LBB1_3 +; RV32I-NEXT: j .LBB1_4 +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: srl a4, a0, a2 +; RV32I-NEXT: neg a5, a2 +; RV32I-NEXT: sll a5, a1, a5 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: beqz a2, .LBB1_4 +; RV32I-NEXT: .LBB1_3: +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: .LBB1_4: +; RV32I-NEXT: bltu a2, a3, .LBB1_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB1_6: +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: lshr64_minsize: +; RV64I: # %bb.0: +; RV64I-NEXT: srl a0, a0, a1 +; RV64I-NEXT: ret + %1 = lshr i64 %a, %b + ret i64 %1 +} + +define i64 @ashr64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: ashr64: +; RV32I: # %bb.0: +; RV32I-NEXT: li a3, 32 +; RV32I-NEXT: bltu a2, a3, .LBB2_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sra a4, a1, a2 +; RV32I-NEXT: bnez a2, .LBB2_3 +; RV32I-NEXT: j .LBB2_4 +; RV32I-NEXT: .LBB2_2: +; RV32I-NEXT: srl a4, a0, a2 +; RV32I-NEXT: neg a5, a2 +; RV32I-NEXT: sll a5, a1, a5 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: beqz a2, .LBB2_4 +; RV32I-NEXT: .LBB2_3: +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: .LBB2_4: +; RV32I-NEXT: bltu a2, a3, .LBB2_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB2_6: +; RV32I-NEXT: sra a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ashr64: +; RV64I: # %bb.0: +; RV64I-NEXT: sra a0, a0, a1 +; RV64I-NEXT: ret + %1 = ashr i64 %a, %b + ret i64 %1 +} + +define i64 @ashr64_minsize(i64 %a, i64 %b) minsize nounwind { +; RV32I-LABEL: ashr64_minsize: +; RV32I: # %bb.0: +; RV32I-NEXT: li a3, 32 +; RV32I-NEXT: bltu a2, a3, .LBB3_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sra a4, a1, a2 +; RV32I-NEXT: bnez a2, .LBB3_3 +; RV32I-NEXT: j .LBB3_4 +; RV32I-NEXT: .LBB3_2: +; RV32I-NEXT: srl a4, a0, a2 +; RV32I-NEXT: neg a5, a2 +; RV32I-NEXT: sll a5, a1, a5 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: beqz a2, .LBB3_4 +; RV32I-NEXT: .LBB3_3: +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: .LBB3_4: +; RV32I-NEXT: bltu a2, a3, .LBB3_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB3_6: +; RV32I-NEXT: sra a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ashr64_minsize: +; RV64I: # %bb.0: +; RV64I-NEXT: sra a0, a0, a1 +; RV64I-NEXT: ret + %1 = ashr i64 %a, %b + ret i64 %1 +} + +define i64 
@shl64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: shl64: +; RV32I: # %bb.0: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: bltu a2, a0, .LBB4_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: sll a3, a3, a2 +; RV32I-NEXT: bnez a2, .LBB4_3 +; RV32I-NEXT: j .LBB4_4 +; RV32I-NEXT: .LBB4_2: +; RV32I-NEXT: sll a0, a3, a2 +; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: srl a3, a3, a4 +; RV32I-NEXT: sll a4, a1, a2 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: beqz a2, .LBB4_4 +; RV32I-NEXT: .LBB4_3: +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: .LBB4_4: +; RV32I-NEXT: ret +; +; RV64I-LABEL: shl64: +; RV64I: # %bb.0: +; RV64I-NEXT: sll a0, a0, a1 +; RV64I-NEXT: ret + %1 = shl i64 %a, %b + ret i64 %1 +} + +define i64 @shl64_minsize(i64 %a, i64 %b) minsize nounwind { +; RV32I-LABEL: shl64_minsize: +; RV32I: # %bb.0: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: bltu a2, a0, .LBB5_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: sll a3, a3, a2 +; RV32I-NEXT: bnez a2, .LBB5_3 +; RV32I-NEXT: j .LBB5_4 +; RV32I-NEXT: .LBB5_2: +; RV32I-NEXT: sll a0, a3, a2 +; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: srl a3, a3, a4 +; RV32I-NEXT: sll a4, a1, a2 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: beqz a2, .LBB5_4 +; RV32I-NEXT: .LBB5_3: +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: .LBB5_4: +; RV32I-NEXT: ret +; +; RV64I-LABEL: shl64_minsize: +; RV64I: # %bb.0: +; RV64I-NEXT: sll a0, a0, a1 +; RV64I-NEXT: ret + %1 = shl i64 %a, %b + ret i64 %1 +} + +define i128 @lshr128(i128 %a, i128 %b) nounwind { +; RV32I-LABEL: lshr128: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a2, 0(a2) +; RV32I-NEXT: lw a3, 8(a1) +; RV32I-NEXT: lw a7, 12(a1) +; RV32I-NEXT: li t0, 32 +; RV32I-NEXT: srl t2, a3, a2 +; RV32I-NEXT: neg t6, a2 +; RV32I-NEXT: sll t5, a7, t6 +; RV32I-NEXT: bltu a2, t0, .LBB6_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a5, a7, a2 +; RV32I-NEXT: mv a4, a3 +; RV32I-NEXT: bnez a2, .LBB6_3 +; RV32I-NEXT: j .LBB6_4 +; RV32I-NEXT: .LBB6_2: +; RV32I-NEXT: or a5, t2, t5 +; RV32I-NEXT: mv a4, a3 +; RV32I-NEXT: beqz a2, .LBB6_4 +; RV32I-NEXT: .LBB6_3: +; RV32I-NEXT: mv a4, a5 +; RV32I-NEXT: .LBB6_4: +; RV32I-NEXT: lw a5, 0(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: bltu a2, t0, .LBB6_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: srl t4, a1, a2 +; RV32I-NEXT: j .LBB6_7 +; RV32I-NEXT: .LBB6_6: +; RV32I-NEXT: srl a6, a7, a2 +; RV32I-NEXT: srl t1, a5, a2 +; RV32I-NEXT: sll t3, a1, t6 +; RV32I-NEXT: or t4, t1, t3 +; RV32I-NEXT: .LBB6_7: +; RV32I-NEXT: li t1, 64 +; RV32I-NEXT: mv t3, a5 +; RV32I-NEXT: beqz a2, .LBB6_9 +; RV32I-NEXT: # %bb.8: +; RV32I-NEXT: mv t3, t4 +; RV32I-NEXT: .LBB6_9: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: sub s0, t1, a2 +; RV32I-NEXT: bltu a2, t0, .LBB6_12 +; RV32I-NEXT: # %bb.10: +; RV32I-NEXT: li t4, 0 +; RV32I-NEXT: bgeu s0, t0, .LBB6_13 +; RV32I-NEXT: .LBB6_11: +; RV32I-NEXT: sll t6, a3, t6 +; RV32I-NEXT: neg s1, s0 +; RV32I-NEXT: srl s1, a3, s1 +; RV32I-NEXT: or s2, s1, t5 +; RV32I-NEXT: j .LBB6_14 +; RV32I-NEXT: .LBB6_12: +; RV32I-NEXT: srl t4, a1, a2 +; RV32I-NEXT: bltu s0, t0, .LBB6_11 +; RV32I-NEXT: .LBB6_13: +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: sll s2, a3, s0 +; RV32I-NEXT: .LBB6_14: +; RV32I-NEXT: addi s1, a2, -64 +; RV32I-NEXT: mv t5, a7 +; RV32I-NEXT: beqz s0, .LBB6_16 +; RV32I-NEXT: # %bb.15: +; RV32I-NEXT: mv t5, s2 +; RV32I-NEXT: .LBB6_16: +; RV32I-NEXT: bltu s1, t0, .LBB6_18 
+; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: srl t2, a7, s1 +; RV32I-NEXT: bnez s1, .LBB6_19 +; RV32I-NEXT: j .LBB6_20 +; RV32I-NEXT: .LBB6_18: +; RV32I-NEXT: neg s0, s1 +; RV32I-NEXT: sll s0, a7, s0 +; RV32I-NEXT: or t2, t2, s0 +; RV32I-NEXT: beqz s1, .LBB6_20 +; RV32I-NEXT: .LBB6_19: +; RV32I-NEXT: mv a3, t2 +; RV32I-NEXT: .LBB6_20: +; RV32I-NEXT: bltu s1, t0, .LBB6_22 +; RV32I-NEXT: # %bb.21: +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: bltu a2, t1, .LBB6_23 +; RV32I-NEXT: j .LBB6_24 +; RV32I-NEXT: .LBB6_22: +; RV32I-NEXT: srl a7, a7, a2 +; RV32I-NEXT: bgeu a2, t1, .LBB6_24 +; RV32I-NEXT: .LBB6_23: +; RV32I-NEXT: or a3, t3, t6 +; RV32I-NEXT: or a7, t4, t5 +; RV32I-NEXT: .LBB6_24: +; RV32I-NEXT: bnez a2, .LBB6_28 +; RV32I-NEXT: # %bb.25: +; RV32I-NEXT: bltu a2, t1, .LBB6_27 +; RV32I-NEXT: .LBB6_26: +; RV32I-NEXT: li a4, 0 +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: .LBB6_27: +; RV32I-NEXT: sw a5, 0(a0) +; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a6, 12(a0) +; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB6_28: +; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: mv a1, a7 +; RV32I-NEXT: bgeu a2, t1, .LBB6_26 +; RV32I-NEXT: j .LBB6_27 +; +; RV64I-LABEL: lshr128: +; RV64I: # %bb.0: +; RV64I-NEXT: li a3, 64 +; RV64I-NEXT: bltu a2, a3, .LBB6_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a4, a2, a3 +; RV64I-NEXT: srl a4, a1, a4 +; RV64I-NEXT: bnez a2, .LBB6_3 +; RV64I-NEXT: j .LBB6_4 +; RV64I-NEXT: .LBB6_2: +; RV64I-NEXT: srl a4, a0, a2 +; RV64I-NEXT: negw a5, a2 +; RV64I-NEXT: sll a5, a1, a5 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: beqz a2, .LBB6_4 +; RV64I-NEXT: .LBB6_3: +; RV64I-NEXT: mv a0, a4 +; RV64I-NEXT: .LBB6_4: +; RV64I-NEXT: bltu a2, a3, .LBB6_6 +; RV64I-NEXT: # %bb.5: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB6_6: +; RV64I-NEXT: srl a1, a1, a2 +; RV64I-NEXT: ret + %1 = lshr i128 %a, %b + ret i128 %1 +} + +define i128 @ashr128(i128 %a, i128 %b) nounwind { +; RV32I-LABEL: ashr128: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a2, 0(a2) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a3, 12(a1) +; RV32I-NEXT: li t0, 32 +; RV32I-NEXT: srl t2, a4, a2 +; RV32I-NEXT: neg t6, a2 +; RV32I-NEXT: sll t5, a3, t6 +; RV32I-NEXT: bltu a2, t0, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sra a6, a3, a2 +; RV32I-NEXT: mv a5, a4 +; RV32I-NEXT: bnez a2, .LBB7_3 +; RV32I-NEXT: j .LBB7_4 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: or a6, t2, t5 +; RV32I-NEXT: mv a5, a4 +; RV32I-NEXT: beqz a2, .LBB7_4 +; RV32I-NEXT: .LBB7_3: +; RV32I-NEXT: mv a5, a6 +; RV32I-NEXT: .LBB7_4: +; RV32I-NEXT: lw a6, 0(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: bltu a2, t0, .LBB7_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: srai a7, a3, 31 +; RV32I-NEXT: srl t4, a1, a2 +; RV32I-NEXT: j .LBB7_7 +; RV32I-NEXT: .LBB7_6: +; RV32I-NEXT: sra a7, a3, a2 +; RV32I-NEXT: srl t1, a6, a2 +; RV32I-NEXT: sll t3, a1, t6 +; RV32I-NEXT: or t4, t1, t3 +; RV32I-NEXT: .LBB7_7: +; RV32I-NEXT: li t1, 64 +; RV32I-NEXT: mv t3, a6 +; RV32I-NEXT: beqz a2, .LBB7_9 +; RV32I-NEXT: # %bb.8: +; RV32I-NEXT: mv t3, t4 +; RV32I-NEXT: .LBB7_9: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: sub s0, t1, a2 +; RV32I-NEXT: bltu a2, t0, .LBB7_12 +; RV32I-NEXT: # %bb.10: +; RV32I-NEXT: li t4, 0 +; RV32I-NEXT: bgeu s0, t0, .LBB7_13 +; 
RV32I-NEXT: .LBB7_11: +; RV32I-NEXT: sll t6, a4, t6 +; RV32I-NEXT: neg s1, s0 +; RV32I-NEXT: srl s1, a4, s1 +; RV32I-NEXT: or s2, s1, t5 +; RV32I-NEXT: j .LBB7_14 +; RV32I-NEXT: .LBB7_12: +; RV32I-NEXT: srl t4, a1, a2 +; RV32I-NEXT: bltu s0, t0, .LBB7_11 +; RV32I-NEXT: .LBB7_13: +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: sll s2, a4, s0 +; RV32I-NEXT: .LBB7_14: +; RV32I-NEXT: addi s1, a2, -64 +; RV32I-NEXT: mv t5, a3 +; RV32I-NEXT: beqz s0, .LBB7_16 +; RV32I-NEXT: # %bb.15: +; RV32I-NEXT: mv t5, s2 +; RV32I-NEXT: .LBB7_16: +; RV32I-NEXT: bltu s1, t0, .LBB7_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: sra t2, a3, s1 +; RV32I-NEXT: bnez s1, .LBB7_19 +; RV32I-NEXT: j .LBB7_20 +; RV32I-NEXT: .LBB7_18: +; RV32I-NEXT: neg s0, s1 +; RV32I-NEXT: sll s0, a3, s0 +; RV32I-NEXT: or t2, t2, s0 +; RV32I-NEXT: beqz s1, .LBB7_20 +; RV32I-NEXT: .LBB7_19: +; RV32I-NEXT: mv a4, t2 +; RV32I-NEXT: .LBB7_20: +; RV32I-NEXT: bltu s1, t0, .LBB7_22 +; RV32I-NEXT: # %bb.21: +; RV32I-NEXT: srai t0, a3, 31 +; RV32I-NEXT: bltu a2, t1, .LBB7_23 +; RV32I-NEXT: j .LBB7_24 +; RV32I-NEXT: .LBB7_22: +; RV32I-NEXT: sra t0, a3, a2 +; RV32I-NEXT: bgeu a2, t1, .LBB7_24 +; RV32I-NEXT: .LBB7_23: +; RV32I-NEXT: or a4, t3, t6 +; RV32I-NEXT: or t0, t4, t5 +; RV32I-NEXT: .LBB7_24: +; RV32I-NEXT: bnez a2, .LBB7_28 +; RV32I-NEXT: # %bb.25: +; RV32I-NEXT: bltu a2, t1, .LBB7_27 +; RV32I-NEXT: .LBB7_26: +; RV32I-NEXT: srai a5, a3, 31 +; RV32I-NEXT: mv a7, a5 +; RV32I-NEXT: .LBB7_27: +; RV32I-NEXT: sw a6, 0(a0) +; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a5, 8(a0) +; RV32I-NEXT: sw a7, 12(a0) +; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB7_28: +; RV32I-NEXT: mv a6, a4 +; RV32I-NEXT: mv a1, t0 +; RV32I-NEXT: bgeu a2, t1, .LBB7_26 +; RV32I-NEXT: j .LBB7_27 +; +; RV64I-LABEL: ashr128: +; RV64I: # %bb.0: +; RV64I-NEXT: li a3, 64 +; RV64I-NEXT: bltu a2, a3, .LBB7_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a4, a2, a3 +; RV64I-NEXT: sra a4, a1, a4 +; RV64I-NEXT: bnez a2, .LBB7_3 +; RV64I-NEXT: j .LBB7_4 +; RV64I-NEXT: .LBB7_2: +; RV64I-NEXT: srl a4, a0, a2 +; RV64I-NEXT: negw a5, a2 +; RV64I-NEXT: sll a5, a1, a5 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: beqz a2, .LBB7_4 +; RV64I-NEXT: .LBB7_3: +; RV64I-NEXT: mv a0, a4 +; RV64I-NEXT: .LBB7_4: +; RV64I-NEXT: bltu a2, a3, .LBB7_6 +; RV64I-NEXT: # %bb.5: +; RV64I-NEXT: srai a1, a1, 63 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB7_6: +; RV64I-NEXT: sra a1, a1, a2 +; RV64I-NEXT: ret + %1 = ashr i128 %a, %b + ret i128 %1 +} + +define i128 @shl128(i128 %a, i128 %b) nounwind { +; RV32I-LABEL: shl128: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a2, 0(a2) +; RV32I-NEXT: lw a7, 0(a1) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: li a6, 64 +; RV32I-NEXT: li t1, 32 +; RV32I-NEXT: neg t5, a2 +; RV32I-NEXT: srl t2, a7, t5 +; RV32I-NEXT: sll t0, a3, a2 +; RV32I-NEXT: bltu a2, t1, .LBB8_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a4, 0 +; RV32I-NEXT: sll t3, a7, a2 +; RV32I-NEXT: j .LBB8_3 +; RV32I-NEXT: .LBB8_2: +; RV32I-NEXT: sll a4, a7, a2 +; RV32I-NEXT: or t3, t2, t0 +; RV32I-NEXT: .LBB8_3: +; RV32I-NEXT: sub t4, a6, a2 +; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: beqz a2, .LBB8_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a5, t3 +; RV32I-NEXT: .LBB8_5: +; RV32I-NEXT: bltu t4, t1, .LBB8_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srl t2, a3, t4 +; RV32I-NEXT: mv t3, a7 +; RV32I-NEXT: bnez t4, .LBB8_8 +; RV32I-NEXT: j .LBB8_9 +; RV32I-NEXT: .LBB8_7: +; RV32I-NEXT: 
neg t3, t4 +; RV32I-NEXT: sll t3, a3, t3 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: mv t3, a7 +; RV32I-NEXT: beqz t4, .LBB8_9 +; RV32I-NEXT: .LBB8_8: +; RV32I-NEXT: mv t3, t2 +; RV32I-NEXT: .LBB8_9: +; RV32I-NEXT: bltu t4, t1, .LBB8_11 +; RV32I-NEXT: # %bb.10: +; RV32I-NEXT: li t4, 0 +; RV32I-NEXT: j .LBB8_12 +; RV32I-NEXT: .LBB8_11: +; RV32I-NEXT: srl t4, a3, t5 +; RV32I-NEXT: .LBB8_12: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: lw t2, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: bltu a2, t1, .LBB8_14 +; RV32I-NEXT: # %bb.13: +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: sll s1, t2, a2 +; RV32I-NEXT: j .LBB8_15 +; RV32I-NEXT: .LBB8_14: +; RV32I-NEXT: sll t6, t2, a2 +; RV32I-NEXT: srl t5, t2, t5 +; RV32I-NEXT: sll s0, a1, a2 +; RV32I-NEXT: or s1, t5, s0 +; RV32I-NEXT: .LBB8_15: +; RV32I-NEXT: addi s0, a2, -64 +; RV32I-NEXT: mv t5, a1 +; RV32I-NEXT: beqz a2, .LBB8_17 +; RV32I-NEXT: # %bb.16: +; RV32I-NEXT: mv t5, s1 +; RV32I-NEXT: .LBB8_17: +; RV32I-NEXT: bltu s0, t1, .LBB8_19 +; RV32I-NEXT: # %bb.18: +; RV32I-NEXT: li t1, 0 +; RV32I-NEXT: sll a7, a7, s0 +; RV32I-NEXT: bnez s0, .LBB8_20 +; RV32I-NEXT: j .LBB8_21 +; RV32I-NEXT: .LBB8_19: +; RV32I-NEXT: sll t1, a7, a2 +; RV32I-NEXT: neg s1, s0 +; RV32I-NEXT: srl a7, a7, s1 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: beqz s0, .LBB8_21 +; RV32I-NEXT: .LBB8_20: +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: .LBB8_21: +; RV32I-NEXT: bltu a2, a6, .LBB8_23 +; RV32I-NEXT: # %bb.22: +; RV32I-NEXT: li a4, 0 +; RV32I-NEXT: li a5, 0 +; RV32I-NEXT: bnez a2, .LBB8_24 +; RV32I-NEXT: j .LBB8_25 +; RV32I-NEXT: .LBB8_23: +; RV32I-NEXT: or t1, t3, t6 +; RV32I-NEXT: or a3, t4, t5 +; RV32I-NEXT: beqz a2, .LBB8_25 +; RV32I-NEXT: .LBB8_24: +; RV32I-NEXT: mv t2, t1 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: .LBB8_25: +; RV32I-NEXT: sw a4, 0(a0) +; RV32I-NEXT: sw a5, 4(a0) +; RV32I-NEXT: sw t2, 8(a0) +; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: shl128: +; RV64I: # %bb.0: +; RV64I-NEXT: mv a3, a0 +; RV64I-NEXT: li a4, 64 +; RV64I-NEXT: bltu a2, a4, .LBB8_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: sub a4, a2, a4 +; RV64I-NEXT: sll a3, a3, a4 +; RV64I-NEXT: bnez a2, .LBB8_3 +; RV64I-NEXT: j .LBB8_4 +; RV64I-NEXT: .LBB8_2: +; RV64I-NEXT: sll a0, a3, a2 +; RV64I-NEXT: negw a4, a2 +; RV64I-NEXT: srl a3, a3, a4 +; RV64I-NEXT: sll a4, a1, a2 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: beqz a2, .LBB8_4 +; RV64I-NEXT: .LBB8_3: +; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: .LBB8_4: +; RV64I-NEXT: ret + %1 = shl i128 %a, %b + ret i128 %1 +} + +define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind { +; RV32I-LABEL: fshr64_minsize: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a5, a2, 63 +; RV32I-NEXT: li a4, 32 +; RV32I-NEXT: bltu a5, a4, .LBB9_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a6, a1, a5 +; RV32I-NEXT: j .LBB9_3 +; RV32I-NEXT: .LBB9_2: +; RV32I-NEXT: srl a3, a0, a2 +; RV32I-NEXT: neg a6, a5 +; RV32I-NEXT: sll a6, a1, a6 +; RV32I-NEXT: or a6, a3, a6 +; RV32I-NEXT: .LBB9_3: +; RV32I-NEXT: mv a3, a0 +; RV32I-NEXT: beqz a5, .LBB9_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a3, a6 +; RV32I-NEXT: .LBB9_5: +; RV32I-NEXT: neg a6, a2 +; RV32I-NEXT: bltu a5, a4, .LBB9_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: li a2, 0 +; RV32I-NEXT: j .LBB9_8 +; RV32I-NEXT: .LBB9_7: +; RV32I-NEXT: srl a2, a1, a2 +; RV32I-NEXT: 
.LBB9_8: +; RV32I-NEXT: andi a5, a6, 63 +; RV32I-NEXT: bltu a5, a4, .LBB9_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: li a4, 0 +; RV32I-NEXT: sll a0, a0, a5 +; RV32I-NEXT: bnez a5, .LBB9_11 +; RV32I-NEXT: j .LBB9_12 +; RV32I-NEXT: .LBB9_10: +; RV32I-NEXT: sll a4, a0, a6 +; RV32I-NEXT: neg a7, a5 +; RV32I-NEXT: srl a0, a0, a7 +; RV32I-NEXT: sll a6, a1, a6 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: beqz a5, .LBB9_12 +; RV32I-NEXT: .LBB9_11: +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB9_12: +; RV32I-NEXT: or a0, a3, a4 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fshr64_minsize: +; RV64I: # %bb.0: +; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: srl a1, a0, a1 +; RV64I-NEXT: sll a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret + %res = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b) + ret i64 %res +} + +define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { +; RV32I-LABEL: fshr128_minsize: +; RV32I: # %bb.0: +; RV32I-NEXT: lw t3, 0(a2) +; RV32I-NEXT: lw a2, 8(a1) +; RV32I-NEXT: lw a3, 12(a1) +; RV32I-NEXT: andi t4, t3, 127 +; RV32I-NEXT: li a6, 32 +; RV32I-NEXT: neg t6, t4 +; RV32I-NEXT: sll t5, a3, t6 +; RV32I-NEXT: bltu t4, a6, .LBB10_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a5, a3, t4 +; RV32I-NEXT: j .LBB10_3 +; RV32I-NEXT: .LBB10_2: +; RV32I-NEXT: srl a4, a2, t3 +; RV32I-NEXT: or a5, a4, t5 +; RV32I-NEXT: .LBB10_3: +; RV32I-NEXT: mv a4, a2 +; RV32I-NEXT: beqz t4, .LBB10_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a4, a5 +; RV32I-NEXT: .LBB10_5: +; RV32I-NEXT: lw a7, 0(a1) +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: bltu t4, a6, .LBB10_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: srl t2, a5, t4 +; RV32I-NEXT: j .LBB10_8 +; RV32I-NEXT: .LBB10_7: +; RV32I-NEXT: srl a1, a3, t3 +; RV32I-NEXT: srl t0, a7, t3 +; RV32I-NEXT: sll t1, a5, t6 +; RV32I-NEXT: or t2, t0, t1 +; RV32I-NEXT: .LBB10_8: +; RV32I-NEXT: li t0, 64 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: beqz t4, .LBB10_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv t1, t2 +; RV32I-NEXT: .LBB10_10: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sub s0, t0, t4 +; RV32I-NEXT: bltu t4, a6, .LBB10_13 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li t2, 0 +; RV32I-NEXT: bgeu s0, a6, .LBB10_14 +; RV32I-NEXT: .LBB10_12: +; RV32I-NEXT: sll t6, a2, t6 +; RV32I-NEXT: neg s1, s0 +; RV32I-NEXT: srl s1, a2, s1 +; RV32I-NEXT: or s2, s1, t5 +; RV32I-NEXT: j .LBB10_15 +; RV32I-NEXT: .LBB10_13: +; RV32I-NEXT: srl t2, a5, t3 +; RV32I-NEXT: bltu s0, a6, .LBB10_12 +; RV32I-NEXT: .LBB10_14: +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: sll s2, a2, s0 +; RV32I-NEXT: .LBB10_15: +; RV32I-NEXT: addi s1, t4, -64 +; RV32I-NEXT: mv t5, a3 +; RV32I-NEXT: beqz s0, .LBB10_17 +; RV32I-NEXT: # %bb.16: +; RV32I-NEXT: mv t5, s2 +; RV32I-NEXT: .LBB10_17: +; RV32I-NEXT: bltu s1, a6, .LBB10_19 +; RV32I-NEXT: # %bb.18: +; RV32I-NEXT: srl s2, a3, s1 +; RV32I-NEXT: j .LBB10_20 +; RV32I-NEXT: .LBB10_19: +; RV32I-NEXT: srl s0, a2, t4 +; RV32I-NEXT: neg s2, s1 +; RV32I-NEXT: sll s2, a3, s2 +; RV32I-NEXT: or s2, s0, s2 +; RV32I-NEXT: .LBB10_20: +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: beqz s1, .LBB10_22 +; RV32I-NEXT: # %bb.21: +; RV32I-NEXT: mv s0, s2 +; RV32I-NEXT: .LBB10_22: +; RV32I-NEXT: bltu s1, a6, .LBB10_24 +; RV32I-NEXT: # %bb.23: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: bltu t4, t0, 
.LBB10_25 +; RV32I-NEXT: j .LBB10_26 +; RV32I-NEXT: .LBB10_24: +; RV32I-NEXT: srl s1, a3, t4 +; RV32I-NEXT: bgeu t4, t0, .LBB10_26 +; RV32I-NEXT: .LBB10_25: +; RV32I-NEXT: or s0, t1, t6 +; RV32I-NEXT: or s1, t2, t5 +; RV32I-NEXT: .LBB10_26: +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: mv t2, a5 +; RV32I-NEXT: beqz t4, .LBB10_28 +; RV32I-NEXT: # %bb.27: +; RV32I-NEXT: mv t1, s0 +; RV32I-NEXT: mv t2, s1 +; RV32I-NEXT: .LBB10_28: +; RV32I-NEXT: neg t6, t3 +; RV32I-NEXT: bltu t4, t0, .LBB10_30 +; RV32I-NEXT: # %bb.29: +; RV32I-NEXT: li a4, 0 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: .LBB10_30: +; RV32I-NEXT: andi t3, t6, 127 +; RV32I-NEXT: neg s2, t3 +; RV32I-NEXT: srl s0, a7, s2 +; RV32I-NEXT: bltu t3, a6, .LBB10_32 +; RV32I-NEXT: # %bb.31: +; RV32I-NEXT: li t4, 0 +; RV32I-NEXT: sll s3, a7, t3 +; RV32I-NEXT: j .LBB10_33 +; RV32I-NEXT: .LBB10_32: +; RV32I-NEXT: sll t4, a7, t6 +; RV32I-NEXT: sll t5, a5, t6 +; RV32I-NEXT: or s3, s0, t5 +; RV32I-NEXT: .LBB10_33: +; RV32I-NEXT: sub s1, t0, t3 +; RV32I-NEXT: mv t5, a5 +; RV32I-NEXT: beqz t3, .LBB10_35 +; RV32I-NEXT: # %bb.34: +; RV32I-NEXT: mv t5, s3 +; RV32I-NEXT: .LBB10_35: +; RV32I-NEXT: bltu s1, a6, .LBB10_37 +; RV32I-NEXT: # %bb.36: +; RV32I-NEXT: srl s3, a5, s1 +; RV32I-NEXT: j .LBB10_38 +; RV32I-NEXT: .LBB10_37: +; RV32I-NEXT: neg s3, s1 +; RV32I-NEXT: sll s3, a5, s3 +; RV32I-NEXT: or s3, s0, s3 +; RV32I-NEXT: .LBB10_38: +; RV32I-NEXT: mv s0, a7 +; RV32I-NEXT: beqz s1, .LBB10_40 +; RV32I-NEXT: # %bb.39: +; RV32I-NEXT: mv s0, s3 +; RV32I-NEXT: .LBB10_40: +; RV32I-NEXT: bltu s1, a6, .LBB10_43 +; RV32I-NEXT: # %bb.41: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: bgeu t3, a6, .LBB10_44 +; RV32I-NEXT: .LBB10_42: +; RV32I-NEXT: sll s3, a2, t6 +; RV32I-NEXT: srl s2, a2, s2 +; RV32I-NEXT: sll t6, a3, t6 +; RV32I-NEXT: or s4, s2, t6 +; RV32I-NEXT: j .LBB10_45 +; RV32I-NEXT: .LBB10_43: +; RV32I-NEXT: srl s1, a5, s2 +; RV32I-NEXT: bltu t3, a6, .LBB10_42 +; RV32I-NEXT: .LBB10_44: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll s4, a2, t3 +; RV32I-NEXT: .LBB10_45: +; RV32I-NEXT: addi s2, t3, -64 +; RV32I-NEXT: mv t6, a3 +; RV32I-NEXT: beqz t3, .LBB10_47 +; RV32I-NEXT: # %bb.46: +; RV32I-NEXT: mv t6, s4 +; RV32I-NEXT: .LBB10_47: +; RV32I-NEXT: bltu s2, a6, .LBB10_49 +; RV32I-NEXT: # %bb.48: +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: sll a7, a7, s2 +; RV32I-NEXT: bnez s2, .LBB10_50 +; RV32I-NEXT: j .LBB10_51 +; RV32I-NEXT: .LBB10_49: +; RV32I-NEXT: sll a6, a7, t3 +; RV32I-NEXT: neg s4, s2 +; RV32I-NEXT: srl a7, a7, s4 +; RV32I-NEXT: sll s4, a5, t3 +; RV32I-NEXT: or a7, a7, s4 +; RV32I-NEXT: beqz s2, .LBB10_51 +; RV32I-NEXT: .LBB10_50: +; RV32I-NEXT: mv a5, a7 +; RV32I-NEXT: .LBB10_51: +; RV32I-NEXT: bltu t3, t0, .LBB10_53 +; RV32I-NEXT: # %bb.52: +; RV32I-NEXT: li t4, 0 +; RV32I-NEXT: li t5, 0 +; RV32I-NEXT: bnez t3, .LBB10_54 +; RV32I-NEXT: j .LBB10_55 +; RV32I-NEXT: .LBB10_53: +; RV32I-NEXT: or a6, s0, s3 +; RV32I-NEXT: or a5, s1, t6 +; RV32I-NEXT: beqz t3, .LBB10_55 +; RV32I-NEXT: .LBB10_54: +; RV32I-NEXT: mv a2, a6 +; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: .LBB10_55: +; RV32I-NEXT: or a5, t1, t4 +; RV32I-NEXT: or a6, t2, t5 +; RV32I-NEXT: or a2, a4, a2 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: sw a5, 0(a0) +; RV32I-NEXT: sw a6, 4(a0) +; RV32I-NEXT: sw a2, 8(a0) +; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 32 
+; RV32I-NEXT: ret +; +; RV64I-LABEL: fshr128_minsize: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a5, a2, 127 +; RV64I-NEXT: li a4, 64 +; RV64I-NEXT: bltu a5, a4, .LBB10_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a3, a5, a4 +; RV64I-NEXT: srl a6, a1, a3 +; RV64I-NEXT: j .LBB10_3 +; RV64I-NEXT: .LBB10_2: +; RV64I-NEXT: srl a3, a0, a2 +; RV64I-NEXT: negw a6, a5 +; RV64I-NEXT: sll a6, a1, a6 +; RV64I-NEXT: or a6, a3, a6 +; RV64I-NEXT: .LBB10_3: +; RV64I-NEXT: mv a3, a0 +; RV64I-NEXT: beqz a5, .LBB10_5 +; RV64I-NEXT: # %bb.4: +; RV64I-NEXT: mv a3, a6 +; RV64I-NEXT: .LBB10_5: +; RV64I-NEXT: neg a7, a2 +; RV64I-NEXT: bltu a5, a4, .LBB10_7 +; RV64I-NEXT: # %bb.6: +; RV64I-NEXT: li a2, 0 +; RV64I-NEXT: j .LBB10_8 +; RV64I-NEXT: .LBB10_7: +; RV64I-NEXT: srl a2, a1, a2 +; RV64I-NEXT: .LBB10_8: +; RV64I-NEXT: andi a6, a7, 127 +; RV64I-NEXT: bltu a6, a4, .LBB10_10 +; RV64I-NEXT: # %bb.9: +; RV64I-NEXT: li a5, 0 +; RV64I-NEXT: sub a4, a6, a4 +; RV64I-NEXT: sll a0, a0, a4 +; RV64I-NEXT: bnez a6, .LBB10_11 +; RV64I-NEXT: j .LBB10_12 +; RV64I-NEXT: .LBB10_10: +; RV64I-NEXT: sll a5, a0, a7 +; RV64I-NEXT: negw a4, a6 +; RV64I-NEXT: srl a0, a0, a4 +; RV64I-NEXT: sll a4, a1, a7 +; RV64I-NEXT: or a0, a0, a4 +; RV64I-NEXT: beqz a6, .LBB10_12 +; RV64I-NEXT: .LBB10_11: +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: .LBB10_12: +; RV64I-NEXT: or a0, a3, a5 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: ret + %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %b) + ret i128 %res +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll new file mode 100644 index 0000000000000..4ede693242898 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -0,0 +1,10982 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -verify-machineinstrs -global-isel < %s | FileCheck %s -check-prefixes=RV64I +; RUN: llc -mtriple=riscv32 -verify-machineinstrs -global-isel < %s | FileCheck %s -check-prefixes=RV32I + +define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: lshr_4bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a0, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: srlw a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 16 +; RV64I-NEXT: slli a3, a0, 48 +; RV64I-NEXT: srliw a4, a0, 24 +; RV64I-NEXT: srli a3, a3, 48 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb a3, 1(a2) +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: sb a4, 3(a2) +; RV64I-NEXT: ret +; +; RV32I-LABEL: lshr_4bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; 
RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: slli a3, a0, 16 +; RV32I-NEXT: srli a4, a0, 24 +; RV32I-NEXT: srli a3, a3, 16 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: sb a1, 2(a2) +; RV32I-NEXT: sb a4, 3(a2) +; RV32I-NEXT: ret + %src = load i32, ptr %src.ptr, align 1 + %byteOff = load i32, ptr %byteOff.ptr, align 1 + %bitOff = shl i32 %byteOff, 3 + %res = lshr i32 %src, %bitOff + store i32 %res, ptr %dst, align 1 + ret void +} +define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: shl_4bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a0, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: sllw a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 16 +; RV64I-NEXT: slli a3, a0, 48 +; RV64I-NEXT: srliw a4, a0, 24 +; RV64I-NEXT: srli a3, a3, 48 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb a3, 1(a2) +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: sb a4, 3(a2) +; RV64I-NEXT: ret +; +; RV32I-LABEL: shl_4bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: slli a3, a0, 16 +; RV32I-NEXT: srli a4, a0, 24 +; RV32I-NEXT: srli a3, a3, 16 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: sb a1, 2(a2) +; RV32I-NEXT: sb a4, 3(a2) +; RV32I-NEXT: ret + %src = load i32, ptr %src.ptr, align 1 + %byteOff = load i32, ptr %byteOff.ptr, align 1 + %bitOff = shl i32 %byteOff, 3 + %res = shl i32 %src, %bitOff + store i32 %res, ptr %dst, align 1 + ret void +} +define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: ashr_4bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a0, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: 
or a0, a0, a5 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: sraw a0, a0, a1 +; RV64I-NEXT: srliw a1, a0, 16 +; RV64I-NEXT: slli a3, a0, 48 +; RV64I-NEXT: srliw a4, a0, 24 +; RV64I-NEXT: srli a3, a3, 48 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb a3, 1(a2) +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: sb a4, 3(a2) +; RV64I-NEXT: ret +; +; RV32I-LABEL: ashr_4bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: sra a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: slli a3, a0, 16 +; RV32I-NEXT: srli a4, a0, 24 +; RV32I-NEXT: srli a3, a3, 16 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: sb a1, 2(a2) +; RV32I-NEXT: sb a4, 3(a2) +; RV32I-NEXT: ret + %src = load i32, ptr %src.ptr, align 1 + %byteOff = load i32, ptr %byteOff.ptr, align 1 + %bitOff = shl i32 %byteOff, 3 + %res = ashr i32 %src, %bitOff + store i32 %res, ptr %dst, align 1 + ret void +} + +define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: lshr_8bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: lbu a5, 0(a1) +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu t2, 2(a1) +; RV64I-NEXT: lbu t3, 3(a1) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: or t2, t3, t2 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lui a4, 16 +; RV64I-NEXT: addi a4, a4, -1 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, t2, a5 +; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: srl a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: srliw a3, a0, 16 +; RV64I-NEXT: and a5, a0, a4 +; RV64I-NEXT: srliw a6, a0, 24 +; RV64I-NEXT: srli a7, a0, 48 +; RV64I-NEXT: srli t0, a0, 56 +; RV64I-NEXT: srli a5, a5, 8 +; 
RV64I-NEXT: and a4, a1, a4 +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb a5, 1(a2) +; RV64I-NEXT: sb a3, 2(a2) +; RV64I-NEXT: sb a6, 3(a2) +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: sb a4, 5(a2) +; RV64I-NEXT: sb a7, 6(a2) +; RV64I-NEXT: sb t0, 7(a2) +; RV64I-NEXT: ret +; +; RV32I-LABEL: lshr_8bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu a0, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or t1, a0, t1 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: slli a0, a4, 16 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: slli a3, t1, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: li a4, 32 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: bltu a1, a4, .LBB3_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a5, a3, a1 +; RV32I-NEXT: bnez a1, .LBB3_3 +; RV32I-NEXT: j .LBB3_4 +; RV32I-NEXT: .LBB3_2: +; RV32I-NEXT: srl a5, a0, a1 +; RV32I-NEXT: neg a6, a1 +; RV32I-NEXT: sll a6, a3, a6 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: beqz a1, .LBB3_4 +; RV32I-NEXT: .LBB3_3: +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: .LBB3_4: +; RV32I-NEXT: bltu a1, a4, .LBB3_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: j .LBB3_7 +; RV32I-NEXT: .LBB3_6: +; RV32I-NEXT: srl a1, a3, a1 +; RV32I-NEXT: .LBB3_7: +; RV32I-NEXT: srli a3, a0, 16 +; RV32I-NEXT: lui a4, 16 +; RV32I-NEXT: srli a5, a0, 24 +; RV32I-NEXT: srli a6, a1, 16 +; RV32I-NEXT: srli a7, a1, 24 +; RV32I-NEXT: addi a4, a4, -1 +; RV32I-NEXT: and t0, a0, a4 +; RV32I-NEXT: and a4, a1, a4 +; RV32I-NEXT: srli t0, t0, 8 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: sb a1, 4(a2) +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: sb a6, 6(a2) +; RV32I-NEXT: sb a7, 7(a2) +; RV32I-NEXT: ret + %src = load i64, ptr %src.ptr, align 1 + %byteOff = load i64, ptr %byteOff.ptr, align 1 + %bitOff = shl i64 %byteOff, 3 + %res = lshr i64 %src, %bitOff + store i64 %res, ptr %dst, align 1 + ret void +} +define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: shl_8bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: lbu a5, 0(a1) +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu t2, 2(a1) +; RV64I-NEXT: lbu t3, 3(a1) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: or t2, t3, 
t2 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lui a4, 16 +; RV64I-NEXT: addi a4, a4, -1 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, t2, a5 +; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: sll a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: srliw a3, a0, 16 +; RV64I-NEXT: and a5, a0, a4 +; RV64I-NEXT: srliw a6, a0, 24 +; RV64I-NEXT: srli a7, a0, 48 +; RV64I-NEXT: srli t0, a0, 56 +; RV64I-NEXT: srli a5, a5, 8 +; RV64I-NEXT: and a4, a1, a4 +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb a5, 1(a2) +; RV64I-NEXT: sb a3, 2(a2) +; RV64I-NEXT: sb a6, 3(a2) +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: sb a4, 5(a2) +; RV64I-NEXT: sb a7, 6(a2) +; RV64I-NEXT: sb t0, 7(a2) +; RV64I-NEXT: ret +; +; RV32I-LABEL: shl_8bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu a0, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or a0, a0, t1 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: or a4, a4, a3 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or a3, a1, a6 +; RV32I-NEXT: slli a3, a3, 3 +; RV32I-NEXT: li a1, 32 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: bltu a3, a1, .LBB4_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: sll a4, a4, a3 +; RV32I-NEXT: bnez a3, .LBB4_3 +; RV32I-NEXT: j .LBB4_4 +; RV32I-NEXT: .LBB4_2: +; RV32I-NEXT: sll a1, a4, a3 +; RV32I-NEXT: neg a5, a3 +; RV32I-NEXT: srl a4, a4, a5 +; RV32I-NEXT: sll a5, a0, a3 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: beqz a3, .LBB4_4 +; RV32I-NEXT: .LBB4_3: +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: .LBB4_4: +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: lui a4, 16 +; RV32I-NEXT: srli a5, a1, 24 +; RV32I-NEXT: srli a6, a0, 16 +; RV32I-NEXT: srli a7, a0, 24 +; RV32I-NEXT: addi a4, a4, -1 +; RV32I-NEXT: and t0, a1, a4 +; RV32I-NEXT: and a4, a0, a4 +; RV32I-NEXT: srli t0, t0, 8 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: sb a6, 6(a2) +; RV32I-NEXT: sb a7, 7(a2) +; RV32I-NEXT: ret + %src = load i64, ptr %src.ptr, align 1 + %byteOff = load i64, ptr %byteOff.ptr, align 1 + %bitOff = shl i64 %byteOff, 3 + %res = shl i64 %src, %bitOff + store i64 %res, ptr %dst, align 1 + ret void +} +define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: ashr_8bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: 
lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: lbu a5, 0(a1) +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu t2, 2(a1) +; RV64I-NEXT: lbu t3, 3(a1) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: or t2, t3, t2 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lui a4, 16 +; RV64I-NEXT: addi a4, a4, -1 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, t2, a5 +; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: sra a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: srliw a3, a0, 16 +; RV64I-NEXT: and a5, a0, a4 +; RV64I-NEXT: srliw a6, a0, 24 +; RV64I-NEXT: srli a7, a0, 48 +; RV64I-NEXT: srli t0, a0, 56 +; RV64I-NEXT: srli a5, a5, 8 +; RV64I-NEXT: and a4, a1, a4 +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb a5, 1(a2) +; RV64I-NEXT: sb a3, 2(a2) +; RV64I-NEXT: sb a6, 3(a2) +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: sb a4, 5(a2) +; RV64I-NEXT: sb a7, 6(a2) +; RV64I-NEXT: sb t0, 7(a2) +; RV64I-NEXT: ret +; +; RV32I-LABEL: ashr_8bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu a0, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or t1, a0, t1 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: slli a0, a4, 16 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: slli a3, t1, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: li a4, 32 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: bltu a1, a4, .LBB5_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sra a5, a3, a1 +; RV32I-NEXT: bnez a1, .LBB5_3 +; RV32I-NEXT: j .LBB5_4 +; RV32I-NEXT: .LBB5_2: +; RV32I-NEXT: srl a5, a0, a1 +; RV32I-NEXT: neg a6, a1 +; RV32I-NEXT: sll a6, a3, a6 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: beqz a1, .LBB5_4 +; RV32I-NEXT: .LBB5_3: +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: .LBB5_4: +; RV32I-NEXT: bltu a1, a4, .LBB5_6 +; RV32I-NEXT: # %bb.5: +; RV32I-NEXT: srai a1, a3, 31 +; RV32I-NEXT: j .LBB5_7 +; RV32I-NEXT: .LBB5_6: +; RV32I-NEXT: sra a1, a3, a1 +; RV32I-NEXT: .LBB5_7: +; RV32I-NEXT: srli a3, a0, 16 +; RV32I-NEXT: lui a4, 16 +; RV32I-NEXT: srli a5, a0, 24 +; RV32I-NEXT: srli a6, a1, 16 +; RV32I-NEXT: srli a7, a1, 24 +; RV32I-NEXT: addi a4, a4, 
-1 +; RV32I-NEXT: and t0, a0, a4 +; RV32I-NEXT: and a4, a1, a4 +; RV32I-NEXT: srli t0, t0, 8 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: sb t0, 1(a2) +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: sb a1, 4(a2) +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: sb a6, 6(a2) +; RV32I-NEXT: sb a7, 7(a2) +; RV32I-NEXT: ret + %src = load i64, ptr %src.ptr, align 1 + %byteOff = load i64, ptr %byteOff.ptr, align 1 + %bitOff = shl i64 %byteOff, 3 + %res = ashr i64 %src, %bitOff + store i64 %res, ptr %dst, align 1 + ret void +} + +define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: lshr_16bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: lbu a5, 12(a0) +; RV64I-NEXT: lbu a6, 13(a0) +; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: or t2, t6, t5 +; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: lbu t4, 1(a1) +; RV64I-NEXT: lbu t5, 2(a1) +; RV64I-NEXT: lbu t6, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a0, a0, s0 +; RV64I-NEXT: or a6, t4, t3 +; RV64I-NEXT: lbu t3, 4(a1) +; RV64I-NEXT: lbu t4, 5(a1) +; RV64I-NEXT: lbu s0, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a0, a4, a3 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: li a4, 64 +; RV64I-NEXT: or a3, a6, a7 +; RV64I-NEXT: bltu a1, a4, .LBB6_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a5, a1, a4 +; RV64I-NEXT: srl a5, a3, a5 +; RV64I-NEXT: bnez a1, .LBB6_3 +; RV64I-NEXT: j .LBB6_4 +; RV64I-NEXT: .LBB6_2: +; RV64I-NEXT: srl a5, a0, a1 +; RV64I-NEXT: negw a6, a1 +; RV64I-NEXT: sll a6, a3, a6 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: beqz a1, .LBB6_4 +; RV64I-NEXT: .LBB6_3: +; RV64I-NEXT: mv a0, a5 +; RV64I-NEXT: .LBB6_4: +; RV64I-NEXT: bltu a1, a4, .LBB6_6 +; RV64I-NEXT: # %bb.5: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: j .LBB6_7 +; RV64I-NEXT: .LBB6_6: +; RV64I-NEXT: srl a1, a3, a1 +; RV64I-NEXT: .LBB6_7: +; RV64I-NEXT: srli a3, a0, 32 +; RV64I-NEXT: srliw a4, a0, 16 +; RV64I-NEXT: lui a5, 16 +; RV64I-NEXT: srliw a6, a0, 24 +; RV64I-NEXT: srli a7, a0, 48 +; 
RV64I-NEXT: srli t0, a0, 56 +; RV64I-NEXT: srli t1, a1, 32 +; RV64I-NEXT: srliw t2, a1, 16 +; RV64I-NEXT: srliw t3, a1, 24 +; RV64I-NEXT: srli t4, a1, 48 +; RV64I-NEXT: srli t5, a1, 56 +; RV64I-NEXT: addi a5, a5, -1 +; RV64I-NEXT: and t6, a0, a5 +; RV64I-NEXT: srli t6, t6, 8 +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb t6, 1(a2) +; RV64I-NEXT: sb a4, 2(a2) +; RV64I-NEXT: sb a6, 3(a2) +; RV64I-NEXT: and a0, a3, a5 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: sb a0, 5(a2) +; RV64I-NEXT: sb a7, 6(a2) +; RV64I-NEXT: sb t0, 7(a2) +; RV64I-NEXT: and a0, a1, a5 +; RV64I-NEXT: and a3, t1, a5 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a1, 8(a2) +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: sb t2, 10(a2) +; RV64I-NEXT: sb t3, 11(a2) +; RV64I-NEXT: sb t1, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: sb t4, 14(a2) +; RV64I-NEXT: sb t5, 15(a2) +; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV32I-LABEL: lshr_16bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a5, 1(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t4, 8(a0) +; RV32I-NEXT: lbu t5, 9(a0) +; RV32I-NEXT: lbu t6, 10(a0) +; RV32I-NEXT: lbu s0, 11(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t3, a7, a6 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu a7, 13(a0) +; RV32I-NEXT: lbu t2, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or t6, a7, a6 +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu s0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or t2, a0, t2 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or s1, a7, a6 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or s0, a1, s0 +; RV32I-NEXT: li a7, 32 +; RV32I-NEXT: slli a1, a5, 8 +; RV32I-NEXT: slli a0, t0, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: or a6, t5, t4 +; RV32I-NEXT: or t0, t2, t6 +; RV32I-NEXT: or a5, s0, s1 +; RV32I-NEXT: slli a5, a5, 3 +; RV32I-NEXT: srl t2, a6, a5 +; RV32I-NEXT: neg t5, a5 +; RV32I-NEXT: sll t4, t0, t5 +; RV32I-NEXT: bltu a5, a7, .LBB6_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl t6, t0, a5 +; RV32I-NEXT: j .LBB6_3 +; RV32I-NEXT: .LBB6_2: +; RV32I-NEXT: or t6, t2, t4 +; RV32I-NEXT: .LBB6_3: +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: or a3, a0, a4 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: mv a0, a6 +; RV32I-NEXT: beqz a5, .LBB6_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a0, t6 +; RV32I-NEXT: .LBB6_5: +; RV32I-NEXT: or a4, t3, a1 +; RV32I-NEXT: or a3, t1, a3 +; RV32I-NEXT: bltu a5, a7, .LBB6_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: srl t6, a3, a5 +; RV32I-NEXT: j .LBB6_8 +; RV32I-NEXT: .LBB6_7: +; RV32I-NEXT: srl a1, t0, a5 +; RV32I-NEXT: srl t1, a4, a5 +; RV32I-NEXT: sll t3, a3, t5 +; RV32I-NEXT: or t6, t1, t3 +; RV32I-NEXT: .LBB6_8: +; RV32I-NEXT: li t1, 64 +; RV32I-NEXT: mv t3, a4 +; 
RV32I-NEXT: beqz a5, .LBB6_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv t3, t6 +; RV32I-NEXT: .LBB6_10: +; RV32I-NEXT: sub s0, t1, a5 +; RV32I-NEXT: bltu a5, a7, .LBB6_13 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: bgeu s0, a7, .LBB6_14 +; RV32I-NEXT: .LBB6_12: +; RV32I-NEXT: sll t5, a6, t5 +; RV32I-NEXT: neg s1, s0 +; RV32I-NEXT: srl s1, a6, s1 +; RV32I-NEXT: or s2, s1, t4 +; RV32I-NEXT: j .LBB6_15 +; RV32I-NEXT: .LBB6_13: +; RV32I-NEXT: srl t6, a3, a5 +; RV32I-NEXT: bltu s0, a7, .LBB6_12 +; RV32I-NEXT: .LBB6_14: +; RV32I-NEXT: li t5, 0 +; RV32I-NEXT: sll s2, a6, s0 +; RV32I-NEXT: .LBB6_15: +; RV32I-NEXT: addi s1, a5, -64 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: beqz s0, .LBB6_17 +; RV32I-NEXT: # %bb.16: +; RV32I-NEXT: mv t4, s2 +; RV32I-NEXT: .LBB6_17: +; RV32I-NEXT: bltu s1, a7, .LBB6_19 +; RV32I-NEXT: # %bb.18: +; RV32I-NEXT: srl t2, t0, s1 +; RV32I-NEXT: bnez s1, .LBB6_20 +; RV32I-NEXT: j .LBB6_21 +; RV32I-NEXT: .LBB6_19: +; RV32I-NEXT: neg s0, s1 +; RV32I-NEXT: sll s0, t0, s0 +; RV32I-NEXT: or t2, t2, s0 +; RV32I-NEXT: beqz s1, .LBB6_21 +; RV32I-NEXT: .LBB6_20: +; RV32I-NEXT: mv a6, t2 +; RV32I-NEXT: .LBB6_21: +; RV32I-NEXT: bltu s1, a7, .LBB6_23 +; RV32I-NEXT: # %bb.22: +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: bltu a5, t1, .LBB6_24 +; RV32I-NEXT: j .LBB6_25 +; RV32I-NEXT: .LBB6_23: +; RV32I-NEXT: srl a7, t0, a5 +; RV32I-NEXT: bgeu a5, t1, .LBB6_25 +; RV32I-NEXT: .LBB6_24: +; RV32I-NEXT: or a6, t3, t5 +; RV32I-NEXT: or a7, t6, t4 +; RV32I-NEXT: .LBB6_25: +; RV32I-NEXT: bnez a5, .LBB6_29 +; RV32I-NEXT: # %bb.26: +; RV32I-NEXT: bltu a5, t1, .LBB6_28 +; RV32I-NEXT: .LBB6_27: +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: .LBB6_28: +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: lui a6, 16 +; RV32I-NEXT: srli a7, a4, 24 +; RV32I-NEXT: srli t0, a3, 16 +; RV32I-NEXT: srli t1, a3, 24 +; RV32I-NEXT: srli t2, a0, 16 +; RV32I-NEXT: srli t3, a0, 24 +; RV32I-NEXT: srli t4, a1, 16 +; RV32I-NEXT: srli t5, a1, 24 +; RV32I-NEXT: addi a6, a6, -1 +; RV32I-NEXT: and t6, a4, a6 +; RV32I-NEXT: srli t6, t6, 8 +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb t6, 1(a2) +; RV32I-NEXT: sb a5, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: and a4, a3, a6 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a3, 4(a2) +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: and a3, a0, a6 +; RV32I-NEXT: and a4, a1, a6 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a0, 8(a2) +; RV32I-NEXT: sb a3, 9(a2) +; RV32I-NEXT: sb t2, 10(a2) +; RV32I-NEXT: sb t3, 11(a2) +; RV32I-NEXT: sb a1, 12(a2) +; RV32I-NEXT: sb a4, 13(a2) +; RV32I-NEXT: sb t4, 14(a2) +; RV32I-NEXT: sb t5, 15(a2) +; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB6_29: +; RV32I-NEXT: mv a4, a6 +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: bgeu a5, t1, .LBB6_27 +; RV32I-NEXT: j .LBB6_28 + %src = load i128, ptr %src.ptr, align 1 + %byteOff = load i128, ptr %byteOff.ptr, align 1 + %bitOff = shl i128 %byteOff, 3 + %res = lshr i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + +define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: lshr_16bytes_wordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 
2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: lbu a5, 12(a0) +; RV64I-NEXT: lbu a6, 13(a0) +; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: or t2, t6, t5 +; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: lbu t4, 1(a1) +; RV64I-NEXT: lbu t5, 2(a1) +; RV64I-NEXT: lbu t6, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a0, a0, s0 +; RV64I-NEXT: or a6, t4, t3 +; RV64I-NEXT: lbu t3, 4(a1) +; RV64I-NEXT: lbu t4, 5(a1) +; RV64I-NEXT: lbu s0, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a0, a4, a3 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: slli a1, a1, 5 +; RV64I-NEXT: li a4, 64 +; RV64I-NEXT: or a3, a6, a7 +; RV64I-NEXT: bltu a1, a4, .LBB7_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a5, a1, a4 +; RV64I-NEXT: srl a5, a3, a5 +; RV64I-NEXT: bnez a1, .LBB7_3 +; RV64I-NEXT: j .LBB7_4 +; RV64I-NEXT: .LBB7_2: +; RV64I-NEXT: srl a5, a0, a1 +; RV64I-NEXT: negw a6, a1 +; RV64I-NEXT: sll a6, a3, a6 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: beqz a1, .LBB7_4 +; RV64I-NEXT: .LBB7_3: +; RV64I-NEXT: mv a0, a5 +; RV64I-NEXT: .LBB7_4: +; RV64I-NEXT: bltu a1, a4, .LBB7_6 +; RV64I-NEXT: # %bb.5: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: j .LBB7_7 +; RV64I-NEXT: .LBB7_6: +; RV64I-NEXT: srl a1, a3, a1 +; RV64I-NEXT: .LBB7_7: +; RV64I-NEXT: srli a3, a0, 32 +; RV64I-NEXT: srliw a4, a0, 16 +; RV64I-NEXT: lui a5, 16 +; RV64I-NEXT: srliw a6, a0, 24 +; RV64I-NEXT: srli a7, a0, 48 +; RV64I-NEXT: srli t0, a0, 56 +; RV64I-NEXT: srli t1, a1, 32 +; RV64I-NEXT: srliw t2, a1, 16 +; RV64I-NEXT: srliw t3, a1, 24 +; RV64I-NEXT: srli t4, a1, 48 +; RV64I-NEXT: srli t5, a1, 56 +; RV64I-NEXT: addi a5, a5, -1 +; RV64I-NEXT: and t6, a0, a5 +; RV64I-NEXT: srli t6, t6, 8 +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb t6, 1(a2) +; RV64I-NEXT: sb a4, 2(a2) +; RV64I-NEXT: sb a6, 3(a2) +; RV64I-NEXT: and a0, a3, a5 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: sb a0, 5(a2) +; RV64I-NEXT: sb a7, 6(a2) +; RV64I-NEXT: sb t0, 7(a2) +; RV64I-NEXT: and a0, a1, a5 +; RV64I-NEXT: and a3, t1, a5 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a1, 8(a2) +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: sb t2, 10(a2) +; RV64I-NEXT: sb t3, 11(a2) +; RV64I-NEXT: sb t1, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: 
sb t4, 14(a2) +; RV64I-NEXT: sb t5, 15(a2) +; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV32I-LABEL: lshr_16bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a5, 1(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t4, 8(a0) +; RV32I-NEXT: lbu t5, 9(a0) +; RV32I-NEXT: lbu t6, 10(a0) +; RV32I-NEXT: lbu s0, 11(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t3, a7, a6 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu a7, 13(a0) +; RV32I-NEXT: lbu t2, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or t6, a7, a6 +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu s0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or t2, a0, t2 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or s1, a7, a6 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or s0, a1, s0 +; RV32I-NEXT: li a7, 32 +; RV32I-NEXT: slli a1, a5, 8 +; RV32I-NEXT: slli a0, t0, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: or a6, t5, t4 +; RV32I-NEXT: or t0, t2, t6 +; RV32I-NEXT: or a5, s0, s1 +; RV32I-NEXT: slli a5, a5, 5 +; RV32I-NEXT: srl t2, a6, a5 +; RV32I-NEXT: neg t5, a5 +; RV32I-NEXT: sll t4, t0, t5 +; RV32I-NEXT: bltu a5, a7, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl t6, t0, a5 +; RV32I-NEXT: j .LBB7_3 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: or t6, t2, t4 +; RV32I-NEXT: .LBB7_3: +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: or a3, a0, a4 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: mv a0, a6 +; RV32I-NEXT: beqz a5, .LBB7_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a0, t6 +; RV32I-NEXT: .LBB7_5: +; RV32I-NEXT: or a4, t3, a1 +; RV32I-NEXT: or a3, t1, a3 +; RV32I-NEXT: bltu a5, a7, .LBB7_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: srl t6, a3, a5 +; RV32I-NEXT: j .LBB7_8 +; RV32I-NEXT: .LBB7_7: +; RV32I-NEXT: srl a1, t0, a5 +; RV32I-NEXT: srl t1, a4, a5 +; RV32I-NEXT: sll t3, a3, t5 +; RV32I-NEXT: or t6, t1, t3 +; RV32I-NEXT: .LBB7_8: +; RV32I-NEXT: li t1, 64 +; RV32I-NEXT: mv t3, a4 +; RV32I-NEXT: beqz a5, .LBB7_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv t3, t6 +; RV32I-NEXT: .LBB7_10: +; RV32I-NEXT: sub s0, t1, a5 +; RV32I-NEXT: bltu a5, a7, .LBB7_13 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: bgeu s0, a7, .LBB7_14 +; RV32I-NEXT: .LBB7_12: +; RV32I-NEXT: sll t5, a6, t5 +; RV32I-NEXT: neg s1, s0 +; RV32I-NEXT: srl s1, a6, s1 +; RV32I-NEXT: or s2, s1, t4 +; RV32I-NEXT: j .LBB7_15 +; RV32I-NEXT: .LBB7_13: +; RV32I-NEXT: srl t6, a3, a5 +; RV32I-NEXT: bltu s0, a7, .LBB7_12 +; RV32I-NEXT: .LBB7_14: +; RV32I-NEXT: li t5, 0 +; RV32I-NEXT: sll s2, a6, s0 +; RV32I-NEXT: .LBB7_15: +; RV32I-NEXT: addi s1, a5, -64 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: beqz s0, .LBB7_17 +; RV32I-NEXT: # %bb.16: +; RV32I-NEXT: mv t4, s2 +; RV32I-NEXT: .LBB7_17: +; RV32I-NEXT: bltu s1, a7, .LBB7_19 +; RV32I-NEXT: # %bb.18: +; RV32I-NEXT: srl t2, 
t0, s1 +; RV32I-NEXT: bnez s1, .LBB7_20 +; RV32I-NEXT: j .LBB7_21 +; RV32I-NEXT: .LBB7_19: +; RV32I-NEXT: neg s0, s1 +; RV32I-NEXT: sll s0, t0, s0 +; RV32I-NEXT: or t2, t2, s0 +; RV32I-NEXT: beqz s1, .LBB7_21 +; RV32I-NEXT: .LBB7_20: +; RV32I-NEXT: mv a6, t2 +; RV32I-NEXT: .LBB7_21: +; RV32I-NEXT: bltu s1, a7, .LBB7_23 +; RV32I-NEXT: # %bb.22: +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: bltu a5, t1, .LBB7_24 +; RV32I-NEXT: j .LBB7_25 +; RV32I-NEXT: .LBB7_23: +; RV32I-NEXT: srl a7, t0, a5 +; RV32I-NEXT: bgeu a5, t1, .LBB7_25 +; RV32I-NEXT: .LBB7_24: +; RV32I-NEXT: or a6, t3, t5 +; RV32I-NEXT: or a7, t6, t4 +; RV32I-NEXT: .LBB7_25: +; RV32I-NEXT: bnez a5, .LBB7_29 +; RV32I-NEXT: # %bb.26: +; RV32I-NEXT: bltu a5, t1, .LBB7_28 +; RV32I-NEXT: .LBB7_27: +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: .LBB7_28: +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: lui a6, 16 +; RV32I-NEXT: srli a7, a4, 24 +; RV32I-NEXT: srli t0, a3, 16 +; RV32I-NEXT: srli t1, a3, 24 +; RV32I-NEXT: srli t2, a0, 16 +; RV32I-NEXT: srli t3, a0, 24 +; RV32I-NEXT: srli t4, a1, 16 +; RV32I-NEXT: srli t5, a1, 24 +; RV32I-NEXT: addi a6, a6, -1 +; RV32I-NEXT: and t6, a4, a6 +; RV32I-NEXT: srli t6, t6, 8 +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb t6, 1(a2) +; RV32I-NEXT: sb a5, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: and a4, a3, a6 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a3, 4(a2) +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: and a3, a0, a6 +; RV32I-NEXT: and a4, a1, a6 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a0, 8(a2) +; RV32I-NEXT: sb a3, 9(a2) +; RV32I-NEXT: sb t2, 10(a2) +; RV32I-NEXT: sb t3, 11(a2) +; RV32I-NEXT: sb a1, 12(a2) +; RV32I-NEXT: sb a4, 13(a2) +; RV32I-NEXT: sb t4, 14(a2) +; RV32I-NEXT: sb t5, 15(a2) +; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB7_29: +; RV32I-NEXT: mv a4, a6 +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: bgeu a5, t1, .LBB7_27 +; RV32I-NEXT: j .LBB7_28 + %src = load i128, ptr %src.ptr, align 1 + %wordOff = load i128, ptr %wordOff.ptr, align 1 + %bitOff = shl i128 %wordOff, 5 + %res = lshr i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + +define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: shl_16bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: lbu a5, 12(a0) +; RV64I-NEXT: lbu a6, 13(a0) +; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: or t2, t6, t5 +; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: lbu t4, 1(a1) +; RV64I-NEXT: lbu t5, 2(a1) +; RV64I-NEXT: lbu t6, 3(a1) +; RV64I-NEXT: slli a6, a6, 
8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a0, a0, s0 +; RV64I-NEXT: or a6, t4, t3 +; RV64I-NEXT: lbu t3, 4(a1) +; RV64I-NEXT: lbu t4, 5(a1) +; RV64I-NEXT: lbu s0, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a4, a4, a3 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: slli a3, a1, 3 +; RV64I-NEXT: li a5, 64 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: bltu a3, a5, .LBB8_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: sub a5, a3, a5 +; RV64I-NEXT: sll a4, a4, a5 +; RV64I-NEXT: bnez a3, .LBB8_3 +; RV64I-NEXT: j .LBB8_4 +; RV64I-NEXT: .LBB8_2: +; RV64I-NEXT: sll a1, a4, a3 +; RV64I-NEXT: negw a5, a3 +; RV64I-NEXT: srl a4, a4, a5 +; RV64I-NEXT: sll a5, a0, a3 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: beqz a3, .LBB8_4 +; RV64I-NEXT: .LBB8_3: +; RV64I-NEXT: mv a0, a4 +; RV64I-NEXT: .LBB8_4: +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: srliw a4, a1, 16 +; RV64I-NEXT: lui a5, 16 +; RV64I-NEXT: srliw a6, a1, 24 +; RV64I-NEXT: srli a7, a1, 48 +; RV64I-NEXT: srli t0, a1, 56 +; RV64I-NEXT: srli t1, a0, 32 +; RV64I-NEXT: srliw t2, a0, 16 +; RV64I-NEXT: srliw t3, a0, 24 +; RV64I-NEXT: srli t4, a0, 48 +; RV64I-NEXT: srli t5, a0, 56 +; RV64I-NEXT: addi a5, a5, -1 +; RV64I-NEXT: and t6, a1, a5 +; RV64I-NEXT: srli t6, t6, 8 +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: sb t6, 1(a2) +; RV64I-NEXT: sb a4, 2(a2) +; RV64I-NEXT: sb a6, 3(a2) +; RV64I-NEXT: and a1, a3, a5 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: sb a7, 6(a2) +; RV64I-NEXT: sb t0, 7(a2) +; RV64I-NEXT: and a1, a0, a5 +; RV64I-NEXT: and a3, t1, a5 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb t2, 10(a2) +; RV64I-NEXT: sb t3, 11(a2) +; RV64I-NEXT: sb t1, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: sb t4, 14(a2) +; RV64I-NEXT: sb t5, 15(a2) +; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV32I-LABEL: shl_16bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: lbu t1, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, a6 +; RV32I-NEXT: li a6, 64 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, t1 +; RV32I-NEXT: li t1, 32 +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli t2, t0, 16 +; 
RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or t0, a4, a3 +; RV32I-NEXT: or a4, t2, a5 +; RV32I-NEXT: or a5, a1, a7 +; RV32I-NEXT: slli a5, a5, 3 +; RV32I-NEXT: neg t3, a5 +; RV32I-NEXT: srl t4, t0, t3 +; RV32I-NEXT: sll t2, a4, a5 +; RV32I-NEXT: bltu a5, t1, .LBB8_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: sll t5, t0, a5 +; RV32I-NEXT: j .LBB8_3 +; RV32I-NEXT: .LBB8_2: +; RV32I-NEXT: sll a1, t0, a5 +; RV32I-NEXT: or t5, t4, t2 +; RV32I-NEXT: .LBB8_3: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s3, 11(a0) +; RV32I-NEXT: lbu s1, 15(a0) +; RV32I-NEXT: sub a7, a6, a5 +; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: beqz a5, .LBB8_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a3, t5 +; RV32I-NEXT: .LBB8_5: +; RV32I-NEXT: lbu s2, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu s0, 13(a0) +; RV32I-NEXT: lbu t6, 14(a0) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: bltu a7, t1, .LBB8_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srl s4, a4, a7 +; RV32I-NEXT: j .LBB8_8 +; RV32I-NEXT: .LBB8_7: +; RV32I-NEXT: neg s4, a7 +; RV32I-NEXT: sll s4, a4, s4 +; RV32I-NEXT: or s4, t4, s4 +; RV32I-NEXT: .LBB8_8: +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: lbu s6, 8(a0) +; RV32I-NEXT: lbu s5, 12(a0) +; RV32I-NEXT: or s3, s3, t5 +; RV32I-NEXT: slli t5, s0, 8 +; RV32I-NEXT: or s1, s1, t6 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: beqz a7, .LBB8_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv t4, s4 +; RV32I-NEXT: .LBB8_10: +; RV32I-NEXT: or a0, s2, s6 +; RV32I-NEXT: slli s0, s3, 16 +; RV32I-NEXT: or t6, t5, s5 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: bltu a7, t1, .LBB8_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li t5, 0 +; RV32I-NEXT: j .LBB8_13 +; RV32I-NEXT: .LBB8_12: +; RV32I-NEXT: srl t5, a4, t3 +; RV32I-NEXT: .LBB8_13: +; RV32I-NEXT: or a7, s0, a0 +; RV32I-NEXT: or a0, s1, t6 +; RV32I-NEXT: bltu a5, t1, .LBB8_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: sll s1, a7, a5 +; RV32I-NEXT: j .LBB8_16 +; RV32I-NEXT: .LBB8_15: +; RV32I-NEXT: sll t6, a7, a5 +; RV32I-NEXT: srl t3, a7, t3 +; RV32I-NEXT: sll s0, a0, a5 +; RV32I-NEXT: or s1, t3, s0 +; RV32I-NEXT: .LBB8_16: +; RV32I-NEXT: addi s0, a5, -64 +; RV32I-NEXT: mv t3, a0 +; RV32I-NEXT: beqz a5, .LBB8_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: mv t3, s1 +; RV32I-NEXT: .LBB8_18: +; RV32I-NEXT: bltu s0, t1, .LBB8_20 +; RV32I-NEXT: # %bb.19: +; RV32I-NEXT: li t1, 0 +; RV32I-NEXT: sll t0, t0, s0 +; RV32I-NEXT: bnez s0, .LBB8_21 +; RV32I-NEXT: j .LBB8_22 +; RV32I-NEXT: .LBB8_20: +; RV32I-NEXT: sll t1, t0, a5 +; RV32I-NEXT: neg s1, s0 +; RV32I-NEXT: srl t0, t0, s1 +; RV32I-NEXT: or t0, t0, t2 +; RV32I-NEXT: beqz s0, .LBB8_22 +; RV32I-NEXT: .LBB8_21: +; RV32I-NEXT: mv a4, t0 +; RV32I-NEXT: .LBB8_22: +; RV32I-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: bltu a5, a6, .LBB8_24 +; RV32I-NEXT: # %bb.23: +; 
RV32I-NEXT: li a1, 0 +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: bnez a5, .LBB8_25 +; RV32I-NEXT: j .LBB8_26 +; RV32I-NEXT: .LBB8_24: +; RV32I-NEXT: or t1, t4, t6 +; RV32I-NEXT: or a4, t5, t3 +; RV32I-NEXT: beqz a5, .LBB8_26 +; RV32I-NEXT: .LBB8_25: +; RV32I-NEXT: mv a7, t1 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: .LBB8_26: +; RV32I-NEXT: srli a4, a1, 16 +; RV32I-NEXT: lui a5, 16 +; RV32I-NEXT: srli a6, a1, 24 +; RV32I-NEXT: srli t0, a3, 16 +; RV32I-NEXT: srli t1, a3, 24 +; RV32I-NEXT: srli t2, a7, 16 +; RV32I-NEXT: srli t3, a7, 24 +; RV32I-NEXT: srli t4, a0, 16 +; RV32I-NEXT: srli t5, a0, 24 +; RV32I-NEXT: addi a5, a5, -1 +; RV32I-NEXT: and t6, a1, a5 +; RV32I-NEXT: srli t6, t6, 8 +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb t6, 1(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb a6, 3(a2) +; RV32I-NEXT: and a1, a3, a5 +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a3, 4(a2) +; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: and a1, a7, a5 +; RV32I-NEXT: and a5, a0, a5 +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a7, 8(a2) +; RV32I-NEXT: sb a1, 9(a2) +; RV32I-NEXT: sb t2, 10(a2) +; RV32I-NEXT: sb t3, 11(a2) +; RV32I-NEXT: sb a0, 12(a2) +; RV32I-NEXT: sb a5, 13(a2) +; RV32I-NEXT: sb t4, 14(a2) +; RV32I-NEXT: sb t5, 15(a2) +; RV32I-NEXT: ret + %src = load i128, ptr %src.ptr, align 1 + %byteOff = load i128, ptr %byteOff.ptr, align 1 + %bitOff = shl i128 %byteOff, 3 + %res = shl i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + +define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: shl_16bytes_wordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: lbu a5, 12(a0) +; RV64I-NEXT: lbu a6, 13(a0) +; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: or t2, t6, t5 +; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: lbu t4, 1(a1) +; RV64I-NEXT: lbu t5, 2(a1) +; RV64I-NEXT: lbu t6, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a0, a0, s0 +; RV64I-NEXT: or a6, t4, t3 +; RV64I-NEXT: lbu t3, 4(a1) +; RV64I-NEXT: lbu t4, 5(a1) +; RV64I-NEXT: lbu s0, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a1, a1, t3 +; 
RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a4, a4, a3 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: slli a3, a1, 5 +; RV64I-NEXT: li a5, 64 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: bltu a3, a5, .LBB9_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: sub a5, a3, a5 +; RV64I-NEXT: sll a4, a4, a5 +; RV64I-NEXT: bnez a3, .LBB9_3 +; RV64I-NEXT: j .LBB9_4 +; RV64I-NEXT: .LBB9_2: +; RV64I-NEXT: sll a1, a4, a3 +; RV64I-NEXT: negw a5, a3 +; RV64I-NEXT: srl a4, a4, a5 +; RV64I-NEXT: sll a5, a0, a3 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: beqz a3, .LBB9_4 +; RV64I-NEXT: .LBB9_3: +; RV64I-NEXT: mv a0, a4 +; RV64I-NEXT: .LBB9_4: +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: srliw a4, a1, 16 +; RV64I-NEXT: lui a5, 16 +; RV64I-NEXT: srliw a6, a1, 24 +; RV64I-NEXT: srli a7, a1, 48 +; RV64I-NEXT: srli t0, a1, 56 +; RV64I-NEXT: srli t1, a0, 32 +; RV64I-NEXT: srliw t2, a0, 16 +; RV64I-NEXT: srliw t3, a0, 24 +; RV64I-NEXT: srli t4, a0, 48 +; RV64I-NEXT: srli t5, a0, 56 +; RV64I-NEXT: addi a5, a5, -1 +; RV64I-NEXT: and t6, a1, a5 +; RV64I-NEXT: srli t6, t6, 8 +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: sb t6, 1(a2) +; RV64I-NEXT: sb a4, 2(a2) +; RV64I-NEXT: sb a6, 3(a2) +; RV64I-NEXT: and a1, a3, a5 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: sb a7, 6(a2) +; RV64I-NEXT: sb t0, 7(a2) +; RV64I-NEXT: and a1, a0, a5 +; RV64I-NEXT: and a3, t1, a5 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb t2, 10(a2) +; RV64I-NEXT: sb t3, 11(a2) +; RV64I-NEXT: sb t1, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: sb t4, 14(a2) +; RV64I-NEXT: sb t5, 15(a2) +; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV32I-LABEL: shl_16bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: lbu t1, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, a6 +; RV32I-NEXT: li a6, 64 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, t1 +; RV32I-NEXT: li t1, 32 +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli t2, t0, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or t0, a4, a3 +; RV32I-NEXT: or a4, t2, a5 +; RV32I-NEXT: or a5, a1, a7 +; RV32I-NEXT: slli a5, a5, 5 +; RV32I-NEXT: neg t3, a5 +; RV32I-NEXT: srl t4, t0, t3 +; RV32I-NEXT: sll t2, a4, a5 +; RV32I-NEXT: bltu a5, t1, .LBB9_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: sll t5, t0, a5 +; RV32I-NEXT: j .LBB9_3 +; RV32I-NEXT: .LBB9_2: +; RV32I-NEXT: sll a1, t0, a5 +; RV32I-NEXT: or t5, t4, t2 +; RV32I-NEXT: .LBB9_3: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 8(sp) # 4-byte 
Folded Spill +; RV32I-NEXT: sw s6, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s3, 11(a0) +; RV32I-NEXT: lbu s1, 15(a0) +; RV32I-NEXT: sub a7, a6, a5 +; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: beqz a5, .LBB9_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a3, t5 +; RV32I-NEXT: .LBB9_5: +; RV32I-NEXT: lbu s2, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu s0, 13(a0) +; RV32I-NEXT: lbu t6, 14(a0) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: bltu a7, t1, .LBB9_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srl s4, a4, a7 +; RV32I-NEXT: j .LBB9_8 +; RV32I-NEXT: .LBB9_7: +; RV32I-NEXT: neg s4, a7 +; RV32I-NEXT: sll s4, a4, s4 +; RV32I-NEXT: or s4, t4, s4 +; RV32I-NEXT: .LBB9_8: +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: lbu s6, 8(a0) +; RV32I-NEXT: lbu s5, 12(a0) +; RV32I-NEXT: or s3, s3, t5 +; RV32I-NEXT: slli t5, s0, 8 +; RV32I-NEXT: or s1, s1, t6 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: beqz a7, .LBB9_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv t4, s4 +; RV32I-NEXT: .LBB9_10: +; RV32I-NEXT: or a0, s2, s6 +; RV32I-NEXT: slli s0, s3, 16 +; RV32I-NEXT: or t6, t5, s5 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: bltu a7, t1, .LBB9_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li t5, 0 +; RV32I-NEXT: j .LBB9_13 +; RV32I-NEXT: .LBB9_12: +; RV32I-NEXT: srl t5, a4, t3 +; RV32I-NEXT: .LBB9_13: +; RV32I-NEXT: or a7, s0, a0 +; RV32I-NEXT: or a0, s1, t6 +; RV32I-NEXT: bltu a5, t1, .LBB9_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: sll s1, a7, a5 +; RV32I-NEXT: j .LBB9_16 +; RV32I-NEXT: .LBB9_15: +; RV32I-NEXT: sll t6, a7, a5 +; RV32I-NEXT: srl t3, a7, t3 +; RV32I-NEXT: sll s0, a0, a5 +; RV32I-NEXT: or s1, t3, s0 +; RV32I-NEXT: .LBB9_16: +; RV32I-NEXT: addi s0, a5, -64 +; RV32I-NEXT: mv t3, a0 +; RV32I-NEXT: beqz a5, .LBB9_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: mv t3, s1 +; RV32I-NEXT: .LBB9_18: +; RV32I-NEXT: bltu s0, t1, .LBB9_20 +; RV32I-NEXT: # %bb.19: +; RV32I-NEXT: li t1, 0 +; RV32I-NEXT: sll t0, t0, s0 +; RV32I-NEXT: bnez s0, .LBB9_21 +; RV32I-NEXT: j .LBB9_22 +; RV32I-NEXT: .LBB9_20: +; RV32I-NEXT: sll t1, t0, a5 +; RV32I-NEXT: neg s1, s0 +; RV32I-NEXT: srl t0, t0, s1 +; RV32I-NEXT: or t0, t0, t2 +; RV32I-NEXT: beqz s0, .LBB9_22 +; RV32I-NEXT: .LBB9_21: +; RV32I-NEXT: mv a4, t0 +; RV32I-NEXT: .LBB9_22: +; RV32I-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: bltu a5, a6, .LBB9_24 +; RV32I-NEXT: # %bb.23: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: bnez a5, .LBB9_25 +; RV32I-NEXT: j .LBB9_26 +; RV32I-NEXT: .LBB9_24: +; RV32I-NEXT: or t1, t4, t6 +; RV32I-NEXT: or a4, t5, t3 +; RV32I-NEXT: beqz a5, .LBB9_26 +; RV32I-NEXT: .LBB9_25: +; RV32I-NEXT: mv a7, t1 +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: .LBB9_26: +; RV32I-NEXT: srli a4, a1, 16 +; RV32I-NEXT: lui a5, 16 +; RV32I-NEXT: srli a6, a1, 24 +; RV32I-NEXT: srli t0, a3, 16 +; RV32I-NEXT: srli t1, a3, 24 +; RV32I-NEXT: srli t2, a7, 16 +; RV32I-NEXT: srli t3, a7, 24 +; RV32I-NEXT: srli t4, a0, 16 +; RV32I-NEXT: srli t5, a0, 24 +; RV32I-NEXT: addi a5, a5, -1 +; RV32I-NEXT: and t6, a1, a5 +; RV32I-NEXT: srli t6, t6, 8 +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb t6, 1(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb a6, 3(a2) +; 
RV32I-NEXT: and a1, a3, a5 +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a3, 4(a2) +; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: and a1, a7, a5 +; RV32I-NEXT: and a5, a0, a5 +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a7, 8(a2) +; RV32I-NEXT: sb a1, 9(a2) +; RV32I-NEXT: sb t2, 10(a2) +; RV32I-NEXT: sb t3, 11(a2) +; RV32I-NEXT: sb a0, 12(a2) +; RV32I-NEXT: sb a5, 13(a2) +; RV32I-NEXT: sb t4, 14(a2) +; RV32I-NEXT: sb t5, 15(a2) +; RV32I-NEXT: ret + %src = load i128, ptr %src.ptr, align 1 + %wordOff = load i128, ptr %wordOff.ptr, align 1 + %bitOff = shl i128 %wordOff, 5 + %res = shl i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + + +define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: ashr_16bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: lbu a5, 12(a0) +; RV64I-NEXT: lbu a6, 13(a0) +; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: or t2, t6, t5 +; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: lbu t4, 1(a1) +; RV64I-NEXT: lbu t5, 2(a1) +; RV64I-NEXT: lbu t6, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a0, a0, s0 +; RV64I-NEXT: or a6, t4, t3 +; RV64I-NEXT: lbu t3, 4(a1) +; RV64I-NEXT: lbu t4, 5(a1) +; RV64I-NEXT: lbu s0, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a0, a4, a3 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: li a4, 64 +; RV64I-NEXT: or a3, a6, a7 +; RV64I-NEXT: bltu a1, a4, .LBB10_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a5, a1, a4 +; RV64I-NEXT: sra a5, a3, a5 +; RV64I-NEXT: bnez a1, .LBB10_3 +; RV64I-NEXT: j .LBB10_4 +; RV64I-NEXT: .LBB10_2: +; RV64I-NEXT: srl a5, a0, a1 +; RV64I-NEXT: negw a6, a1 +; RV64I-NEXT: sll a6, a3, a6 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: beqz a1, .LBB10_4 +; RV64I-NEXT: .LBB10_3: +; RV64I-NEXT: mv a0, a5 +; RV64I-NEXT: .LBB10_4: +; RV64I-NEXT: bltu a1, a4, .LBB10_6 +; RV64I-NEXT: # %bb.5: +; RV64I-NEXT: srai a1, a3, 63 +; RV64I-NEXT: j .LBB10_7 +; RV64I-NEXT: .LBB10_6: +; RV64I-NEXT: 
sra a1, a3, a1 +; RV64I-NEXT: .LBB10_7: +; RV64I-NEXT: srli a3, a0, 32 +; RV64I-NEXT: srliw a4, a0, 16 +; RV64I-NEXT: lui a5, 16 +; RV64I-NEXT: srliw a6, a0, 24 +; RV64I-NEXT: srli a7, a0, 48 +; RV64I-NEXT: srli t0, a0, 56 +; RV64I-NEXT: srli t1, a1, 32 +; RV64I-NEXT: srliw t2, a1, 16 +; RV64I-NEXT: srliw t3, a1, 24 +; RV64I-NEXT: srli t4, a1, 48 +; RV64I-NEXT: srli t5, a1, 56 +; RV64I-NEXT: addi a5, a5, -1 +; RV64I-NEXT: and t6, a0, a5 +; RV64I-NEXT: srli t6, t6, 8 +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb t6, 1(a2) +; RV64I-NEXT: sb a4, 2(a2) +; RV64I-NEXT: sb a6, 3(a2) +; RV64I-NEXT: and a0, a3, a5 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: sb a0, 5(a2) +; RV64I-NEXT: sb a7, 6(a2) +; RV64I-NEXT: sb t0, 7(a2) +; RV64I-NEXT: and a0, a1, a5 +; RV64I-NEXT: and a3, t1, a5 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a1, 8(a2) +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: sb t2, 10(a2) +; RV64I-NEXT: sb t3, 11(a2) +; RV64I-NEXT: sb t1, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: sb t4, 14(a2) +; RV64I-NEXT: sb t5, 15(a2) +; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV32I-LABEL: ashr_16bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a5, 1(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu t2, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu t1, 7(a0) +; RV32I-NEXT: lbu t4, 8(a0) +; RV32I-NEXT: lbu t5, 9(a0) +; RV32I-NEXT: lbu t6, 10(a0) +; RV32I-NEXT: lbu s0, 11(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t3, a7, a6 +; RV32I-NEXT: or t1, t1, t0 +; RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu a7, 13(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 0(a1) +; RV32I-NEXT: lbu t6, 1(a1) +; RV32I-NEXT: lbu s0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or s1, a0, t0 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t6, t6, a7 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or s0, a1, s0 +; RV32I-NEXT: li t0, 32 +; RV32I-NEXT: slli a1, a5, 8 +; RV32I-NEXT: slli a0, t2, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: or a7, t5, t4 +; RV32I-NEXT: or a5, s1, a6 +; RV32I-NEXT: or a6, s0, t6 +; RV32I-NEXT: slli a6, a6, 3 +; RV32I-NEXT: srl t2, a7, a6 +; RV32I-NEXT: neg t6, a6 +; RV32I-NEXT: sll t4, a5, t6 +; RV32I-NEXT: bltu a6, t0, .LBB10_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sra t5, a5, a6 +; RV32I-NEXT: j .LBB10_3 +; RV32I-NEXT: .LBB10_2: +; RV32I-NEXT: or t5, t2, t4 +; RV32I-NEXT: .LBB10_3: +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: or a3, a0, a4 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: mv a0, a7 +; RV32I-NEXT: beqz a6, .LBB10_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a0, t5 +; RV32I-NEXT: .LBB10_5: +; RV32I-NEXT: or a4, t3, a1 +; RV32I-NEXT: or a3, t1, a3 +; RV32I-NEXT: bltu a6, t0, .LBB10_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srai a1, a5, 31 +; RV32I-NEXT: srl t5, a3, a6 +; RV32I-NEXT: j .LBB10_8 +; RV32I-NEXT: 
.LBB10_7: +; RV32I-NEXT: sra a1, a5, a6 +; RV32I-NEXT: srl t1, a4, a6 +; RV32I-NEXT: sll t3, a3, t6 +; RV32I-NEXT: or t5, t1, t3 +; RV32I-NEXT: .LBB10_8: +; RV32I-NEXT: li t1, 64 +; RV32I-NEXT: mv t3, a4 +; RV32I-NEXT: beqz a6, .LBB10_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv t3, t5 +; RV32I-NEXT: .LBB10_10: +; RV32I-NEXT: sub s0, t1, a6 +; RV32I-NEXT: bltu a6, t0, .LBB10_13 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li t5, 0 +; RV32I-NEXT: bgeu s0, t0, .LBB10_14 +; RV32I-NEXT: .LBB10_12: +; RV32I-NEXT: sll t6, a7, t6 +; RV32I-NEXT: neg s1, s0 +; RV32I-NEXT: srl s1, a7, s1 +; RV32I-NEXT: or s2, s1, t4 +; RV32I-NEXT: j .LBB10_15 +; RV32I-NEXT: .LBB10_13: +; RV32I-NEXT: srl t5, a3, a6 +; RV32I-NEXT: bltu s0, t0, .LBB10_12 +; RV32I-NEXT: .LBB10_14: +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: sll s2, a7, s0 +; RV32I-NEXT: .LBB10_15: +; RV32I-NEXT: addi s1, a6, -64 +; RV32I-NEXT: mv t4, a5 +; RV32I-NEXT: beqz s0, .LBB10_17 +; RV32I-NEXT: # %bb.16: +; RV32I-NEXT: mv t4, s2 +; RV32I-NEXT: .LBB10_17: +; RV32I-NEXT: bltu s1, t0, .LBB10_19 +; RV32I-NEXT: # %bb.18: +; RV32I-NEXT: sra t2, a5, s1 +; RV32I-NEXT: bnez s1, .LBB10_20 +; RV32I-NEXT: j .LBB10_21 +; RV32I-NEXT: .LBB10_19: +; RV32I-NEXT: neg s0, s1 +; RV32I-NEXT: sll s0, a5, s0 +; RV32I-NEXT: or t2, t2, s0 +; RV32I-NEXT: beqz s1, .LBB10_21 +; RV32I-NEXT: .LBB10_20: +; RV32I-NEXT: mv a7, t2 +; RV32I-NEXT: .LBB10_21: +; RV32I-NEXT: bltu s1, t0, .LBB10_23 +; RV32I-NEXT: # %bb.22: +; RV32I-NEXT: srai t0, a5, 31 +; RV32I-NEXT: bltu a6, t1, .LBB10_24 +; RV32I-NEXT: j .LBB10_25 +; RV32I-NEXT: .LBB10_23: +; RV32I-NEXT: sra t0, a5, a6 +; RV32I-NEXT: bgeu a6, t1, .LBB10_25 +; RV32I-NEXT: .LBB10_24: +; RV32I-NEXT: or a7, t3, t6 +; RV32I-NEXT: or t0, t5, t4 +; RV32I-NEXT: .LBB10_25: +; RV32I-NEXT: bnez a6, .LBB10_29 +; RV32I-NEXT: # %bb.26: +; RV32I-NEXT: bltu a6, t1, .LBB10_28 +; RV32I-NEXT: .LBB10_27: +; RV32I-NEXT: srai a0, a5, 31 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB10_28: +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: lui a6, 16 +; RV32I-NEXT: srli a7, a4, 24 +; RV32I-NEXT: srli t0, a3, 16 +; RV32I-NEXT: srli t1, a3, 24 +; RV32I-NEXT: srli t2, a0, 16 +; RV32I-NEXT: srli t3, a0, 24 +; RV32I-NEXT: srli t4, a1, 16 +; RV32I-NEXT: srli t5, a1, 24 +; RV32I-NEXT: addi a6, a6, -1 +; RV32I-NEXT: and t6, a4, a6 +; RV32I-NEXT: srli t6, t6, 8 +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb t6, 1(a2) +; RV32I-NEXT: sb a5, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: and a4, a3, a6 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a3, 4(a2) +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: and a3, a0, a6 +; RV32I-NEXT: and a4, a1, a6 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a0, 8(a2) +; RV32I-NEXT: sb a3, 9(a2) +; RV32I-NEXT: sb t2, 10(a2) +; RV32I-NEXT: sb t3, 11(a2) +; RV32I-NEXT: sb a1, 12(a2) +; RV32I-NEXT: sb a4, 13(a2) +; RV32I-NEXT: sb t4, 14(a2) +; RV32I-NEXT: sb t5, 15(a2) +; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB10_29: +; RV32I-NEXT: mv a4, a7 +; RV32I-NEXT: mv a3, t0 +; RV32I-NEXT: bgeu a6, t1, .LBB10_27 +; RV32I-NEXT: j .LBB10_28 + %src = load i128, ptr %src.ptr, align 1 + %byteOff = load i128, ptr %byteOff.ptr, align 1 + %bitOff = shl i128 %byteOff, 3 + %res = ashr i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + +define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr 
%wordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: ashr_16bytes_wordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: lbu a5, 12(a0) +; RV64I-NEXT: lbu a6, 13(a0) +; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: or t2, t6, t5 +; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: lbu t4, 1(a1) +; RV64I-NEXT: lbu t5, 2(a1) +; RV64I-NEXT: lbu t6, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a0, a0, s0 +; RV64I-NEXT: or a6, t4, t3 +; RV64I-NEXT: lbu t3, 4(a1) +; RV64I-NEXT: lbu t4, 5(a1) +; RV64I-NEXT: lbu s0, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a0, a4, a3 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: slli a1, a1, 5 +; RV64I-NEXT: li a4, 64 +; RV64I-NEXT: or a3, a6, a7 +; RV64I-NEXT: bltu a1, a4, .LBB11_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a5, a1, a4 +; RV64I-NEXT: sra a5, a3, a5 +; RV64I-NEXT: bnez a1, .LBB11_3 +; RV64I-NEXT: j .LBB11_4 +; RV64I-NEXT: .LBB11_2: +; RV64I-NEXT: srl a5, a0, a1 +; RV64I-NEXT: negw a6, a1 +; RV64I-NEXT: sll a6, a3, a6 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: beqz a1, .LBB11_4 +; RV64I-NEXT: .LBB11_3: +; RV64I-NEXT: mv a0, a5 +; RV64I-NEXT: .LBB11_4: +; RV64I-NEXT: bltu a1, a4, .LBB11_6 +; RV64I-NEXT: # %bb.5: +; RV64I-NEXT: srai a1, a3, 63 +; RV64I-NEXT: j .LBB11_7 +; RV64I-NEXT: .LBB11_6: +; RV64I-NEXT: sra a1, a3, a1 +; RV64I-NEXT: .LBB11_7: +; RV64I-NEXT: srli a3, a0, 32 +; RV64I-NEXT: srliw a4, a0, 16 +; RV64I-NEXT: lui a5, 16 +; RV64I-NEXT: srliw a6, a0, 24 +; RV64I-NEXT: srli a7, a0, 48 +; RV64I-NEXT: srli t0, a0, 56 +; RV64I-NEXT: srli t1, a1, 32 +; RV64I-NEXT: srliw t2, a1, 16 +; RV64I-NEXT: srliw t3, a1, 24 +; RV64I-NEXT: srli t4, a1, 48 +; RV64I-NEXT: srli t5, a1, 56 +; RV64I-NEXT: addi a5, a5, -1 +; RV64I-NEXT: and t6, a0, a5 +; RV64I-NEXT: srli t6, t6, 8 +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb t6, 1(a2) +; RV64I-NEXT: sb a4, 2(a2) +; RV64I-NEXT: sb a6, 3(a2) +; RV64I-NEXT: and a0, a3, a5 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: sb a0, 5(a2) +; RV64I-NEXT: sb a7, 6(a2) +; RV64I-NEXT: sb t0, 7(a2) +; RV64I-NEXT: and a0, a1, a5 +; 
RV64I-NEXT: and a3, t1, a5 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a1, 8(a2) +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: sb t2, 10(a2) +; RV64I-NEXT: sb t3, 11(a2) +; RV64I-NEXT: sb t1, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: sb t4, 14(a2) +; RV64I-NEXT: sb t5, 15(a2) +; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV32I-LABEL: ashr_16bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a5, 1(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu t2, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu t1, 7(a0) +; RV32I-NEXT: lbu t4, 8(a0) +; RV32I-NEXT: lbu t5, 9(a0) +; RV32I-NEXT: lbu t6, 10(a0) +; RV32I-NEXT: lbu s0, 11(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t3, a7, a6 +; RV32I-NEXT: or t1, t1, t0 +; RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu a7, 13(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 0(a1) +; RV32I-NEXT: lbu t6, 1(a1) +; RV32I-NEXT: lbu s0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or s1, a0, t0 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or t6, t6, a7 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or s0, a1, s0 +; RV32I-NEXT: li t0, 32 +; RV32I-NEXT: slli a1, a5, 8 +; RV32I-NEXT: slli a0, t2, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: or a7, t5, t4 +; RV32I-NEXT: or a5, s1, a6 +; RV32I-NEXT: or a6, s0, t6 +; RV32I-NEXT: slli a6, a6, 5 +; RV32I-NEXT: srl t2, a7, a6 +; RV32I-NEXT: neg t6, a6 +; RV32I-NEXT: sll t4, a5, t6 +; RV32I-NEXT: bltu a6, t0, .LBB11_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sra t5, a5, a6 +; RV32I-NEXT: j .LBB11_3 +; RV32I-NEXT: .LBB11_2: +; RV32I-NEXT: or t5, t2, t4 +; RV32I-NEXT: .LBB11_3: +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: or a3, a0, a4 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: mv a0, a7 +; RV32I-NEXT: beqz a6, .LBB11_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a0, t5 +; RV32I-NEXT: .LBB11_5: +; RV32I-NEXT: or a4, t3, a1 +; RV32I-NEXT: or a3, t1, a3 +; RV32I-NEXT: bltu a6, t0, .LBB11_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srai a1, a5, 31 +; RV32I-NEXT: srl t5, a3, a6 +; RV32I-NEXT: j .LBB11_8 +; RV32I-NEXT: .LBB11_7: +; RV32I-NEXT: sra a1, a5, a6 +; RV32I-NEXT: srl t1, a4, a6 +; RV32I-NEXT: sll t3, a3, t6 +; RV32I-NEXT: or t5, t1, t3 +; RV32I-NEXT: .LBB11_8: +; RV32I-NEXT: li t1, 64 +; RV32I-NEXT: mv t3, a4 +; RV32I-NEXT: beqz a6, .LBB11_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv t3, t5 +; RV32I-NEXT: .LBB11_10: +; RV32I-NEXT: sub s0, t1, a6 +; RV32I-NEXT: bltu a6, t0, .LBB11_13 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li t5, 0 +; RV32I-NEXT: bgeu s0, t0, .LBB11_14 +; RV32I-NEXT: .LBB11_12: +; RV32I-NEXT: sll t6, a7, t6 +; RV32I-NEXT: neg s1, s0 +; RV32I-NEXT: srl s1, a7, s1 +; RV32I-NEXT: or s2, s1, t4 +; RV32I-NEXT: j .LBB11_15 +; RV32I-NEXT: .LBB11_13: +; RV32I-NEXT: srl t5, a3, a6 +; RV32I-NEXT: bltu s0, t0, .LBB11_12 +; RV32I-NEXT: .LBB11_14: +; RV32I-NEXT: li t6, 0 +; 
RV32I-NEXT: sll s2, a7, s0 +; RV32I-NEXT: .LBB11_15: +; RV32I-NEXT: addi s1, a6, -64 +; RV32I-NEXT: mv t4, a5 +; RV32I-NEXT: beqz s0, .LBB11_17 +; RV32I-NEXT: # %bb.16: +; RV32I-NEXT: mv t4, s2 +; RV32I-NEXT: .LBB11_17: +; RV32I-NEXT: bltu s1, t0, .LBB11_19 +; RV32I-NEXT: # %bb.18: +; RV32I-NEXT: sra t2, a5, s1 +; RV32I-NEXT: bnez s1, .LBB11_20 +; RV32I-NEXT: j .LBB11_21 +; RV32I-NEXT: .LBB11_19: +; RV32I-NEXT: neg s0, s1 +; RV32I-NEXT: sll s0, a5, s0 +; RV32I-NEXT: or t2, t2, s0 +; RV32I-NEXT: beqz s1, .LBB11_21 +; RV32I-NEXT: .LBB11_20: +; RV32I-NEXT: mv a7, t2 +; RV32I-NEXT: .LBB11_21: +; RV32I-NEXT: bltu s1, t0, .LBB11_23 +; RV32I-NEXT: # %bb.22: +; RV32I-NEXT: srai t0, a5, 31 +; RV32I-NEXT: bltu a6, t1, .LBB11_24 +; RV32I-NEXT: j .LBB11_25 +; RV32I-NEXT: .LBB11_23: +; RV32I-NEXT: sra t0, a5, a6 +; RV32I-NEXT: bgeu a6, t1, .LBB11_25 +; RV32I-NEXT: .LBB11_24: +; RV32I-NEXT: or a7, t3, t6 +; RV32I-NEXT: or t0, t5, t4 +; RV32I-NEXT: .LBB11_25: +; RV32I-NEXT: bnez a6, .LBB11_29 +; RV32I-NEXT: # %bb.26: +; RV32I-NEXT: bltu a6, t1, .LBB11_28 +; RV32I-NEXT: .LBB11_27: +; RV32I-NEXT: srai a0, a5, 31 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: .LBB11_28: +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: lui a6, 16 +; RV32I-NEXT: srli a7, a4, 24 +; RV32I-NEXT: srli t0, a3, 16 +; RV32I-NEXT: srli t1, a3, 24 +; RV32I-NEXT: srli t2, a0, 16 +; RV32I-NEXT: srli t3, a0, 24 +; RV32I-NEXT: srli t4, a1, 16 +; RV32I-NEXT: srli t5, a1, 24 +; RV32I-NEXT: addi a6, a6, -1 +; RV32I-NEXT: and t6, a4, a6 +; RV32I-NEXT: srli t6, t6, 8 +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb t6, 1(a2) +; RV32I-NEXT: sb a5, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: and a4, a3, a6 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a3, 4(a2) +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: and a3, a0, a6 +; RV32I-NEXT: and a4, a1, a6 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a0, 8(a2) +; RV32I-NEXT: sb a3, 9(a2) +; RV32I-NEXT: sb t2, 10(a2) +; RV32I-NEXT: sb t3, 11(a2) +; RV32I-NEXT: sb a1, 12(a2) +; RV32I-NEXT: sb a4, 13(a2) +; RV32I-NEXT: sb t4, 14(a2) +; RV32I-NEXT: sb t5, 15(a2) +; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB11_29: +; RV32I-NEXT: mv a4, a7 +; RV32I-NEXT: mv a3, t0 +; RV32I-NEXT: bgeu a6, t1, .LBB11_27 +; RV32I-NEXT: j .LBB11_28 + %src = load i128, ptr %src.ptr, align 1 + %wordOff = load i128, ptr %wordOff.ptr, align 1 + %bitOff = shl i128 %wordOff, 5 + %res = ashr i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + +define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: lshr_32bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -96 +; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; 
RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a4, t4, t3 +; RV64I-NEXT: or a6, t6, t5 +; RV64I-NEXT: or t0, s1, s0 +; RV64I-NEXT: lbu t5, 24(a0) +; RV64I-NEXT: lbu t6, 25(a0) +; RV64I-NEXT: lbu s0, 26(a0) +; RV64I-NEXT: lbu s1, 27(a0) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or t4, s3, s2 +; RV64I-NEXT: or t2, s5, s4 +; RV64I-NEXT: or t3, s7, s6 +; RV64I-NEXT: lbu s2, 28(a0) +; RV64I-NEXT: lbu s3, 29(a0) +; RV64I-NEXT: lbu s4, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: lbu t6, 0(a1) +; RV64I-NEXT: lbu s1, 1(a1) +; RV64I-NEXT: lbu s7, 2(a1) +; RV64I-NEXT: lbu s8, 3(a1) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, a0, s4 +; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: lbu a0, 4(a1) +; RV64I-NEXT: lbu s1, 5(a1) +; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: or s7, s8, s7 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or s4, a1, s4 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a1, t1, a7 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or a0, t4, t0 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or t0, t3, t2 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: or t1, s6, s5 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: or t3, s0, t5 +; RV64I-NEXT: slli s3, s3, 16 +; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: or t6, s7, t6 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: li a7, 64 +; RV64I-NEXT: slli t4, a5, 16 +; RV64I-NEXT: slli t2, a6, 16 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli t5, t5, 32 +; RV64I-NEXT: slli s0, s0, 32 +; RV64I-NEXT: or a6, t1, t0 +; RV64I-NEXT: or t0, t5, t3 +; RV64I-NEXT: or a5, s0, t6 +; RV64I-NEXT: slli a5, a5, 3 +; RV64I-NEXT: sub t1, a5, a7 +; RV64I-NEXT: negw t5, a5 +; RV64I-NEXT: sll t3, t0, t5 +; RV64I-NEXT: bltu a5, a7, .LBB12_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: srl t6, t0, t1 +; RV64I-NEXT: j .LBB12_3 +; RV64I-NEXT: .LBB12_2: +; RV64I-NEXT: srl t6, a6, a5 +; RV64I-NEXT: or t6, t6, t3 +; RV64I-NEXT: .LBB12_3: +; RV64I-NEXT: or a3, t4, a3 +; RV64I-NEXT: slli t4, a1, 32 +; RV64I-NEXT: or t2, t2, a4 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: mv a1, a6 +; 
RV64I-NEXT: beqz a5, .LBB12_5 +; RV64I-NEXT: # %bb.4: +; RV64I-NEXT: mv a1, t6 +; RV64I-NEXT: .LBB12_5: +; RV64I-NEXT: or a4, t4, a3 +; RV64I-NEXT: or a3, a0, t2 +; RV64I-NEXT: bltu a5, a7, .LBB12_7 +; RV64I-NEXT: # %bb.6: +; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: srl t4, a3, t1 +; RV64I-NEXT: j .LBB12_8 +; RV64I-NEXT: .LBB12_7: +; RV64I-NEXT: srl a0, t0, a5 +; RV64I-NEXT: srl t1, a4, a5 +; RV64I-NEXT: sll t2, a3, t5 +; RV64I-NEXT: or t4, t1, t2 +; RV64I-NEXT: .LBB12_8: +; RV64I-NEXT: li t1, 128 +; RV64I-NEXT: mv t2, a4 +; RV64I-NEXT: beqz a5, .LBB12_10 +; RV64I-NEXT: # %bb.9: +; RV64I-NEXT: mv t2, t4 +; RV64I-NEXT: .LBB12_10: +; RV64I-NEXT: sub t6, t1, a5 +; RV64I-NEXT: bltu a5, a7, .LBB12_13 +; RV64I-NEXT: # %bb.11: +; RV64I-NEXT: li t4, 0 +; RV64I-NEXT: bgeu t6, a7, .LBB12_14 +; RV64I-NEXT: .LBB12_12: +; RV64I-NEXT: sll t5, a6, t5 +; RV64I-NEXT: negw s0, t6 +; RV64I-NEXT: srl s0, a6, s0 +; RV64I-NEXT: or s1, s0, t3 +; RV64I-NEXT: j .LBB12_15 +; RV64I-NEXT: .LBB12_13: +; RV64I-NEXT: srl t4, a3, a5 +; RV64I-NEXT: bltu t6, a7, .LBB12_12 +; RV64I-NEXT: .LBB12_14: +; RV64I-NEXT: li t5, 0 +; RV64I-NEXT: sub t3, t6, a7 +; RV64I-NEXT: sll s1, a6, t3 +; RV64I-NEXT: .LBB12_15: +; RV64I-NEXT: sub s0, a5, t1 +; RV64I-NEXT: mv t3, t0 +; RV64I-NEXT: beqz t6, .LBB12_17 +; RV64I-NEXT: # %bb.16: +; RV64I-NEXT: mv t3, s1 +; RV64I-NEXT: .LBB12_17: +; RV64I-NEXT: bltu s0, a7, .LBB12_19 +; RV64I-NEXT: # %bb.18: +; RV64I-NEXT: sub t6, s0, a7 +; RV64I-NEXT: srl t6, t0, t6 +; RV64I-NEXT: bnez s0, .LBB12_20 +; RV64I-NEXT: j .LBB12_21 +; RV64I-NEXT: .LBB12_19: +; RV64I-NEXT: srl t6, a6, s0 +; RV64I-NEXT: negw s1, s0 +; RV64I-NEXT: sll s1, t0, s1 +; RV64I-NEXT: or t6, t6, s1 +; RV64I-NEXT: beqz s0, .LBB12_21 +; RV64I-NEXT: .LBB12_20: +; RV64I-NEXT: mv a6, t6 +; RV64I-NEXT: .LBB12_21: +; RV64I-NEXT: bltu s0, a7, .LBB12_23 +; RV64I-NEXT: # %bb.22: +; RV64I-NEXT: li a7, 0 +; RV64I-NEXT: bltu a5, t1, .LBB12_24 +; RV64I-NEXT: j .LBB12_25 +; RV64I-NEXT: .LBB12_23: +; RV64I-NEXT: srl a7, t0, s0 +; RV64I-NEXT: bgeu a5, t1, .LBB12_25 +; RV64I-NEXT: .LBB12_24: +; RV64I-NEXT: or a6, t2, t5 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: .LBB12_25: +; RV64I-NEXT: bnez a5, .LBB12_29 +; RV64I-NEXT: # %bb.26: +; RV64I-NEXT: bltu a5, t1, .LBB12_28 +; RV64I-NEXT: .LBB12_27: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: .LBB12_28: +; RV64I-NEXT: srli a5, a4, 32 +; RV64I-NEXT: srliw a6, a4, 16 +; RV64I-NEXT: lui t2, 16 +; RV64I-NEXT: srliw t1, a4, 24 +; RV64I-NEXT: srli t0, a4, 48 +; RV64I-NEXT: srli t5, a4, 56 +; RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srliw t4, a3, 16 +; RV64I-NEXT: srliw s0, a3, 24 +; RV64I-NEXT: srli t6, a3, 48 +; RV64I-NEXT: srli s3, a3, 56 +; RV64I-NEXT: srli t3, a1, 32 +; RV64I-NEXT: srliw s2, a1, 16 +; RV64I-NEXT: srliw s6, a1, 24 +; RV64I-NEXT: srli s4, a1, 48 +; RV64I-NEXT: srli s7, a1, 56 +; RV64I-NEXT: srli s1, a0, 32 +; RV64I-NEXT: srliw s5, a0, 16 +; RV64I-NEXT: srliw s8, a0, 24 +; RV64I-NEXT: srli s9, a0, 48 +; RV64I-NEXT: srli s10, a0, 56 +; RV64I-NEXT: addi t2, t2, -1 +; RV64I-NEXT: and s11, a4, t2 +; RV64I-NEXT: srli s11, s11, 8 +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb s11, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb t1, 3(a2) +; RV64I-NEXT: and a4, a5, t2 +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb a4, 5(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: and a4, a3, t2 +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: sb a4, 9(a2) +; RV64I-NEXT: sb t4, 10(a2) +; RV64I-NEXT: sb s0, 11(a2) +; 
RV64I-NEXT: and a3, a7, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: sb t6, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) +; RV64I-NEXT: and a3, a1, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a1, 16(a2) +; RV64I-NEXT: sb a3, 17(a2) +; RV64I-NEXT: sb s2, 18(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: and a1, t3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb t3, 20(a2) +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: sb s4, 22(a2) +; RV64I-NEXT: sb s7, 23(a2) +; RV64I-NEXT: and a1, a0, t2 +; RV64I-NEXT: and a3, s1, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 24(a2) +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: sb s5, 26(a2) +; RV64I-NEXT: sb s8, 27(a2) +; RV64I-NEXT: sb s1, 28(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb s9, 30(a2) +; RV64I-NEXT: sb s10, 31(a2) +; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 96 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB12_29: +; RV64I-NEXT: mv a4, a6 +; RV64I-NEXT: mv a3, a7 +; RV64I-NEXT: bgeu a5, t1, .LBB12_27 +; RV64I-NEXT: j .LBB12_28 +; +; RV32I-LABEL: lshr_32bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -96 +; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu a4, 17(a0) +; RV32I-NEXT: lbu a5, 18(a0) +; RV32I-NEXT: lbu a6, 19(a0) +; RV32I-NEXT: lbu a7, 20(a0) +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 22(a0) +; RV32I-NEXT: lbu t2, 23(a0) +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu t4, 25(a0) +; RV32I-NEXT: lbu t5, 26(a0) +; RV32I-NEXT: lbu t6, 27(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a6, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a4, t2, t1 +; RV32I-NEXT: lbu a7, 28(a0) +; RV32I-NEXT: lbu t0, 29(a0) +; RV32I-NEXT: lbu t1, 30(a0) +; RV32I-NEXT: lbu t2, 31(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t4, t4, t3 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: lbu t3, 1(a1) +; RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: 
or t0, t3, t0 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: li t3, 32 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a1, a4, 16 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli a4, t1, 16 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: or t1, t5, t4 +; RV32I-NEXT: or t5, a4, a7 +; RV32I-NEXT: or a4, t2, t0 +; RV32I-NEXT: slli a4, a4, 3 +; RV32I-NEXT: srl s0, t1, a4 +; RV32I-NEXT: neg s6, a4 +; RV32I-NEXT: sll t4, t5, s6 +; RV32I-NEXT: bltu a4, t3, .LBB12_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a7, t5, a4 +; RV32I-NEXT: j .LBB12_3 +; RV32I-NEXT: .LBB12_2: +; RV32I-NEXT: or a7, s0, t4 +; RV32I-NEXT: .LBB12_3: +; RV32I-NEXT: or t0, a6, a3 +; RV32I-NEXT: or a6, a1, a5 +; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: beqz a4, .LBB12_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a1, a7 +; RV32I-NEXT: .LBB12_5: +; RV32I-NEXT: srl a3, t0, a4 +; RV32I-NEXT: sll a5, a6, s6 +; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu a4, t3, .LBB12_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: srl a3, a6, a4 +; RV32I-NEXT: j .LBB12_8 +; RV32I-NEXT: .LBB12_7: +; RV32I-NEXT: srl ra, t5, a4 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: .LBB12_8: +; RV32I-NEXT: li t6, 64 +; RV32I-NEXT: mv a7, t0 +; RV32I-NEXT: beqz a4, .LBB12_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a7, a3 +; RV32I-NEXT: .LBB12_10: +; RV32I-NEXT: sw a5, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sub s7, t6, a4 +; RV32I-NEXT: bltu a4, t3, .LBB12_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li a5, 0 +; RV32I-NEXT: j .LBB12_13 +; RV32I-NEXT: .LBB12_12: +; RV32I-NEXT: srl a5, a6, a4 +; RV32I-NEXT: .LBB12_13: +; RV32I-NEXT: neg s10, s7 +; RV32I-NEXT: sw t4, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s7, t3, .LBB12_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: li t2, 0 +; RV32I-NEXT: sll a3, t1, s7 +; RV32I-NEXT: j .LBB12_16 +; RV32I-NEXT: .LBB12_15: +; RV32I-NEXT: sll t2, t1, s6 +; RV32I-NEXT: srl a3, t1, s10 +; RV32I-NEXT: or a3, a3, t4 +; RV32I-NEXT: .LBB12_16: +; RV32I-NEXT: addi s9, a4, -64 +; RV32I-NEXT: mv t4, t5 +; RV32I-NEXT: beqz s7, .LBB12_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: mv t4, a3 +; RV32I-NEXT: .LBB12_18: +; RV32I-NEXT: neg s11, s9 +; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s9, t3, .LBB12_20 +; RV32I-NEXT: # %bb.19: +; RV32I-NEXT: srl s2, t5, s9 +; RV32I-NEXT: j .LBB12_21 +; RV32I-NEXT: .LBB12_20: +; RV32I-NEXT: sll a3, t5, s11 +; RV32I-NEXT: or s2, s0, a3 +; RV32I-NEXT: .LBB12_21: +; RV32I-NEXT: lbu s1, 11(a0) +; RV32I-NEXT: lbu a3, 15(a0) +; RV32I-NEXT: mv s0, t1 +; RV32I-NEXT: beqz s9, .LBB12_23 +; RV32I-NEXT: # %bb.22: +; RV32I-NEXT: mv s0, s2 +; RV32I-NEXT: .LBB12_23: +; RV32I-NEXT: lbu s4, 9(a0) +; RV32I-NEXT: lbu s2, 10(a0) +; RV32I-NEXT: lbu s5, 13(a0) +; RV32I-NEXT: lbu s8, 14(a0) +; RV32I-NEXT: slli s3, s1, 8 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s9, t3, .LBB12_25 +; RV32I-NEXT: # %bb.24: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: j .LBB12_26 +; RV32I-NEXT: .LBB12_25: +; RV32I-NEXT: srl s1, t5, a4 +; RV32I-NEXT: .LBB12_26: +; RV32I-NEXT: or s2, s3, s2 +; RV32I-NEXT: lbu ra, 8(a0) +; RV32I-NEXT: lbu s3, 12(a0) +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: or s8, a3, s8 +; RV32I-NEXT: bgeu a4, t6, .LBB12_28 +; RV32I-NEXT: # %bb.27: +; RV32I-NEXT: or s0, a7, t2 +; RV32I-NEXT: or s1, a5, t4 +; RV32I-NEXT: .LBB12_28: +; RV32I-NEXT: lbu a3, 3(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: or a5, s4, ra +; RV32I-NEXT: slli t4, s2, 16 +; 
RV32I-NEXT: or s2, s5, s3 +; RV32I-NEXT: slli s3, s8, 16 +; RV32I-NEXT: mv s4, t0 +; RV32I-NEXT: mv a7, a6 +; RV32I-NEXT: beqz a4, .LBB12_30 +; RV32I-NEXT: # %bb.29: +; RV32I-NEXT: mv s4, s0 +; RV32I-NEXT: mv a7, s1 +; RV32I-NEXT: .LBB12_30: +; RV32I-NEXT: slli s5, a3, 8 +; RV32I-NEXT: lbu ra, 1(a0) +; RV32I-NEXT: lbu a3, 2(a0) +; RV32I-NEXT: lbu s1, 5(a0) +; RV32I-NEXT: lbu s0, 6(a0) +; RV32I-NEXT: slli s8, t2, 8 +; RV32I-NEXT: or t4, t4, a5 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: bltu a4, t6, .LBB12_32 +; RV32I-NEXT: # %bb.31: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: .LBB12_32: +; RV32I-NEXT: slli s3, ra, 8 +; RV32I-NEXT: or a5, s5, a3 +; RV32I-NEXT: lbu s5, 0(a0) +; RV32I-NEXT: lbu a0, 4(a0) +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: or a3, s8, s0 +; RV32I-NEXT: srl s2, t4, a4 +; RV32I-NEXT: sll ra, t2, s6 +; RV32I-NEXT: bltu a4, t3, .LBB12_34 +; RV32I-NEXT: # %bb.33: +; RV32I-NEXT: srl s0, t2, a4 +; RV32I-NEXT: j .LBB12_35 +; RV32I-NEXT: .LBB12_34: +; RV32I-NEXT: or s0, s2, ra +; RV32I-NEXT: .LBB12_35: +; RV32I-NEXT: or s3, s3, s5 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: or a0, s1, a0 +; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: mv s5, t4 +; RV32I-NEXT: beqz a4, .LBB12_37 +; RV32I-NEXT: # %bb.36: +; RV32I-NEXT: mv s5, s0 +; RV32I-NEXT: .LBB12_37: +; RV32I-NEXT: or s0, a5, s3 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: bltu a4, t3, .LBB12_39 +; RV32I-NEXT: # %bb.38: +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: srl a3, a0, a4 +; RV32I-NEXT: mv a5, s0 +; RV32I-NEXT: bnez a4, .LBB12_40 +; RV32I-NEXT: j .LBB12_41 +; RV32I-NEXT: .LBB12_39: +; RV32I-NEXT: srl s8, t2, a4 +; RV32I-NEXT: srl a3, s0, a4 +; RV32I-NEXT: sll a5, a0, s6 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: mv a5, s0 +; RV32I-NEXT: beqz a4, .LBB12_41 +; RV32I-NEXT: .LBB12_40: +; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: .LBB12_41: +; RV32I-NEXT: bltu a4, t3, .LBB12_44 +; RV32I-NEXT: # %bb.42: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: bgeu s7, t3, .LBB12_45 +; RV32I-NEXT: .LBB12_43: +; RV32I-NEXT: sll s3, t4, s6 +; RV32I-NEXT: srl a3, t4, s10 +; RV32I-NEXT: or a3, a3, ra +; RV32I-NEXT: mv s10, t2 +; RV32I-NEXT: bnez s7, .LBB12_46 +; RV32I-NEXT: j .LBB12_47 +; RV32I-NEXT: .LBB12_44: +; RV32I-NEXT: srl s1, a0, a4 +; RV32I-NEXT: bltu s7, t3, .LBB12_43 +; RV32I-NEXT: .LBB12_45: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t4, s7 +; RV32I-NEXT: mv s10, t2 +; RV32I-NEXT: beqz s7, .LBB12_47 +; RV32I-NEXT: .LBB12_46: +; RV32I-NEXT: mv s10, a3 +; RV32I-NEXT: .LBB12_47: +; RV32I-NEXT: bltu s9, t3, .LBB12_49 +; RV32I-NEXT: # %bb.48: +; RV32I-NEXT: srl a3, t2, s9 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: bnez s9, .LBB12_50 +; RV32I-NEXT: j .LBB12_51 +; RV32I-NEXT: .LBB12_49: +; RV32I-NEXT: sll a3, t2, s11 +; RV32I-NEXT: or a3, s2, a3 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: beqz s9, .LBB12_51 +; RV32I-NEXT: .LBB12_50: +; RV32I-NEXT: mv s2, a3 +; RV32I-NEXT: .LBB12_51: +; RV32I-NEXT: bltu s9, t3, .LBB12_53 +; RV32I-NEXT: # %bb.52: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: bltu a4, t6, .LBB12_54 +; RV32I-NEXT: j .LBB12_55 +; RV32I-NEXT: .LBB12_53: +; RV32I-NEXT: srl s7, t2, a4 +; RV32I-NEXT: bgeu a4, t6, .LBB12_55 +; RV32I-NEXT: .LBB12_54: +; RV32I-NEXT: or s2, a5, s3 +; RV32I-NEXT: or s7, s1, s10 +; RV32I-NEXT: .LBB12_55: +; RV32I-NEXT: li a3, 128 +; RV32I-NEXT: mv a5, s0 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: beqz a4, .LBB12_57 +; RV32I-NEXT: # %bb.56: +; RV32I-NEXT: mv a5, s2 +; RV32I-NEXT: mv s1, s7 +; RV32I-NEXT: .LBB12_57: +; RV32I-NEXT: sw a5, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sub 
s2, a3, a4 +; RV32I-NEXT: bltu a4, t6, .LBB12_59 +; RV32I-NEXT: # %bb.58: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: .LBB12_59: +; RV32I-NEXT: neg s3, s2 +; RV32I-NEXT: srl a5, t0, s3 +; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t3, .LBB12_61 +; RV32I-NEXT: # %bb.60: +; RV32I-NEXT: li s10, 0 +; RV32I-NEXT: sll a3, t0, s2 +; RV32I-NEXT: j .LBB12_62 +; RV32I-NEXT: .LBB12_61: +; RV32I-NEXT: sll s10, t0, s6 +; RV32I-NEXT: lw a3, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: .LBB12_62: +; RV32I-NEXT: sub s1, t6, s2 +; RV32I-NEXT: mv s8, a6 +; RV32I-NEXT: beqz s2, .LBB12_64 +; RV32I-NEXT: # %bb.63: +; RV32I-NEXT: mv s8, a3 +; RV32I-NEXT: .LBB12_64: +; RV32I-NEXT: bltu s1, t3, .LBB12_66 +; RV32I-NEXT: # %bb.65: +; RV32I-NEXT: srl a3, a6, s1 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: bnez s1, .LBB12_67 +; RV32I-NEXT: j .LBB12_68 +; RV32I-NEXT: .LBB12_66: +; RV32I-NEXT: neg a3, s1 +; RV32I-NEXT: sll a3, a6, a3 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: beqz s1, .LBB12_68 +; RV32I-NEXT: .LBB12_67: +; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: .LBB12_68: +; RV32I-NEXT: bltu s1, t3, .LBB12_71 +; RV32I-NEXT: # %bb.69: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: bgeu s2, t3, .LBB12_72 +; RV32I-NEXT: .LBB12_70: +; RV32I-NEXT: sll s6, t1, s6 +; RV32I-NEXT: srl a3, t1, s3 +; RV32I-NEXT: lw s3, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: j .LBB12_73 +; RV32I-NEXT: .LBB12_71: +; RV32I-NEXT: srl s1, a6, s3 +; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t3, .LBB12_70 +; RV32I-NEXT: .LBB12_72: +; RV32I-NEXT: li s6, 0 +; RV32I-NEXT: sll a3, t1, s2 +; RV32I-NEXT: .LBB12_73: +; RV32I-NEXT: addi s9, s2, -64 +; RV32I-NEXT: mv s5, t5 +; RV32I-NEXT: beqz s2, .LBB12_75 +; RV32I-NEXT: # %bb.74: +; RV32I-NEXT: mv s5, a3 +; RV32I-NEXT: .LBB12_75: +; RV32I-NEXT: bltu s9, t3, .LBB12_77 +; RV32I-NEXT: # %bb.76: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t0, s9 +; RV32I-NEXT: mv s7, a6 +; RV32I-NEXT: bnez s9, .LBB12_78 +; RV32I-NEXT: j .LBB12_79 +; RV32I-NEXT: .LBB12_77: +; RV32I-NEXT: sll s3, t0, s2 +; RV32I-NEXT: neg a3, s9 +; RV32I-NEXT: srl a3, t0, a3 +; RV32I-NEXT: sll s7, a6, s2 +; RV32I-NEXT: or a3, a3, s7 +; RV32I-NEXT: mv s7, a6 +; RV32I-NEXT: beqz s9, .LBB12_79 +; RV32I-NEXT: .LBB12_78: +; RV32I-NEXT: mv s7, a3 +; RV32I-NEXT: .LBB12_79: +; RV32I-NEXT: bltu s2, t6, .LBB12_81 +; RV32I-NEXT: # %bb.80: +; RV32I-NEXT: sw zero, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: j .LBB12_82 +; RV32I-NEXT: .LBB12_81: +; RV32I-NEXT: sw s10, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: or s3, a5, s6 +; RV32I-NEXT: or s7, s1, s5 +; RV32I-NEXT: .LBB12_82: +; RV32I-NEXT: addi ra, a4, -128 +; RV32I-NEXT: mv s5, t1 +; RV32I-NEXT: mv s6, t5 +; RV32I-NEXT: beqz s2, .LBB12_84 +; RV32I-NEXT: # %bb.83: +; RV32I-NEXT: mv s5, s3 +; RV32I-NEXT: mv s6, s7 +; RV32I-NEXT: .LBB12_84: +; RV32I-NEXT: neg s9, ra +; RV32I-NEXT: sll s3, t5, s9 +; RV32I-NEXT: bltu ra, t3, .LBB12_86 +; RV32I-NEXT: # %bb.85: +; RV32I-NEXT: srl a3, t5, ra +; RV32I-NEXT: mv s1, t1 +; RV32I-NEXT: bnez ra, .LBB12_87 +; RV32I-NEXT: j .LBB12_88 +; RV32I-NEXT: .LBB12_86: +; RV32I-NEXT: lw a3, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: mv s1, t1 +; RV32I-NEXT: beqz ra, .LBB12_88 +; RV32I-NEXT: .LBB12_87: +; RV32I-NEXT: mv s1, a3 +; RV32I-NEXT: .LBB12_88: +; RV32I-NEXT: bltu ra, 
t3, .LBB12_90 +; RV32I-NEXT: # %bb.89: +; RV32I-NEXT: li s2, 0 +; RV32I-NEXT: srl a3, a6, ra +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: bnez ra, .LBB12_91 +; RV32I-NEXT: j .LBB12_92 +; RV32I-NEXT: .LBB12_90: +; RV32I-NEXT: srl s2, t5, a4 +; RV32I-NEXT: sll a3, a6, s9 +; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: beqz ra, .LBB12_92 +; RV32I-NEXT: .LBB12_91: +; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: .LBB12_92: +; RV32I-NEXT: sub s10, t6, ra +; RV32I-NEXT: bltu ra, t3, .LBB12_95 +; RV32I-NEXT: # %bb.93: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: bgeu s10, t3, .LBB12_96 +; RV32I-NEXT: .LBB12_94: +; RV32I-NEXT: sll s9, t1, s9 +; RV32I-NEXT: neg a3, s10 +; RV32I-NEXT: srl a3, t1, a3 +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: j .LBB12_97 +; RV32I-NEXT: .LBB12_95: +; RV32I-NEXT: srl s7, a6, a4 +; RV32I-NEXT: bltu s10, t3, .LBB12_94 +; RV32I-NEXT: .LBB12_96: +; RV32I-NEXT: li s9, 0 +; RV32I-NEXT: sll a3, t1, s10 +; RV32I-NEXT: .LBB12_97: +; RV32I-NEXT: addi s11, ra, -64 +; RV32I-NEXT: mv s3, t5 +; RV32I-NEXT: beqz s10, .LBB12_99 +; RV32I-NEXT: # %bb.98: +; RV32I-NEXT: mv s3, a3 +; RV32I-NEXT: .LBB12_99: +; RV32I-NEXT: bltu s11, t3, .LBB12_101 +; RV32I-NEXT: # %bb.100: +; RV32I-NEXT: srl a3, t5, s11 +; RV32I-NEXT: bnez s11, .LBB12_102 +; RV32I-NEXT: j .LBB12_103 +; RV32I-NEXT: .LBB12_101: +; RV32I-NEXT: srl a3, t1, ra +; RV32I-NEXT: neg s10, s11 +; RV32I-NEXT: sll s10, t5, s10 +; RV32I-NEXT: or a3, a3, s10 +; RV32I-NEXT: beqz s11, .LBB12_103 +; RV32I-NEXT: .LBB12_102: +; RV32I-NEXT: mv t1, a3 +; RV32I-NEXT: .LBB12_103: +; RV32I-NEXT: bltu s11, t3, .LBB12_105 +; RV32I-NEXT: # %bb.104: +; RV32I-NEXT: li t3, 0 +; RV32I-NEXT: bltu ra, t6, .LBB12_106 +; RV32I-NEXT: j .LBB12_107 +; RV32I-NEXT: .LBB12_105: +; RV32I-NEXT: srl t3, t5, ra +; RV32I-NEXT: bgeu ra, t6, .LBB12_107 +; RV32I-NEXT: .LBB12_106: +; RV32I-NEXT: or t1, a5, s9 +; RV32I-NEXT: or t3, s7, s3 +; RV32I-NEXT: .LBB12_107: +; RV32I-NEXT: li a5, 128 +; RV32I-NEXT: bnez ra, .LBB12_114 +; RV32I-NEXT: # %bb.108: +; RV32I-NEXT: bgeu ra, t6, .LBB12_115 +; RV32I-NEXT: .LBB12_109: +; RV32I-NEXT: bltu a4, a5, .LBB12_116 +; RV32I-NEXT: .LBB12_110: +; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: bnez a4, .LBB12_117 +; RV32I-NEXT: .LBB12_111: +; RV32I-NEXT: bltu a4, a5, .LBB12_113 +; RV32I-NEXT: .LBB12_112: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: .LBB12_113: +; RV32I-NEXT: srli a4, s0, 16 +; RV32I-NEXT: lui t1, 16 +; RV32I-NEXT: srli t0, s0, 24 +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: srli t5, a0, 24 +; RV32I-NEXT: srli a6, t4, 16 +; RV32I-NEXT: srli s2, t4, 24 +; RV32I-NEXT: srli t3, t2, 16 +; RV32I-NEXT: srli s3, t2, 24 +; RV32I-NEXT: srli s1, s4, 16 +; RV32I-NEXT: srli a3, s4, 24 +; RV32I-NEXT: srli t6, a7, 16 +; RV32I-NEXT: srli s6, a7, 24 +; RV32I-NEXT: srli s5, a1, 16 +; RV32I-NEXT: srli s7, a1, 24 +; RV32I-NEXT: srli s8, ra, 16 +; RV32I-NEXT: srli s9, ra, 24 +; RV32I-NEXT: addi t1, t1, -1 +; RV32I-NEXT: and s10, s0, t1 +; RV32I-NEXT: and s11, a0, t1 +; RV32I-NEXT: srli s10, s10, 8 +; RV32I-NEXT: sb s0, 0(a2) +; RV32I-NEXT: sb s10, 1(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: and a4, t4, t1 +; RV32I-NEXT: srli t0, s11, 8 +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: sb t0, 5(a2) +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb t5, 7(a2) +; RV32I-NEXT: and a0, t2, t1 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb t4, 8(a2) +; RV32I-NEXT: sb a4, 9(a2) +; RV32I-NEXT: 
sb a6, 10(a2) +; RV32I-NEXT: sb s2, 11(a2) +; RV32I-NEXT: and a4, s4, t1 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb t2, 12(a2) +; RV32I-NEXT: sb a0, 13(a2) +; RV32I-NEXT: sb t3, 14(a2) +; RV32I-NEXT: sb s3, 15(a2) +; RV32I-NEXT: and a0, a7, t1 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb s4, 16(a2) +; RV32I-NEXT: sb a4, 17(a2) +; RV32I-NEXT: sb s1, 18(a2) +; RV32I-NEXT: sb a3, 19(a2) +; RV32I-NEXT: and a3, a1, t1 +; RV32I-NEXT: and a4, ra, t1 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a7, 20(a2) +; RV32I-NEXT: sb a0, 21(a2) +; RV32I-NEXT: sb t6, 22(a2) +; RV32I-NEXT: sb s6, 23(a2) +; RV32I-NEXT: sb a1, 24(a2) +; RV32I-NEXT: sb a3, 25(a2) +; RV32I-NEXT: sb s5, 26(a2) +; RV32I-NEXT: sb s7, 27(a2) +; RV32I-NEXT: sb ra, 28(a2) +; RV32I-NEXT: sb a4, 29(a2) +; RV32I-NEXT: sb s8, 30(a2) +; RV32I-NEXT: sb s9, 31(a2) +; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 96 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB12_114: +; RV32I-NEXT: mv t0, t1 +; RV32I-NEXT: mv a6, t3 +; RV32I-NEXT: bltu ra, t6, .LBB12_109 +; RV32I-NEXT: .LBB12_115: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: li s2, 0 +; RV32I-NEXT: bgeu a4, a5, .LBB12_110 +; RV32I-NEXT: .LBB12_116: +; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a6, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t0, a3, a6 +; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a6, a3, s8 +; RV32I-NEXT: lw a3, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s1, a3, s5 +; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s2, a3, s6 +; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: beqz a4, .LBB12_111 +; RV32I-NEXT: .LBB12_117: +; RV32I-NEXT: mv s0, t0 +; RV32I-NEXT: mv a0, a6 +; RV32I-NEXT: mv t4, s1 +; RV32I-NEXT: mv t2, s2 +; RV32I-NEXT: bgeu a4, a5, .LBB12_112 +; RV32I-NEXT: j .LBB12_113 + %src = load i256, ptr %src.ptr, align 1 + %byteOff = load i256, ptr %byteOff.ptr, align 1 + %bitOff = shl i256 %byteOff, 3 + %res = lshr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: lshr_32bytes_wordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -96 +; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu 
a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a4, t4, t3 +; RV64I-NEXT: or a6, t6, t5 +; RV64I-NEXT: or t0, s1, s0 +; RV64I-NEXT: lbu t5, 24(a0) +; RV64I-NEXT: lbu t6, 25(a0) +; RV64I-NEXT: lbu s0, 26(a0) +; RV64I-NEXT: lbu s1, 27(a0) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or t4, s3, s2 +; RV64I-NEXT: or t2, s5, s4 +; RV64I-NEXT: or t3, s7, s6 +; RV64I-NEXT: lbu s2, 28(a0) +; RV64I-NEXT: lbu s3, 29(a0) +; RV64I-NEXT: lbu s4, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: lbu t6, 0(a1) +; RV64I-NEXT: lbu s1, 1(a1) +; RV64I-NEXT: lbu s7, 2(a1) +; RV64I-NEXT: lbu s8, 3(a1) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, a0, s4 +; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: lbu a0, 4(a1) +; RV64I-NEXT: lbu s1, 5(a1) +; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: or s7, s8, s7 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or s4, a1, s4 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a1, t1, a7 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or a0, t4, t0 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or t0, t3, t2 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: or t1, s6, s5 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: or t3, s0, t5 +; RV64I-NEXT: slli s3, s3, 16 +; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: or t6, s7, t6 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: li a7, 64 +; RV64I-NEXT: slli t4, a5, 16 +; RV64I-NEXT: slli t2, a6, 16 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli t5, t5, 32 +; RV64I-NEXT: slli s0, s0, 32 +; RV64I-NEXT: or a6, t1, t0 +; RV64I-NEXT: or t0, t5, t3 +; RV64I-NEXT: or a5, s0, t6 +; RV64I-NEXT: slli a5, a5, 5 +; RV64I-NEXT: sub t1, a5, a7 +; RV64I-NEXT: negw t5, a5 +; RV64I-NEXT: sll t3, t0, t5 +; RV64I-NEXT: bltu a5, a7, .LBB13_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: srl t6, t0, t1 +; RV64I-NEXT: j .LBB13_3 +; RV64I-NEXT: .LBB13_2: +; RV64I-NEXT: srl t6, a6, a5 +; RV64I-NEXT: or t6, t6, t3 +; RV64I-NEXT: .LBB13_3: +; RV64I-NEXT: or a3, t4, a3 +; RV64I-NEXT: slli t4, a1, 32 +; RV64I-NEXT: or t2, t2, a4 +; RV64I-NEXT: slli a0, 
a0, 32 +; RV64I-NEXT: mv a1, a6 +; RV64I-NEXT: beqz a5, .LBB13_5 +; RV64I-NEXT: # %bb.4: +; RV64I-NEXT: mv a1, t6 +; RV64I-NEXT: .LBB13_5: +; RV64I-NEXT: or a4, t4, a3 +; RV64I-NEXT: or a3, a0, t2 +; RV64I-NEXT: bltu a5, a7, .LBB13_7 +; RV64I-NEXT: # %bb.6: +; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: srl t4, a3, t1 +; RV64I-NEXT: j .LBB13_8 +; RV64I-NEXT: .LBB13_7: +; RV64I-NEXT: srl a0, t0, a5 +; RV64I-NEXT: srl t1, a4, a5 +; RV64I-NEXT: sll t2, a3, t5 +; RV64I-NEXT: or t4, t1, t2 +; RV64I-NEXT: .LBB13_8: +; RV64I-NEXT: li t1, 128 +; RV64I-NEXT: mv t2, a4 +; RV64I-NEXT: beqz a5, .LBB13_10 +; RV64I-NEXT: # %bb.9: +; RV64I-NEXT: mv t2, t4 +; RV64I-NEXT: .LBB13_10: +; RV64I-NEXT: sub t6, t1, a5 +; RV64I-NEXT: bltu a5, a7, .LBB13_13 +; RV64I-NEXT: # %bb.11: +; RV64I-NEXT: li t4, 0 +; RV64I-NEXT: bgeu t6, a7, .LBB13_14 +; RV64I-NEXT: .LBB13_12: +; RV64I-NEXT: sll t5, a6, t5 +; RV64I-NEXT: negw s0, t6 +; RV64I-NEXT: srl s0, a6, s0 +; RV64I-NEXT: or s1, s0, t3 +; RV64I-NEXT: j .LBB13_15 +; RV64I-NEXT: .LBB13_13: +; RV64I-NEXT: srl t4, a3, a5 +; RV64I-NEXT: bltu t6, a7, .LBB13_12 +; RV64I-NEXT: .LBB13_14: +; RV64I-NEXT: li t5, 0 +; RV64I-NEXT: sub t3, t6, a7 +; RV64I-NEXT: sll s1, a6, t3 +; RV64I-NEXT: .LBB13_15: +; RV64I-NEXT: sub s0, a5, t1 +; RV64I-NEXT: mv t3, t0 +; RV64I-NEXT: beqz t6, .LBB13_17 +; RV64I-NEXT: # %bb.16: +; RV64I-NEXT: mv t3, s1 +; RV64I-NEXT: .LBB13_17: +; RV64I-NEXT: bltu s0, a7, .LBB13_19 +; RV64I-NEXT: # %bb.18: +; RV64I-NEXT: sub t6, s0, a7 +; RV64I-NEXT: srl t6, t0, t6 +; RV64I-NEXT: bnez s0, .LBB13_20 +; RV64I-NEXT: j .LBB13_21 +; RV64I-NEXT: .LBB13_19: +; RV64I-NEXT: srl t6, a6, s0 +; RV64I-NEXT: negw s1, s0 +; RV64I-NEXT: sll s1, t0, s1 +; RV64I-NEXT: or t6, t6, s1 +; RV64I-NEXT: beqz s0, .LBB13_21 +; RV64I-NEXT: .LBB13_20: +; RV64I-NEXT: mv a6, t6 +; RV64I-NEXT: .LBB13_21: +; RV64I-NEXT: bltu s0, a7, .LBB13_23 +; RV64I-NEXT: # %bb.22: +; RV64I-NEXT: li a7, 0 +; RV64I-NEXT: bltu a5, t1, .LBB13_24 +; RV64I-NEXT: j .LBB13_25 +; RV64I-NEXT: .LBB13_23: +; RV64I-NEXT: srl a7, t0, s0 +; RV64I-NEXT: bgeu a5, t1, .LBB13_25 +; RV64I-NEXT: .LBB13_24: +; RV64I-NEXT: or a6, t2, t5 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: .LBB13_25: +; RV64I-NEXT: bnez a5, .LBB13_29 +; RV64I-NEXT: # %bb.26: +; RV64I-NEXT: bltu a5, t1, .LBB13_28 +; RV64I-NEXT: .LBB13_27: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: .LBB13_28: +; RV64I-NEXT: srli a5, a4, 32 +; RV64I-NEXT: srliw a6, a4, 16 +; RV64I-NEXT: lui t2, 16 +; RV64I-NEXT: srliw t1, a4, 24 +; RV64I-NEXT: srli t0, a4, 48 +; RV64I-NEXT: srli t5, a4, 56 +; RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srliw t4, a3, 16 +; RV64I-NEXT: srliw s0, a3, 24 +; RV64I-NEXT: srli t6, a3, 48 +; RV64I-NEXT: srli s3, a3, 56 +; RV64I-NEXT: srli t3, a1, 32 +; RV64I-NEXT: srliw s2, a1, 16 +; RV64I-NEXT: srliw s6, a1, 24 +; RV64I-NEXT: srli s4, a1, 48 +; RV64I-NEXT: srli s7, a1, 56 +; RV64I-NEXT: srli s1, a0, 32 +; RV64I-NEXT: srliw s5, a0, 16 +; RV64I-NEXT: srliw s8, a0, 24 +; RV64I-NEXT: srli s9, a0, 48 +; RV64I-NEXT: srli s10, a0, 56 +; RV64I-NEXT: addi t2, t2, -1 +; RV64I-NEXT: and s11, a4, t2 +; RV64I-NEXT: srli s11, s11, 8 +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb s11, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb t1, 3(a2) +; RV64I-NEXT: and a4, a5, t2 +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb a4, 5(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: and a4, a3, t2 +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: sb a4, 9(a2) +; RV64I-NEXT: sb t4, 10(a2) 
+; RV64I-NEXT: sb s0, 11(a2) +; RV64I-NEXT: and a3, a7, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: sb t6, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) +; RV64I-NEXT: and a3, a1, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a1, 16(a2) +; RV64I-NEXT: sb a3, 17(a2) +; RV64I-NEXT: sb s2, 18(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: and a1, t3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb t3, 20(a2) +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: sb s4, 22(a2) +; RV64I-NEXT: sb s7, 23(a2) +; RV64I-NEXT: and a1, a0, t2 +; RV64I-NEXT: and a3, s1, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 24(a2) +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: sb s5, 26(a2) +; RV64I-NEXT: sb s8, 27(a2) +; RV64I-NEXT: sb s1, 28(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb s9, 30(a2) +; RV64I-NEXT: sb s10, 31(a2) +; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 96 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB13_29: +; RV64I-NEXT: mv a4, a6 +; RV64I-NEXT: mv a3, a7 +; RV64I-NEXT: bgeu a5, t1, .LBB13_27 +; RV64I-NEXT: j .LBB13_28 +; +; RV32I-LABEL: lshr_32bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -96 +; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu a4, 17(a0) +; RV32I-NEXT: lbu a5, 18(a0) +; RV32I-NEXT: lbu a6, 19(a0) +; RV32I-NEXT: lbu a7, 20(a0) +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 22(a0) +; RV32I-NEXT: lbu t2, 23(a0) +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu t4, 25(a0) +; RV32I-NEXT: lbu t5, 26(a0) +; RV32I-NEXT: lbu t6, 27(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a6, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a4, t2, t1 +; RV32I-NEXT: lbu a7, 28(a0) +; RV32I-NEXT: lbu t0, 29(a0) +; RV32I-NEXT: lbu t1, 30(a0) +; RV32I-NEXT: lbu t2, 31(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t4, t4, t3 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: lbu t3, 1(a1) +; RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 +; 
RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t0, t3, t0 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: li t3, 32 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a1, a4, 16 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli a4, t1, 16 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: or t1, t5, t4 +; RV32I-NEXT: or t5, a4, a7 +; RV32I-NEXT: or a4, t2, t0 +; RV32I-NEXT: slli a4, a4, 5 +; RV32I-NEXT: srl s0, t1, a4 +; RV32I-NEXT: neg s6, a4 +; RV32I-NEXT: sll t4, t5, s6 +; RV32I-NEXT: bltu a4, t3, .LBB13_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a7, t5, a4 +; RV32I-NEXT: j .LBB13_3 +; RV32I-NEXT: .LBB13_2: +; RV32I-NEXT: or a7, s0, t4 +; RV32I-NEXT: .LBB13_3: +; RV32I-NEXT: or t0, a6, a3 +; RV32I-NEXT: or a6, a1, a5 +; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: beqz a4, .LBB13_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a1, a7 +; RV32I-NEXT: .LBB13_5: +; RV32I-NEXT: srl a3, t0, a4 +; RV32I-NEXT: sll a5, a6, s6 +; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu a4, t3, .LBB13_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: srl a3, a6, a4 +; RV32I-NEXT: j .LBB13_8 +; RV32I-NEXT: .LBB13_7: +; RV32I-NEXT: srl ra, t5, a4 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: .LBB13_8: +; RV32I-NEXT: li t6, 64 +; RV32I-NEXT: mv a7, t0 +; RV32I-NEXT: beqz a4, .LBB13_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a7, a3 +; RV32I-NEXT: .LBB13_10: +; RV32I-NEXT: sw a5, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sub s7, t6, a4 +; RV32I-NEXT: bltu a4, t3, .LBB13_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li a5, 0 +; RV32I-NEXT: j .LBB13_13 +; RV32I-NEXT: .LBB13_12: +; RV32I-NEXT: srl a5, a6, a4 +; RV32I-NEXT: .LBB13_13: +; RV32I-NEXT: neg s10, s7 +; RV32I-NEXT: sw t4, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s7, t3, .LBB13_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: li t2, 0 +; RV32I-NEXT: sll a3, t1, s7 +; RV32I-NEXT: j .LBB13_16 +; RV32I-NEXT: .LBB13_15: +; RV32I-NEXT: sll t2, t1, s6 +; RV32I-NEXT: srl a3, t1, s10 +; RV32I-NEXT: or a3, a3, t4 +; RV32I-NEXT: .LBB13_16: +; RV32I-NEXT: addi s9, a4, -64 +; RV32I-NEXT: mv t4, t5 +; RV32I-NEXT: beqz s7, .LBB13_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: mv t4, a3 +; RV32I-NEXT: .LBB13_18: +; RV32I-NEXT: neg s11, s9 +; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s9, t3, .LBB13_20 +; RV32I-NEXT: # %bb.19: +; RV32I-NEXT: srl s2, t5, s9 +; RV32I-NEXT: j .LBB13_21 +; RV32I-NEXT: .LBB13_20: +; RV32I-NEXT: sll a3, t5, s11 +; RV32I-NEXT: or s2, s0, a3 +; RV32I-NEXT: .LBB13_21: +; RV32I-NEXT: lbu s1, 11(a0) +; RV32I-NEXT: lbu a3, 15(a0) +; RV32I-NEXT: mv s0, t1 +; RV32I-NEXT: beqz s9, .LBB13_23 +; RV32I-NEXT: # %bb.22: +; RV32I-NEXT: mv s0, s2 +; RV32I-NEXT: .LBB13_23: +; RV32I-NEXT: lbu s4, 9(a0) +; RV32I-NEXT: lbu s2, 10(a0) +; RV32I-NEXT: lbu s5, 13(a0) +; RV32I-NEXT: lbu s8, 14(a0) +; RV32I-NEXT: slli s3, s1, 8 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s9, t3, .LBB13_25 +; RV32I-NEXT: # %bb.24: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: j .LBB13_26 +; RV32I-NEXT: .LBB13_25: +; RV32I-NEXT: srl s1, t5, a4 +; RV32I-NEXT: .LBB13_26: +; RV32I-NEXT: or s2, s3, s2 +; RV32I-NEXT: lbu ra, 8(a0) +; RV32I-NEXT: lbu s3, 12(a0) +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: or s8, a3, s8 +; RV32I-NEXT: bgeu a4, t6, .LBB13_28 +; RV32I-NEXT: # %bb.27: +; RV32I-NEXT: or s0, a7, t2 +; RV32I-NEXT: or s1, a5, t4 +; RV32I-NEXT: .LBB13_28: +; RV32I-NEXT: lbu a3, 3(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: or a5, s4, ra 
+; RV32I-NEXT: slli t4, s2, 16 +; RV32I-NEXT: or s2, s5, s3 +; RV32I-NEXT: slli s3, s8, 16 +; RV32I-NEXT: mv s4, t0 +; RV32I-NEXT: mv a7, a6 +; RV32I-NEXT: beqz a4, .LBB13_30 +; RV32I-NEXT: # %bb.29: +; RV32I-NEXT: mv s4, s0 +; RV32I-NEXT: mv a7, s1 +; RV32I-NEXT: .LBB13_30: +; RV32I-NEXT: slli s5, a3, 8 +; RV32I-NEXT: lbu ra, 1(a0) +; RV32I-NEXT: lbu a3, 2(a0) +; RV32I-NEXT: lbu s1, 5(a0) +; RV32I-NEXT: lbu s0, 6(a0) +; RV32I-NEXT: slli s8, t2, 8 +; RV32I-NEXT: or t4, t4, a5 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: bltu a4, t6, .LBB13_32 +; RV32I-NEXT: # %bb.31: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: .LBB13_32: +; RV32I-NEXT: slli s3, ra, 8 +; RV32I-NEXT: or a5, s5, a3 +; RV32I-NEXT: lbu s5, 0(a0) +; RV32I-NEXT: lbu a0, 4(a0) +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: or a3, s8, s0 +; RV32I-NEXT: srl s2, t4, a4 +; RV32I-NEXT: sll ra, t2, s6 +; RV32I-NEXT: bltu a4, t3, .LBB13_34 +; RV32I-NEXT: # %bb.33: +; RV32I-NEXT: srl s0, t2, a4 +; RV32I-NEXT: j .LBB13_35 +; RV32I-NEXT: .LBB13_34: +; RV32I-NEXT: or s0, s2, ra +; RV32I-NEXT: .LBB13_35: +; RV32I-NEXT: or s3, s3, s5 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: or a0, s1, a0 +; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: mv s5, t4 +; RV32I-NEXT: beqz a4, .LBB13_37 +; RV32I-NEXT: # %bb.36: +; RV32I-NEXT: mv s5, s0 +; RV32I-NEXT: .LBB13_37: +; RV32I-NEXT: or s0, a5, s3 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: bltu a4, t3, .LBB13_39 +; RV32I-NEXT: # %bb.38: +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: srl a3, a0, a4 +; RV32I-NEXT: mv a5, s0 +; RV32I-NEXT: bnez a4, .LBB13_40 +; RV32I-NEXT: j .LBB13_41 +; RV32I-NEXT: .LBB13_39: +; RV32I-NEXT: srl s8, t2, a4 +; RV32I-NEXT: srl a3, s0, a4 +; RV32I-NEXT: sll a5, a0, s6 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: mv a5, s0 +; RV32I-NEXT: beqz a4, .LBB13_41 +; RV32I-NEXT: .LBB13_40: +; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: .LBB13_41: +; RV32I-NEXT: bltu a4, t3, .LBB13_44 +; RV32I-NEXT: # %bb.42: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: bgeu s7, t3, .LBB13_45 +; RV32I-NEXT: .LBB13_43: +; RV32I-NEXT: sll s3, t4, s6 +; RV32I-NEXT: srl a3, t4, s10 +; RV32I-NEXT: or a3, a3, ra +; RV32I-NEXT: mv s10, t2 +; RV32I-NEXT: bnez s7, .LBB13_46 +; RV32I-NEXT: j .LBB13_47 +; RV32I-NEXT: .LBB13_44: +; RV32I-NEXT: srl s1, a0, a4 +; RV32I-NEXT: bltu s7, t3, .LBB13_43 +; RV32I-NEXT: .LBB13_45: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t4, s7 +; RV32I-NEXT: mv s10, t2 +; RV32I-NEXT: beqz s7, .LBB13_47 +; RV32I-NEXT: .LBB13_46: +; RV32I-NEXT: mv s10, a3 +; RV32I-NEXT: .LBB13_47: +; RV32I-NEXT: bltu s9, t3, .LBB13_49 +; RV32I-NEXT: # %bb.48: +; RV32I-NEXT: srl a3, t2, s9 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: bnez s9, .LBB13_50 +; RV32I-NEXT: j .LBB13_51 +; RV32I-NEXT: .LBB13_49: +; RV32I-NEXT: sll a3, t2, s11 +; RV32I-NEXT: or a3, s2, a3 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: beqz s9, .LBB13_51 +; RV32I-NEXT: .LBB13_50: +; RV32I-NEXT: mv s2, a3 +; RV32I-NEXT: .LBB13_51: +; RV32I-NEXT: bltu s9, t3, .LBB13_53 +; RV32I-NEXT: # %bb.52: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: bltu a4, t6, .LBB13_54 +; RV32I-NEXT: j .LBB13_55 +; RV32I-NEXT: .LBB13_53: +; RV32I-NEXT: srl s7, t2, a4 +; RV32I-NEXT: bgeu a4, t6, .LBB13_55 +; RV32I-NEXT: .LBB13_54: +; RV32I-NEXT: or s2, a5, s3 +; RV32I-NEXT: or s7, s1, s10 +; RV32I-NEXT: .LBB13_55: +; RV32I-NEXT: li a3, 128 +; RV32I-NEXT: mv a5, s0 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: beqz a4, .LBB13_57 +; RV32I-NEXT: # %bb.56: +; RV32I-NEXT: mv a5, s2 +; RV32I-NEXT: mv s1, s7 +; RV32I-NEXT: .LBB13_57: +; RV32I-NEXT: sw a5, 12(sp) # 
4-byte Folded Spill +; RV32I-NEXT: sub s2, a3, a4 +; RV32I-NEXT: bltu a4, t6, .LBB13_59 +; RV32I-NEXT: # %bb.58: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: .LBB13_59: +; RV32I-NEXT: neg s3, s2 +; RV32I-NEXT: srl a5, t0, s3 +; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t3, .LBB13_61 +; RV32I-NEXT: # %bb.60: +; RV32I-NEXT: li s10, 0 +; RV32I-NEXT: sll a3, t0, s2 +; RV32I-NEXT: j .LBB13_62 +; RV32I-NEXT: .LBB13_61: +; RV32I-NEXT: sll s10, t0, s6 +; RV32I-NEXT: lw a3, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: .LBB13_62: +; RV32I-NEXT: sub s1, t6, s2 +; RV32I-NEXT: mv s8, a6 +; RV32I-NEXT: beqz s2, .LBB13_64 +; RV32I-NEXT: # %bb.63: +; RV32I-NEXT: mv s8, a3 +; RV32I-NEXT: .LBB13_64: +; RV32I-NEXT: bltu s1, t3, .LBB13_66 +; RV32I-NEXT: # %bb.65: +; RV32I-NEXT: srl a3, a6, s1 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: bnez s1, .LBB13_67 +; RV32I-NEXT: j .LBB13_68 +; RV32I-NEXT: .LBB13_66: +; RV32I-NEXT: neg a3, s1 +; RV32I-NEXT: sll a3, a6, a3 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: beqz s1, .LBB13_68 +; RV32I-NEXT: .LBB13_67: +; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: .LBB13_68: +; RV32I-NEXT: bltu s1, t3, .LBB13_71 +; RV32I-NEXT: # %bb.69: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: bgeu s2, t3, .LBB13_72 +; RV32I-NEXT: .LBB13_70: +; RV32I-NEXT: sll s6, t1, s6 +; RV32I-NEXT: srl a3, t1, s3 +; RV32I-NEXT: lw s3, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: j .LBB13_73 +; RV32I-NEXT: .LBB13_71: +; RV32I-NEXT: srl s1, a6, s3 +; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t3, .LBB13_70 +; RV32I-NEXT: .LBB13_72: +; RV32I-NEXT: li s6, 0 +; RV32I-NEXT: sll a3, t1, s2 +; RV32I-NEXT: .LBB13_73: +; RV32I-NEXT: addi s9, s2, -64 +; RV32I-NEXT: mv s5, t5 +; RV32I-NEXT: beqz s2, .LBB13_75 +; RV32I-NEXT: # %bb.74: +; RV32I-NEXT: mv s5, a3 +; RV32I-NEXT: .LBB13_75: +; RV32I-NEXT: bltu s9, t3, .LBB13_77 +; RV32I-NEXT: # %bb.76: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t0, s9 +; RV32I-NEXT: mv s7, a6 +; RV32I-NEXT: bnez s9, .LBB13_78 +; RV32I-NEXT: j .LBB13_79 +; RV32I-NEXT: .LBB13_77: +; RV32I-NEXT: sll s3, t0, s2 +; RV32I-NEXT: neg a3, s9 +; RV32I-NEXT: srl a3, t0, a3 +; RV32I-NEXT: sll s7, a6, s2 +; RV32I-NEXT: or a3, a3, s7 +; RV32I-NEXT: mv s7, a6 +; RV32I-NEXT: beqz s9, .LBB13_79 +; RV32I-NEXT: .LBB13_78: +; RV32I-NEXT: mv s7, a3 +; RV32I-NEXT: .LBB13_79: +; RV32I-NEXT: bltu s2, t6, .LBB13_81 +; RV32I-NEXT: # %bb.80: +; RV32I-NEXT: sw zero, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: j .LBB13_82 +; RV32I-NEXT: .LBB13_81: +; RV32I-NEXT: sw s10, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: or s3, a5, s6 +; RV32I-NEXT: or s7, s1, s5 +; RV32I-NEXT: .LBB13_82: +; RV32I-NEXT: addi ra, a4, -128 +; RV32I-NEXT: mv s5, t1 +; RV32I-NEXT: mv s6, t5 +; RV32I-NEXT: beqz s2, .LBB13_84 +; RV32I-NEXT: # %bb.83: +; RV32I-NEXT: mv s5, s3 +; RV32I-NEXT: mv s6, s7 +; RV32I-NEXT: .LBB13_84: +; RV32I-NEXT: neg s9, ra +; RV32I-NEXT: sll s3, t5, s9 +; RV32I-NEXT: bltu ra, t3, .LBB13_86 +; RV32I-NEXT: # %bb.85: +; RV32I-NEXT: srl a3, t5, ra +; RV32I-NEXT: mv s1, t1 +; RV32I-NEXT: bnez ra, .LBB13_87 +; RV32I-NEXT: j .LBB13_88 +; RV32I-NEXT: .LBB13_86: +; RV32I-NEXT: lw a3, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: mv s1, t1 +; RV32I-NEXT: beqz ra, .LBB13_88 +; RV32I-NEXT: .LBB13_87: +; RV32I-NEXT: mv s1, a3 +; 
RV32I-NEXT: .LBB13_88: +; RV32I-NEXT: bltu ra, t3, .LBB13_90 +; RV32I-NEXT: # %bb.89: +; RV32I-NEXT: li s2, 0 +; RV32I-NEXT: srl a3, a6, ra +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: bnez ra, .LBB13_91 +; RV32I-NEXT: j .LBB13_92 +; RV32I-NEXT: .LBB13_90: +; RV32I-NEXT: srl s2, t5, a4 +; RV32I-NEXT: sll a3, a6, s9 +; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: beqz ra, .LBB13_92 +; RV32I-NEXT: .LBB13_91: +; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: .LBB13_92: +; RV32I-NEXT: sub s10, t6, ra +; RV32I-NEXT: bltu ra, t3, .LBB13_95 +; RV32I-NEXT: # %bb.93: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: bgeu s10, t3, .LBB13_96 +; RV32I-NEXT: .LBB13_94: +; RV32I-NEXT: sll s9, t1, s9 +; RV32I-NEXT: neg a3, s10 +; RV32I-NEXT: srl a3, t1, a3 +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: j .LBB13_97 +; RV32I-NEXT: .LBB13_95: +; RV32I-NEXT: srl s7, a6, a4 +; RV32I-NEXT: bltu s10, t3, .LBB13_94 +; RV32I-NEXT: .LBB13_96: +; RV32I-NEXT: li s9, 0 +; RV32I-NEXT: sll a3, t1, s10 +; RV32I-NEXT: .LBB13_97: +; RV32I-NEXT: addi s11, ra, -64 +; RV32I-NEXT: mv s3, t5 +; RV32I-NEXT: beqz s10, .LBB13_99 +; RV32I-NEXT: # %bb.98: +; RV32I-NEXT: mv s3, a3 +; RV32I-NEXT: .LBB13_99: +; RV32I-NEXT: bltu s11, t3, .LBB13_101 +; RV32I-NEXT: # %bb.100: +; RV32I-NEXT: srl a3, t5, s11 +; RV32I-NEXT: bnez s11, .LBB13_102 +; RV32I-NEXT: j .LBB13_103 +; RV32I-NEXT: .LBB13_101: +; RV32I-NEXT: srl a3, t1, ra +; RV32I-NEXT: neg s10, s11 +; RV32I-NEXT: sll s10, t5, s10 +; RV32I-NEXT: or a3, a3, s10 +; RV32I-NEXT: beqz s11, .LBB13_103 +; RV32I-NEXT: .LBB13_102: +; RV32I-NEXT: mv t1, a3 +; RV32I-NEXT: .LBB13_103: +; RV32I-NEXT: bltu s11, t3, .LBB13_105 +; RV32I-NEXT: # %bb.104: +; RV32I-NEXT: li t3, 0 +; RV32I-NEXT: bltu ra, t6, .LBB13_106 +; RV32I-NEXT: j .LBB13_107 +; RV32I-NEXT: .LBB13_105: +; RV32I-NEXT: srl t3, t5, ra +; RV32I-NEXT: bgeu ra, t6, .LBB13_107 +; RV32I-NEXT: .LBB13_106: +; RV32I-NEXT: or t1, a5, s9 +; RV32I-NEXT: or t3, s7, s3 +; RV32I-NEXT: .LBB13_107: +; RV32I-NEXT: li a5, 128 +; RV32I-NEXT: bnez ra, .LBB13_114 +; RV32I-NEXT: # %bb.108: +; RV32I-NEXT: bgeu ra, t6, .LBB13_115 +; RV32I-NEXT: .LBB13_109: +; RV32I-NEXT: bltu a4, a5, .LBB13_116 +; RV32I-NEXT: .LBB13_110: +; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: bnez a4, .LBB13_117 +; RV32I-NEXT: .LBB13_111: +; RV32I-NEXT: bltu a4, a5, .LBB13_113 +; RV32I-NEXT: .LBB13_112: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: .LBB13_113: +; RV32I-NEXT: srli a4, s0, 16 +; RV32I-NEXT: lui t1, 16 +; RV32I-NEXT: srli t0, s0, 24 +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: srli t5, a0, 24 +; RV32I-NEXT: srli a6, t4, 16 +; RV32I-NEXT: srli s2, t4, 24 +; RV32I-NEXT: srli t3, t2, 16 +; RV32I-NEXT: srli s3, t2, 24 +; RV32I-NEXT: srli s1, s4, 16 +; RV32I-NEXT: srli a3, s4, 24 +; RV32I-NEXT: srli t6, a7, 16 +; RV32I-NEXT: srli s6, a7, 24 +; RV32I-NEXT: srli s5, a1, 16 +; RV32I-NEXT: srli s7, a1, 24 +; RV32I-NEXT: srli s8, ra, 16 +; RV32I-NEXT: srli s9, ra, 24 +; RV32I-NEXT: addi t1, t1, -1 +; RV32I-NEXT: and s10, s0, t1 +; RV32I-NEXT: and s11, a0, t1 +; RV32I-NEXT: srli s10, s10, 8 +; RV32I-NEXT: sb s0, 0(a2) +; RV32I-NEXT: sb s10, 1(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: and a4, t4, t1 +; RV32I-NEXT: srli t0, s11, 8 +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: sb t0, 5(a2) +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb t5, 7(a2) +; RV32I-NEXT: and a0, t2, t1 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb t4, 
8(a2) +; RV32I-NEXT: sb a4, 9(a2) +; RV32I-NEXT: sb a6, 10(a2) +; RV32I-NEXT: sb s2, 11(a2) +; RV32I-NEXT: and a4, s4, t1 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb t2, 12(a2) +; RV32I-NEXT: sb a0, 13(a2) +; RV32I-NEXT: sb t3, 14(a2) +; RV32I-NEXT: sb s3, 15(a2) +; RV32I-NEXT: and a0, a7, t1 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb s4, 16(a2) +; RV32I-NEXT: sb a4, 17(a2) +; RV32I-NEXT: sb s1, 18(a2) +; RV32I-NEXT: sb a3, 19(a2) +; RV32I-NEXT: and a3, a1, t1 +; RV32I-NEXT: and a4, ra, t1 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a7, 20(a2) +; RV32I-NEXT: sb a0, 21(a2) +; RV32I-NEXT: sb t6, 22(a2) +; RV32I-NEXT: sb s6, 23(a2) +; RV32I-NEXT: sb a1, 24(a2) +; RV32I-NEXT: sb a3, 25(a2) +; RV32I-NEXT: sb s5, 26(a2) +; RV32I-NEXT: sb s7, 27(a2) +; RV32I-NEXT: sb ra, 28(a2) +; RV32I-NEXT: sb a4, 29(a2) +; RV32I-NEXT: sb s8, 30(a2) +; RV32I-NEXT: sb s9, 31(a2) +; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 96 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB13_114: +; RV32I-NEXT: mv t0, t1 +; RV32I-NEXT: mv a6, t3 +; RV32I-NEXT: bltu ra, t6, .LBB13_109 +; RV32I-NEXT: .LBB13_115: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: li s2, 0 +; RV32I-NEXT: bgeu a4, a5, .LBB13_110 +; RV32I-NEXT: .LBB13_116: +; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a6, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t0, a3, a6 +; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a6, a3, s8 +; RV32I-NEXT: lw a3, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s1, a3, s5 +; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s2, a3, s6 +; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: beqz a4, .LBB13_111 +; RV32I-NEXT: .LBB13_117: +; RV32I-NEXT: mv s0, t0 +; RV32I-NEXT: mv a0, a6 +; RV32I-NEXT: mv t4, s1 +; RV32I-NEXT: mv t2, s2 +; RV32I-NEXT: bgeu a4, a5, .LBB13_112 +; RV32I-NEXT: j .LBB13_113 + %src = load i256, ptr %src.ptr, align 1 + %wordOff = load i256, ptr %wordOff.ptr, align 1 + %bitOff = shl i256 %wordOff, 5 + %res = lshr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: lshr_32bytes_dwordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -96 +; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd 
s11, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a4, t4, t3 +; RV64I-NEXT: or a6, t6, t5 +; RV64I-NEXT: or t0, s1, s0 +; RV64I-NEXT: lbu t5, 24(a0) +; RV64I-NEXT: lbu t6, 25(a0) +; RV64I-NEXT: lbu s0, 26(a0) +; RV64I-NEXT: lbu s1, 27(a0) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or t4, s3, s2 +; RV64I-NEXT: or t2, s5, s4 +; RV64I-NEXT: or t3, s7, s6 +; RV64I-NEXT: lbu s2, 28(a0) +; RV64I-NEXT: lbu s3, 29(a0) +; RV64I-NEXT: lbu s4, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: lbu t6, 0(a1) +; RV64I-NEXT: lbu s1, 1(a1) +; RV64I-NEXT: lbu s7, 2(a1) +; RV64I-NEXT: lbu s8, 3(a1) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, a0, s4 +; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: lbu a0, 4(a1) +; RV64I-NEXT: lbu s1, 5(a1) +; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: or s7, s8, s7 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or s4, a1, s4 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a1, t1, a7 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or a0, t4, t0 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or t0, t3, t2 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: or t1, s6, s5 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: or t3, s0, t5 +; RV64I-NEXT: slli s3, s3, 16 +; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: or t6, s7, t6 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: li a7, 64 +; RV64I-NEXT: slli t4, a5, 16 +; RV64I-NEXT: slli t2, a6, 16 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli t5, t5, 32 +; RV64I-NEXT: slli s0, s0, 32 +; RV64I-NEXT: or a6, t1, t0 +; RV64I-NEXT: or t0, t5, t3 +; RV64I-NEXT: or a5, s0, t6 +; RV64I-NEXT: slli a5, a5, 6 +; RV64I-NEXT: sub t1, a5, a7 +; RV64I-NEXT: negw t5, a5 +; RV64I-NEXT: sll t3, t0, t5 +; RV64I-NEXT: bltu a5, a7, .LBB14_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: srl t6, t0, t1 +; RV64I-NEXT: j .LBB14_3 +; RV64I-NEXT: .LBB14_2: +; RV64I-NEXT: srl t6, a6, a5 +; RV64I-NEXT: or t6, t6, t3 +; RV64I-NEXT: .LBB14_3: +; RV64I-NEXT: or a3, t4, a3 +; RV64I-NEXT: slli t4, a1, 32 +; 
RV64I-NEXT: or t2, t2, a4 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: mv a1, a6 +; RV64I-NEXT: beqz a5, .LBB14_5 +; RV64I-NEXT: # %bb.4: +; RV64I-NEXT: mv a1, t6 +; RV64I-NEXT: .LBB14_5: +; RV64I-NEXT: or a4, t4, a3 +; RV64I-NEXT: or a3, a0, t2 +; RV64I-NEXT: bltu a5, a7, .LBB14_7 +; RV64I-NEXT: # %bb.6: +; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: srl t4, a3, t1 +; RV64I-NEXT: j .LBB14_8 +; RV64I-NEXT: .LBB14_7: +; RV64I-NEXT: srl a0, t0, a5 +; RV64I-NEXT: srl t1, a4, a5 +; RV64I-NEXT: sll t2, a3, t5 +; RV64I-NEXT: or t4, t1, t2 +; RV64I-NEXT: .LBB14_8: +; RV64I-NEXT: li t1, 128 +; RV64I-NEXT: mv t2, a4 +; RV64I-NEXT: beqz a5, .LBB14_10 +; RV64I-NEXT: # %bb.9: +; RV64I-NEXT: mv t2, t4 +; RV64I-NEXT: .LBB14_10: +; RV64I-NEXT: sub t6, t1, a5 +; RV64I-NEXT: bltu a5, a7, .LBB14_13 +; RV64I-NEXT: # %bb.11: +; RV64I-NEXT: li t4, 0 +; RV64I-NEXT: bgeu t6, a7, .LBB14_14 +; RV64I-NEXT: .LBB14_12: +; RV64I-NEXT: sll t5, a6, t5 +; RV64I-NEXT: negw s0, t6 +; RV64I-NEXT: srl s0, a6, s0 +; RV64I-NEXT: or s1, s0, t3 +; RV64I-NEXT: j .LBB14_15 +; RV64I-NEXT: .LBB14_13: +; RV64I-NEXT: srl t4, a3, a5 +; RV64I-NEXT: bltu t6, a7, .LBB14_12 +; RV64I-NEXT: .LBB14_14: +; RV64I-NEXT: li t5, 0 +; RV64I-NEXT: sub t3, t6, a7 +; RV64I-NEXT: sll s1, a6, t3 +; RV64I-NEXT: .LBB14_15: +; RV64I-NEXT: sub s0, a5, t1 +; RV64I-NEXT: mv t3, t0 +; RV64I-NEXT: beqz t6, .LBB14_17 +; RV64I-NEXT: # %bb.16: +; RV64I-NEXT: mv t3, s1 +; RV64I-NEXT: .LBB14_17: +; RV64I-NEXT: bltu s0, a7, .LBB14_19 +; RV64I-NEXT: # %bb.18: +; RV64I-NEXT: sub t6, s0, a7 +; RV64I-NEXT: srl t6, t0, t6 +; RV64I-NEXT: bnez s0, .LBB14_20 +; RV64I-NEXT: j .LBB14_21 +; RV64I-NEXT: .LBB14_19: +; RV64I-NEXT: srl t6, a6, s0 +; RV64I-NEXT: negw s1, s0 +; RV64I-NEXT: sll s1, t0, s1 +; RV64I-NEXT: or t6, t6, s1 +; RV64I-NEXT: beqz s0, .LBB14_21 +; RV64I-NEXT: .LBB14_20: +; RV64I-NEXT: mv a6, t6 +; RV64I-NEXT: .LBB14_21: +; RV64I-NEXT: bltu s0, a7, .LBB14_23 +; RV64I-NEXT: # %bb.22: +; RV64I-NEXT: li a7, 0 +; RV64I-NEXT: bltu a5, t1, .LBB14_24 +; RV64I-NEXT: j .LBB14_25 +; RV64I-NEXT: .LBB14_23: +; RV64I-NEXT: srl a7, t0, s0 +; RV64I-NEXT: bgeu a5, t1, .LBB14_25 +; RV64I-NEXT: .LBB14_24: +; RV64I-NEXT: or a6, t2, t5 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: .LBB14_25: +; RV64I-NEXT: bnez a5, .LBB14_29 +; RV64I-NEXT: # %bb.26: +; RV64I-NEXT: bltu a5, t1, .LBB14_28 +; RV64I-NEXT: .LBB14_27: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: .LBB14_28: +; RV64I-NEXT: srli a5, a4, 32 +; RV64I-NEXT: srliw a6, a4, 16 +; RV64I-NEXT: lui t2, 16 +; RV64I-NEXT: srliw t1, a4, 24 +; RV64I-NEXT: srli t0, a4, 48 +; RV64I-NEXT: srli t5, a4, 56 +; RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srliw t4, a3, 16 +; RV64I-NEXT: srliw s0, a3, 24 +; RV64I-NEXT: srli t6, a3, 48 +; RV64I-NEXT: srli s3, a3, 56 +; RV64I-NEXT: srli t3, a1, 32 +; RV64I-NEXT: srliw s2, a1, 16 +; RV64I-NEXT: srliw s6, a1, 24 +; RV64I-NEXT: srli s4, a1, 48 +; RV64I-NEXT: srli s7, a1, 56 +; RV64I-NEXT: srli s1, a0, 32 +; RV64I-NEXT: srliw s5, a0, 16 +; RV64I-NEXT: srliw s8, a0, 24 +; RV64I-NEXT: srli s9, a0, 48 +; RV64I-NEXT: srli s10, a0, 56 +; RV64I-NEXT: addi t2, t2, -1 +; RV64I-NEXT: and s11, a4, t2 +; RV64I-NEXT: srli s11, s11, 8 +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb s11, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb t1, 3(a2) +; RV64I-NEXT: and a4, a5, t2 +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb a4, 5(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: and a4, a3, t2 +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a3, 8(a2) +; 
RV64I-NEXT: sb a4, 9(a2) +; RV64I-NEXT: sb t4, 10(a2) +; RV64I-NEXT: sb s0, 11(a2) +; RV64I-NEXT: and a3, a7, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: sb t6, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) +; RV64I-NEXT: and a3, a1, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a1, 16(a2) +; RV64I-NEXT: sb a3, 17(a2) +; RV64I-NEXT: sb s2, 18(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: and a1, t3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb t3, 20(a2) +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: sb s4, 22(a2) +; RV64I-NEXT: sb s7, 23(a2) +; RV64I-NEXT: and a1, a0, t2 +; RV64I-NEXT: and a3, s1, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 24(a2) +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: sb s5, 26(a2) +; RV64I-NEXT: sb s8, 27(a2) +; RV64I-NEXT: sb s1, 28(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb s9, 30(a2) +; RV64I-NEXT: sb s10, 31(a2) +; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 96 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB14_29: +; RV64I-NEXT: mv a4, a6 +; RV64I-NEXT: mv a3, a7 +; RV64I-NEXT: bgeu a5, t1, .LBB14_27 +; RV64I-NEXT: j .LBB14_28 +; +; RV32I-LABEL: lshr_32bytes_dwordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -96 +; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu a4, 17(a0) +; RV32I-NEXT: lbu a5, 18(a0) +; RV32I-NEXT: lbu a6, 19(a0) +; RV32I-NEXT: lbu a7, 20(a0) +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 22(a0) +; RV32I-NEXT: lbu t2, 23(a0) +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu t4, 25(a0) +; RV32I-NEXT: lbu t5, 26(a0) +; RV32I-NEXT: lbu t6, 27(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a6, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a4, t2, t1 +; RV32I-NEXT: lbu a7, 28(a0) +; RV32I-NEXT: lbu t0, 29(a0) +; RV32I-NEXT: lbu t1, 30(a0) +; RV32I-NEXT: lbu t2, 31(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t4, t4, t3 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: lbu t3, 1(a1) +; RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; 
RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t0, t3, t0 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: li t3, 32 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a1, a4, 16 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli a4, t1, 16 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: or t1, t5, t4 +; RV32I-NEXT: or t5, a4, a7 +; RV32I-NEXT: or a4, t2, t0 +; RV32I-NEXT: slli a4, a4, 6 +; RV32I-NEXT: srl s0, t1, a4 +; RV32I-NEXT: neg s6, a4 +; RV32I-NEXT: sll t4, t5, s6 +; RV32I-NEXT: bltu a4, t3, .LBB14_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a7, t5, a4 +; RV32I-NEXT: j .LBB14_3 +; RV32I-NEXT: .LBB14_2: +; RV32I-NEXT: or a7, s0, t4 +; RV32I-NEXT: .LBB14_3: +; RV32I-NEXT: or t0, a6, a3 +; RV32I-NEXT: or a6, a1, a5 +; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: beqz a4, .LBB14_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a1, a7 +; RV32I-NEXT: .LBB14_5: +; RV32I-NEXT: srl a3, t0, a4 +; RV32I-NEXT: sll a5, a6, s6 +; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu a4, t3, .LBB14_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: srl a3, a6, a4 +; RV32I-NEXT: j .LBB14_8 +; RV32I-NEXT: .LBB14_7: +; RV32I-NEXT: srl ra, t5, a4 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: .LBB14_8: +; RV32I-NEXT: li t6, 64 +; RV32I-NEXT: mv a7, t0 +; RV32I-NEXT: beqz a4, .LBB14_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a7, a3 +; RV32I-NEXT: .LBB14_10: +; RV32I-NEXT: sw a5, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sub s7, t6, a4 +; RV32I-NEXT: bltu a4, t3, .LBB14_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li a5, 0 +; RV32I-NEXT: j .LBB14_13 +; RV32I-NEXT: .LBB14_12: +; RV32I-NEXT: srl a5, a6, a4 +; RV32I-NEXT: .LBB14_13: +; RV32I-NEXT: neg s10, s7 +; RV32I-NEXT: sw t4, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s7, t3, .LBB14_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: li t2, 0 +; RV32I-NEXT: sll a3, t1, s7 +; RV32I-NEXT: j .LBB14_16 +; RV32I-NEXT: .LBB14_15: +; RV32I-NEXT: sll t2, t1, s6 +; RV32I-NEXT: srl a3, t1, s10 +; RV32I-NEXT: or a3, a3, t4 +; RV32I-NEXT: .LBB14_16: +; RV32I-NEXT: addi s9, a4, -64 +; RV32I-NEXT: mv t4, t5 +; RV32I-NEXT: beqz s7, .LBB14_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: mv t4, a3 +; RV32I-NEXT: .LBB14_18: +; RV32I-NEXT: neg s11, s9 +; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s9, t3, .LBB14_20 +; RV32I-NEXT: # %bb.19: +; RV32I-NEXT: srl s2, t5, s9 +; RV32I-NEXT: j .LBB14_21 +; RV32I-NEXT: .LBB14_20: +; RV32I-NEXT: sll a3, t5, s11 +; RV32I-NEXT: or s2, s0, a3 +; RV32I-NEXT: .LBB14_21: +; RV32I-NEXT: lbu s1, 11(a0) +; RV32I-NEXT: lbu a3, 15(a0) +; RV32I-NEXT: mv s0, t1 +; RV32I-NEXT: beqz s9, .LBB14_23 +; RV32I-NEXT: # %bb.22: +; RV32I-NEXT: mv s0, s2 +; RV32I-NEXT: .LBB14_23: +; RV32I-NEXT: lbu s4, 9(a0) +; RV32I-NEXT: lbu s2, 10(a0) +; RV32I-NEXT: lbu s5, 13(a0) +; RV32I-NEXT: lbu s8, 14(a0) +; RV32I-NEXT: slli s3, s1, 8 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s9, t3, .LBB14_25 +; RV32I-NEXT: # %bb.24: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: j .LBB14_26 +; RV32I-NEXT: .LBB14_25: +; RV32I-NEXT: srl s1, t5, a4 +; RV32I-NEXT: .LBB14_26: +; RV32I-NEXT: or s2, s3, s2 +; RV32I-NEXT: lbu ra, 8(a0) +; RV32I-NEXT: lbu s3, 12(a0) +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: or s8, a3, s8 +; RV32I-NEXT: bgeu a4, t6, .LBB14_28 +; RV32I-NEXT: # %bb.27: +; RV32I-NEXT: or s0, a7, t2 +; RV32I-NEXT: or s1, a5, t4 +; RV32I-NEXT: .LBB14_28: +; RV32I-NEXT: lbu a3, 3(a0) 
+; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: or a5, s4, ra +; RV32I-NEXT: slli t4, s2, 16 +; RV32I-NEXT: or s2, s5, s3 +; RV32I-NEXT: slli s3, s8, 16 +; RV32I-NEXT: mv s4, t0 +; RV32I-NEXT: mv a7, a6 +; RV32I-NEXT: beqz a4, .LBB14_30 +; RV32I-NEXT: # %bb.29: +; RV32I-NEXT: mv s4, s0 +; RV32I-NEXT: mv a7, s1 +; RV32I-NEXT: .LBB14_30: +; RV32I-NEXT: slli s5, a3, 8 +; RV32I-NEXT: lbu ra, 1(a0) +; RV32I-NEXT: lbu a3, 2(a0) +; RV32I-NEXT: lbu s1, 5(a0) +; RV32I-NEXT: lbu s0, 6(a0) +; RV32I-NEXT: slli s8, t2, 8 +; RV32I-NEXT: or t4, t4, a5 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: bltu a4, t6, .LBB14_32 +; RV32I-NEXT: # %bb.31: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: .LBB14_32: +; RV32I-NEXT: slli s3, ra, 8 +; RV32I-NEXT: or a5, s5, a3 +; RV32I-NEXT: lbu s5, 0(a0) +; RV32I-NEXT: lbu a0, 4(a0) +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: or a3, s8, s0 +; RV32I-NEXT: srl s2, t4, a4 +; RV32I-NEXT: sll ra, t2, s6 +; RV32I-NEXT: bltu a4, t3, .LBB14_34 +; RV32I-NEXT: # %bb.33: +; RV32I-NEXT: srl s0, t2, a4 +; RV32I-NEXT: j .LBB14_35 +; RV32I-NEXT: .LBB14_34: +; RV32I-NEXT: or s0, s2, ra +; RV32I-NEXT: .LBB14_35: +; RV32I-NEXT: or s3, s3, s5 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: or a0, s1, a0 +; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: mv s5, t4 +; RV32I-NEXT: beqz a4, .LBB14_37 +; RV32I-NEXT: # %bb.36: +; RV32I-NEXT: mv s5, s0 +; RV32I-NEXT: .LBB14_37: +; RV32I-NEXT: or s0, a5, s3 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: bltu a4, t3, .LBB14_39 +; RV32I-NEXT: # %bb.38: +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: srl a3, a0, a4 +; RV32I-NEXT: mv a5, s0 +; RV32I-NEXT: bnez a4, .LBB14_40 +; RV32I-NEXT: j .LBB14_41 +; RV32I-NEXT: .LBB14_39: +; RV32I-NEXT: srl s8, t2, a4 +; RV32I-NEXT: srl a3, s0, a4 +; RV32I-NEXT: sll a5, a0, s6 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: mv a5, s0 +; RV32I-NEXT: beqz a4, .LBB14_41 +; RV32I-NEXT: .LBB14_40: +; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: .LBB14_41: +; RV32I-NEXT: bltu a4, t3, .LBB14_44 +; RV32I-NEXT: # %bb.42: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: bgeu s7, t3, .LBB14_45 +; RV32I-NEXT: .LBB14_43: +; RV32I-NEXT: sll s3, t4, s6 +; RV32I-NEXT: srl a3, t4, s10 +; RV32I-NEXT: or a3, a3, ra +; RV32I-NEXT: mv s10, t2 +; RV32I-NEXT: bnez s7, .LBB14_46 +; RV32I-NEXT: j .LBB14_47 +; RV32I-NEXT: .LBB14_44: +; RV32I-NEXT: srl s1, a0, a4 +; RV32I-NEXT: bltu s7, t3, .LBB14_43 +; RV32I-NEXT: .LBB14_45: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t4, s7 +; RV32I-NEXT: mv s10, t2 +; RV32I-NEXT: beqz s7, .LBB14_47 +; RV32I-NEXT: .LBB14_46: +; RV32I-NEXT: mv s10, a3 +; RV32I-NEXT: .LBB14_47: +; RV32I-NEXT: bltu s9, t3, .LBB14_49 +; RV32I-NEXT: # %bb.48: +; RV32I-NEXT: srl a3, t2, s9 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: bnez s9, .LBB14_50 +; RV32I-NEXT: j .LBB14_51 +; RV32I-NEXT: .LBB14_49: +; RV32I-NEXT: sll a3, t2, s11 +; RV32I-NEXT: or a3, s2, a3 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: beqz s9, .LBB14_51 +; RV32I-NEXT: .LBB14_50: +; RV32I-NEXT: mv s2, a3 +; RV32I-NEXT: .LBB14_51: +; RV32I-NEXT: bltu s9, t3, .LBB14_53 +; RV32I-NEXT: # %bb.52: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: bltu a4, t6, .LBB14_54 +; RV32I-NEXT: j .LBB14_55 +; RV32I-NEXT: .LBB14_53: +; RV32I-NEXT: srl s7, t2, a4 +; RV32I-NEXT: bgeu a4, t6, .LBB14_55 +; RV32I-NEXT: .LBB14_54: +; RV32I-NEXT: or s2, a5, s3 +; RV32I-NEXT: or s7, s1, s10 +; RV32I-NEXT: .LBB14_55: +; RV32I-NEXT: li a3, 128 +; RV32I-NEXT: mv a5, s0 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: beqz a4, .LBB14_57 +; RV32I-NEXT: # %bb.56: +; RV32I-NEXT: mv a5, s2 +; RV32I-NEXT: mv s1, s7 +; 
RV32I-NEXT: .LBB14_57: +; RV32I-NEXT: sw a5, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sub s2, a3, a4 +; RV32I-NEXT: bltu a4, t6, .LBB14_59 +; RV32I-NEXT: # %bb.58: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: .LBB14_59: +; RV32I-NEXT: neg s3, s2 +; RV32I-NEXT: srl a5, t0, s3 +; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t3, .LBB14_61 +; RV32I-NEXT: # %bb.60: +; RV32I-NEXT: li s10, 0 +; RV32I-NEXT: sll a3, t0, s2 +; RV32I-NEXT: j .LBB14_62 +; RV32I-NEXT: .LBB14_61: +; RV32I-NEXT: sll s10, t0, s6 +; RV32I-NEXT: lw a3, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: .LBB14_62: +; RV32I-NEXT: sub s1, t6, s2 +; RV32I-NEXT: mv s8, a6 +; RV32I-NEXT: beqz s2, .LBB14_64 +; RV32I-NEXT: # %bb.63: +; RV32I-NEXT: mv s8, a3 +; RV32I-NEXT: .LBB14_64: +; RV32I-NEXT: bltu s1, t3, .LBB14_66 +; RV32I-NEXT: # %bb.65: +; RV32I-NEXT: srl a3, a6, s1 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: bnez s1, .LBB14_67 +; RV32I-NEXT: j .LBB14_68 +; RV32I-NEXT: .LBB14_66: +; RV32I-NEXT: neg a3, s1 +; RV32I-NEXT: sll a3, a6, a3 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: beqz s1, .LBB14_68 +; RV32I-NEXT: .LBB14_67: +; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: .LBB14_68: +; RV32I-NEXT: bltu s1, t3, .LBB14_71 +; RV32I-NEXT: # %bb.69: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: bgeu s2, t3, .LBB14_72 +; RV32I-NEXT: .LBB14_70: +; RV32I-NEXT: sll s6, t1, s6 +; RV32I-NEXT: srl a3, t1, s3 +; RV32I-NEXT: lw s3, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: j .LBB14_73 +; RV32I-NEXT: .LBB14_71: +; RV32I-NEXT: srl s1, a6, s3 +; RV32I-NEXT: sw s5, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t3, .LBB14_70 +; RV32I-NEXT: .LBB14_72: +; RV32I-NEXT: li s6, 0 +; RV32I-NEXT: sll a3, t1, s2 +; RV32I-NEXT: .LBB14_73: +; RV32I-NEXT: addi s9, s2, -64 +; RV32I-NEXT: mv s5, t5 +; RV32I-NEXT: beqz s2, .LBB14_75 +; RV32I-NEXT: # %bb.74: +; RV32I-NEXT: mv s5, a3 +; RV32I-NEXT: .LBB14_75: +; RV32I-NEXT: bltu s9, t3, .LBB14_77 +; RV32I-NEXT: # %bb.76: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t0, s9 +; RV32I-NEXT: mv s7, a6 +; RV32I-NEXT: bnez s9, .LBB14_78 +; RV32I-NEXT: j .LBB14_79 +; RV32I-NEXT: .LBB14_77: +; RV32I-NEXT: sll s3, t0, s2 +; RV32I-NEXT: neg a3, s9 +; RV32I-NEXT: srl a3, t0, a3 +; RV32I-NEXT: sll s7, a6, s2 +; RV32I-NEXT: or a3, a3, s7 +; RV32I-NEXT: mv s7, a6 +; RV32I-NEXT: beqz s9, .LBB14_79 +; RV32I-NEXT: .LBB14_78: +; RV32I-NEXT: mv s7, a3 +; RV32I-NEXT: .LBB14_79: +; RV32I-NEXT: bltu s2, t6, .LBB14_81 +; RV32I-NEXT: # %bb.80: +; RV32I-NEXT: sw zero, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: j .LBB14_82 +; RV32I-NEXT: .LBB14_81: +; RV32I-NEXT: sw s10, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: or s3, a5, s6 +; RV32I-NEXT: or s7, s1, s5 +; RV32I-NEXT: .LBB14_82: +; RV32I-NEXT: addi ra, a4, -128 +; RV32I-NEXT: mv s5, t1 +; RV32I-NEXT: mv s6, t5 +; RV32I-NEXT: beqz s2, .LBB14_84 +; RV32I-NEXT: # %bb.83: +; RV32I-NEXT: mv s5, s3 +; RV32I-NEXT: mv s6, s7 +; RV32I-NEXT: .LBB14_84: +; RV32I-NEXT: neg s9, ra +; RV32I-NEXT: sll s3, t5, s9 +; RV32I-NEXT: bltu ra, t3, .LBB14_86 +; RV32I-NEXT: # %bb.85: +; RV32I-NEXT: srl a3, t5, ra +; RV32I-NEXT: mv s1, t1 +; RV32I-NEXT: bnez ra, .LBB14_87 +; RV32I-NEXT: j .LBB14_88 +; RV32I-NEXT: .LBB14_86: +; RV32I-NEXT: lw a3, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: mv s1, t1 +; RV32I-NEXT: beqz ra, .LBB14_88 +; 
RV32I-NEXT: .LBB14_87: +; RV32I-NEXT: mv s1, a3 +; RV32I-NEXT: .LBB14_88: +; RV32I-NEXT: bltu ra, t3, .LBB14_90 +; RV32I-NEXT: # %bb.89: +; RV32I-NEXT: li s2, 0 +; RV32I-NEXT: srl a3, a6, ra +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: bnez ra, .LBB14_91 +; RV32I-NEXT: j .LBB14_92 +; RV32I-NEXT: .LBB14_90: +; RV32I-NEXT: srl s2, t5, a4 +; RV32I-NEXT: sll a3, a6, s9 +; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: beqz ra, .LBB14_92 +; RV32I-NEXT: .LBB14_91: +; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: .LBB14_92: +; RV32I-NEXT: sub s10, t6, ra +; RV32I-NEXT: bltu ra, t3, .LBB14_95 +; RV32I-NEXT: # %bb.93: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: bgeu s10, t3, .LBB14_96 +; RV32I-NEXT: .LBB14_94: +; RV32I-NEXT: sll s9, t1, s9 +; RV32I-NEXT: neg a3, s10 +; RV32I-NEXT: srl a3, t1, a3 +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: j .LBB14_97 +; RV32I-NEXT: .LBB14_95: +; RV32I-NEXT: srl s7, a6, a4 +; RV32I-NEXT: bltu s10, t3, .LBB14_94 +; RV32I-NEXT: .LBB14_96: +; RV32I-NEXT: li s9, 0 +; RV32I-NEXT: sll a3, t1, s10 +; RV32I-NEXT: .LBB14_97: +; RV32I-NEXT: addi s11, ra, -64 +; RV32I-NEXT: mv s3, t5 +; RV32I-NEXT: beqz s10, .LBB14_99 +; RV32I-NEXT: # %bb.98: +; RV32I-NEXT: mv s3, a3 +; RV32I-NEXT: .LBB14_99: +; RV32I-NEXT: bltu s11, t3, .LBB14_101 +; RV32I-NEXT: # %bb.100: +; RV32I-NEXT: srl a3, t5, s11 +; RV32I-NEXT: bnez s11, .LBB14_102 +; RV32I-NEXT: j .LBB14_103 +; RV32I-NEXT: .LBB14_101: +; RV32I-NEXT: srl a3, t1, ra +; RV32I-NEXT: neg s10, s11 +; RV32I-NEXT: sll s10, t5, s10 +; RV32I-NEXT: or a3, a3, s10 +; RV32I-NEXT: beqz s11, .LBB14_103 +; RV32I-NEXT: .LBB14_102: +; RV32I-NEXT: mv t1, a3 +; RV32I-NEXT: .LBB14_103: +; RV32I-NEXT: bltu s11, t3, .LBB14_105 +; RV32I-NEXT: # %bb.104: +; RV32I-NEXT: li t3, 0 +; RV32I-NEXT: bltu ra, t6, .LBB14_106 +; RV32I-NEXT: j .LBB14_107 +; RV32I-NEXT: .LBB14_105: +; RV32I-NEXT: srl t3, t5, ra +; RV32I-NEXT: bgeu ra, t6, .LBB14_107 +; RV32I-NEXT: .LBB14_106: +; RV32I-NEXT: or t1, a5, s9 +; RV32I-NEXT: or t3, s7, s3 +; RV32I-NEXT: .LBB14_107: +; RV32I-NEXT: li a5, 128 +; RV32I-NEXT: bnez ra, .LBB14_114 +; RV32I-NEXT: # %bb.108: +; RV32I-NEXT: bgeu ra, t6, .LBB14_115 +; RV32I-NEXT: .LBB14_109: +; RV32I-NEXT: bltu a4, a5, .LBB14_116 +; RV32I-NEXT: .LBB14_110: +; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: bnez a4, .LBB14_117 +; RV32I-NEXT: .LBB14_111: +; RV32I-NEXT: bltu a4, a5, .LBB14_113 +; RV32I-NEXT: .LBB14_112: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: .LBB14_113: +; RV32I-NEXT: srli a4, s0, 16 +; RV32I-NEXT: lui t1, 16 +; RV32I-NEXT: srli t0, s0, 24 +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: srli t5, a0, 24 +; RV32I-NEXT: srli a6, t4, 16 +; RV32I-NEXT: srli s2, t4, 24 +; RV32I-NEXT: srli t3, t2, 16 +; RV32I-NEXT: srli s3, t2, 24 +; RV32I-NEXT: srli s1, s4, 16 +; RV32I-NEXT: srli a3, s4, 24 +; RV32I-NEXT: srli t6, a7, 16 +; RV32I-NEXT: srli s6, a7, 24 +; RV32I-NEXT: srli s5, a1, 16 +; RV32I-NEXT: srli s7, a1, 24 +; RV32I-NEXT: srli s8, ra, 16 +; RV32I-NEXT: srli s9, ra, 24 +; RV32I-NEXT: addi t1, t1, -1 +; RV32I-NEXT: and s10, s0, t1 +; RV32I-NEXT: and s11, a0, t1 +; RV32I-NEXT: srli s10, s10, 8 +; RV32I-NEXT: sb s0, 0(a2) +; RV32I-NEXT: sb s10, 1(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: and a4, t4, t1 +; RV32I-NEXT: srli t0, s11, 8 +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: sb t0, 5(a2) +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb t5, 7(a2) +; RV32I-NEXT: and a0, t2, t1 +; 
RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, s4, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t2, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, a7, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s4, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a3, a1, t1
+; RV32I-NEXT: and a4, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb a1, 24(a2)
+; RV32I-NEXT: sb a3, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB14_114:
+; RV32I-NEXT: mv t0, t1
+; RV32I-NEXT: mv a6, t3
+; RV32I-NEXT: bltu ra, t6, .LBB14_109
+; RV32I-NEXT: .LBB14_115:
+; RV32I-NEXT: li s1, 0
+; RV32I-NEXT: li s2, 0
+; RV32I-NEXT: bgeu a4, a5, .LBB14_110
+; RV32I-NEXT: .LBB14_116:
+; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or t0, a3, a6
+; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a6, a3, s8
+; RV32I-NEXT: lw a3, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s1, a3, s5
+; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or s2, a3, s6
+; RV32I-NEXT: lw ra, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT: beqz a4, .LBB14_111
+; RV32I-NEXT: .LBB14_117:
+; RV32I-NEXT: mv s0, t0
+; RV32I-NEXT: mv a0, a6
+; RV32I-NEXT: mv t4, s1
+; RV32I-NEXT: mv t2, s2
+; RV32I-NEXT: bgeu a4, a5, .LBB14_112
+; RV32I-NEXT: j .LBB14_113
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -112
+; RV64I-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 24(sp) # 8-byte
Folded Spill +; RV64I-NEXT: sd s10, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 12(a0) +; RV64I-NEXT: lbu s0, 13(a0) +; RV64I-NEXT: lbu s1, 14(a0) +; RV64I-NEXT: lbu s2, 15(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or s3, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: or a6, t4, t3 +; RV64I-NEXT: lbu t0, 0(a1) +; RV64I-NEXT: lbu t1, 1(a1) +; RV64I-NEXT: lbu t2, 2(a1) +; RV64I-NEXT: lbu t3, 3(a1) +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s2, s2, 8 +; RV64I-NEXT: or t6, t6, t5 +; RV64I-NEXT: or s0, s0, a4 +; RV64I-NEXT: or s1, s2, s1 +; RV64I-NEXT: lbu a4, 4(a1) +; RV64I-NEXT: lbu t4, 5(a1) +; RV64I-NEXT: lbu t5, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: or t1, t3, t2 +; RV64I-NEXT: or t2, t4, a4 +; RV64I-NEXT: or a1, a1, t5 +; RV64I-NEXT: lbu t5, 19(a0) +; RV64I-NEXT: lbu t4, 21(a0) +; RV64I-NEXT: lbu a4, 22(a0) +; RV64I-NEXT: lbu t3, 23(a0) +; RV64I-NEXT: slli s3, s3, 16 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t6, t6, 16 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: or s4, s3, a3 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a6, t6, a6 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: lbu s1, 27(a0) +; RV64I-NEXT: lbu t6, 29(a0) +; RV64I-NEXT: lbu a3, 30(a0) +; RV64I-NEXT: lbu s2, 31(a0) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or s5, t1, t0 +; RV64I-NEXT: li a7, 128 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: li t0, 64 +; RV64I-NEXT: slli s3, t3, 8 +; RV64I-NEXT: slli s2, s2, 8 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: slli s0, s0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or t1, a5, s4 +; RV64I-NEXT: or a5, s0, a6 +; RV64I-NEXT: or a6, a1, s5 +; RV64I-NEXT: slli a6, a6, 3 +; RV64I-NEXT: sub t2, a6, t0 +; RV64I-NEXT: negw t3, a6 +; RV64I-NEXT: srl s0, t1, t3 +; RV64I-NEXT: bltu a6, t0, .LBB15_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: sll s4, t1, t2 +; RV64I-NEXT: j .LBB15_3 +; RV64I-NEXT: .LBB15_2: +; RV64I-NEXT: sll a1, t1, a6 +; RV64I-NEXT: sll s4, a5, a6 +; RV64I-NEXT: or s4, s0, s4 +; RV64I-NEXT: .LBB15_3: +; RV64I-NEXT: slli t5, t5, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or s3, s3, a4 +; RV64I-NEXT: lbu ra, 17(a0) +; RV64I-NEXT: lbu s11, 18(a0) +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s5, 25(a0) +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: lbu s7, 26(a0) +; RV64I-NEXT: lbu s6, 28(a0) +; RV64I-NEXT: slli s10, t6, 8 +; RV64I-NEXT: or s9, s2, a3 +; RV64I-NEXT: sub a4, a7, a6 +; RV64I-NEXT: mv a3, a5 +; RV64I-NEXT: beqz a6, .LBB15_5 +; RV64I-NEXT: # %bb.4: +; RV64I-NEXT: mv a3, s4 +; RV64I-NEXT: .LBB15_5: +; RV64I-NEXT: slli t6, ra, 8 +; RV64I-NEXT: or t5, t5, s11 +; RV64I-NEXT: or t4, t4, s8 +; RV64I-NEXT: slli s3, s3, 16 +; RV64I-NEXT: lbu s8, 16(a0) +; RV64I-NEXT: lbu a0, 24(a0) +; RV64I-NEXT: slli s5, s5, 8 +; 
RV64I-NEXT: or s2, s1, s7 +; RV64I-NEXT: or s1, s10, s6 +; RV64I-NEXT: slli s4, s9, 16 +; RV64I-NEXT: bltu a4, t0, .LBB15_7 +; RV64I-NEXT: # %bb.6: +; RV64I-NEXT: sub s0, a4, t0 +; RV64I-NEXT: srl s0, a5, s0 +; RV64I-NEXT: j .LBB15_8 +; RV64I-NEXT: .LBB15_7: +; RV64I-NEXT: negw s6, a4 +; RV64I-NEXT: sll s6, a5, s6 +; RV64I-NEXT: or s0, s0, s6 +; RV64I-NEXT: .LBB15_8: +; RV64I-NEXT: or t6, t6, s8 +; RV64I-NEXT: slli s6, t5, 16 +; RV64I-NEXT: or s3, s3, t4 +; RV64I-NEXT: or t5, s5, a0 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: or s1, s4, s1 +; RV64I-NEXT: mv t4, t1 +; RV64I-NEXT: beqz a4, .LBB15_10 +; RV64I-NEXT: # %bb.9: +; RV64I-NEXT: mv t4, s0 +; RV64I-NEXT: .LBB15_10: +; RV64I-NEXT: or a0, s6, t6 +; RV64I-NEXT: slli s0, s3, 32 +; RV64I-NEXT: or t6, s2, t5 +; RV64I-NEXT: slli s1, s1, 32 +; RV64I-NEXT: bltu a4, t0, .LBB15_12 +; RV64I-NEXT: # %bb.11: +; RV64I-NEXT: li t5, 0 +; RV64I-NEXT: j .LBB15_13 +; RV64I-NEXT: .LBB15_12: +; RV64I-NEXT: srl t5, a5, t3 +; RV64I-NEXT: .LBB15_13: +; RV64I-NEXT: or a4, s0, a0 +; RV64I-NEXT: or a0, s1, t6 +; RV64I-NEXT: bltu a6, t0, .LBB15_15 +; RV64I-NEXT: # %bb.14: +; RV64I-NEXT: li t6, 0 +; RV64I-NEXT: sll t2, a4, t2 +; RV64I-NEXT: j .LBB15_16 +; RV64I-NEXT: .LBB15_15: +; RV64I-NEXT: sll t6, a4, a6 +; RV64I-NEXT: srl t2, a4, t3 +; RV64I-NEXT: sll t3, a0, a6 +; RV64I-NEXT: or t2, t2, t3 +; RV64I-NEXT: .LBB15_16: +; RV64I-NEXT: sub s0, a6, a7 +; RV64I-NEXT: mv t3, a0 +; RV64I-NEXT: beqz a6, .LBB15_18 +; RV64I-NEXT: # %bb.17: +; RV64I-NEXT: mv t3, t2 +; RV64I-NEXT: .LBB15_18: +; RV64I-NEXT: bltu s0, t0, .LBB15_20 +; RV64I-NEXT: # %bb.19: +; RV64I-NEXT: li t2, 0 +; RV64I-NEXT: sub t0, s0, t0 +; RV64I-NEXT: sll t0, t1, t0 +; RV64I-NEXT: bnez s0, .LBB15_21 +; RV64I-NEXT: j .LBB15_22 +; RV64I-NEXT: .LBB15_20: +; RV64I-NEXT: sll t2, t1, s0 +; RV64I-NEXT: negw t0, s0 +; RV64I-NEXT: srl t0, t1, t0 +; RV64I-NEXT: sll t1, a5, s0 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: beqz s0, .LBB15_22 +; RV64I-NEXT: .LBB15_21: +; RV64I-NEXT: mv a5, t0 +; RV64I-NEXT: .LBB15_22: +; RV64I-NEXT: bltu a6, a7, .LBB15_24 +; RV64I-NEXT: # %bb.23: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: li a3, 0 +; RV64I-NEXT: bnez a6, .LBB15_25 +; RV64I-NEXT: j .LBB15_26 +; RV64I-NEXT: .LBB15_24: +; RV64I-NEXT: or t2, t4, t6 +; RV64I-NEXT: or a5, t5, t3 +; RV64I-NEXT: beqz a6, .LBB15_26 +; RV64I-NEXT: .LBB15_25: +; RV64I-NEXT: mv a4, t2 +; RV64I-NEXT: mv a0, a5 +; RV64I-NEXT: .LBB15_26: +; RV64I-NEXT: srli a5, a1, 32 +; RV64I-NEXT: srliw a6, a1, 16 +; RV64I-NEXT: lui t2, 16 +; RV64I-NEXT: srliw t1, a1, 24 +; RV64I-NEXT: srli t0, a1, 48 +; RV64I-NEXT: srli t5, a1, 56 +; RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srliw t4, a3, 16 +; RV64I-NEXT: srliw s0, a3, 24 +; RV64I-NEXT: srli t6, a3, 48 +; RV64I-NEXT: srli s3, a3, 56 +; RV64I-NEXT: srli t3, a4, 32 +; RV64I-NEXT: srliw s2, a4, 16 +; RV64I-NEXT: srliw s6, a4, 24 +; RV64I-NEXT: srli s4, a4, 48 +; RV64I-NEXT: srli s7, a4, 56 +; RV64I-NEXT: srli s1, a0, 32 +; RV64I-NEXT: srliw s5, a0, 16 +; RV64I-NEXT: srliw s8, a0, 24 +; RV64I-NEXT: srli s9, a0, 48 +; RV64I-NEXT: srli s10, a0, 56 +; RV64I-NEXT: addi t2, t2, -1 +; RV64I-NEXT: and s11, a1, t2 +; RV64I-NEXT: srli s11, s11, 8 +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: sb s11, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb t1, 3(a2) +; RV64I-NEXT: and a1, a5, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: and a1, a3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a3, 8(a2) +; 
RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb t4, 10(a2) +; RV64I-NEXT: sb s0, 11(a2) +; RV64I-NEXT: and a1, a7, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: sb t6, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) +; RV64I-NEXT: and a1, a4, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: sb a1, 17(a2) +; RV64I-NEXT: sb s2, 18(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: and a1, t3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb t3, 20(a2) +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: sb s4, 22(a2) +; RV64I-NEXT: sb s7, 23(a2) +; RV64I-NEXT: and a1, a0, t2 +; RV64I-NEXT: and a3, s1, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 24(a2) +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: sb s5, 26(a2) +; RV64I-NEXT: sb s8, 27(a2) +; RV64I-NEXT: sb s1, 28(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb s9, 30(a2) +; RV64I-NEXT: sb s10, 31(a2) +; RV64I-NEXT: ld ra, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 112 +; RV64I-NEXT: ret +; +; RV32I-LABEL: shl_32bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -96 +; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; RV32I-NEXT: lbu t0, 4(a0) +; RV32I-NEXT: lbu t1, 5(a0) +; RV32I-NEXT: lbu t2, 6(a0) +; RV32I-NEXT: lbu t3, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a3 +; RV32I-NEXT: lbu a6, 11(a0) +; RV32I-NEXT: lbu a3, 15(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: lbu t1, 1(a1) +; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: lbu t3, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: li s9, 64 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, t3 +; RV32I-NEXT: li t4, 32 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or t3, a5, a4 +; RV32I-NEXT: or a5, t2, a7 +; RV32I-NEXT: or a4, a1, t0 +; RV32I-NEXT: slli a4, a4, 3 +; RV32I-NEXT: neg s10, a4 +; RV32I-NEXT: srl t5, t3, s10 +; RV32I-NEXT: sll s5, a5, a4 +; 
RV32I-NEXT: bltu a4, t4, .LBB15_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: sll a7, t3, a4 +; RV32I-NEXT: j .LBB15_3 +; RV32I-NEXT: .LBB15_2: +; RV32I-NEXT: sll s8, t3, a4 +; RV32I-NEXT: or a7, t5, s5 +; RV32I-NEXT: .LBB15_3: +; RV32I-NEXT: lbu t2, 9(a0) +; RV32I-NEXT: lbu a1, 10(a0) +; RV32I-NEXT: lbu t1, 13(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t6, a3, 8 +; RV32I-NEXT: sub s6, s9, a4 +; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: beqz a4, .LBB15_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: .LBB15_5: +; RV32I-NEXT: slli a7, t2, 8 +; RV32I-NEXT: or a6, a6, a1 +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu a1, 12(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t0, t6, t0 +; RV32I-NEXT: neg t6, s6 +; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s6, t4, .LBB15_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srl t6, a5, s6 +; RV32I-NEXT: j .LBB15_8 +; RV32I-NEXT: .LBB15_7: +; RV32I-NEXT: sll t6, a5, t6 +; RV32I-NEXT: or t6, t5, t6 +; RV32I-NEXT: .LBB15_8: +; RV32I-NEXT: or a7, a7, t2 +; RV32I-NEXT: slli t2, a6, 16 +; RV32I-NEXT: or a1, t1, a1 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: mv a6, t3 +; RV32I-NEXT: beqz s6, .LBB15_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a6, t6 +; RV32I-NEXT: .LBB15_10: +; RV32I-NEXT: or t1, t2, a7 +; RV32I-NEXT: or t2, t0, a1 +; RV32I-NEXT: bltu s6, t4, .LBB15_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: j .LBB15_13 +; RV32I-NEXT: .LBB15_12: +; RV32I-NEXT: srl a7, a5, s10 +; RV32I-NEXT: .LBB15_13: +; RV32I-NEXT: srl s0, t1, s10 +; RV32I-NEXT: sll a1, t2, a4 +; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu a4, t4, .LBB15_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sll a1, t1, a4 +; RV32I-NEXT: j .LBB15_16 +; RV32I-NEXT: .LBB15_15: +; RV32I-NEXT: sll s1, t1, a4 +; RV32I-NEXT: or a1, s0, a1 +; RV32I-NEXT: .LBB15_16: +; RV32I-NEXT: addi s7, a4, -64 +; RV32I-NEXT: mv s3, t2 +; RV32I-NEXT: beqz a4, .LBB15_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: .LBB15_18: +; RV32I-NEXT: neg a1, s7 +; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s7, t4, .LBB15_20 +; RV32I-NEXT: # %bb.19: +; RV32I-NEXT: li s2, 0 +; RV32I-NEXT: sll a1, t3, s7 +; RV32I-NEXT: mv s4, a5 +; RV32I-NEXT: bnez s7, .LBB15_21 +; RV32I-NEXT: j .LBB15_22 +; RV32I-NEXT: .LBB15_20: +; RV32I-NEXT: sll s2, t3, a4 +; RV32I-NEXT: srl a1, t3, a1 +; RV32I-NEXT: or a1, a1, s5 +; RV32I-NEXT: mv s4, a5 +; RV32I-NEXT: beqz s7, .LBB15_22 +; RV32I-NEXT: .LBB15_21: +; RV32I-NEXT: mv s4, a1 +; RV32I-NEXT: .LBB15_22: +; RV32I-NEXT: li a1, 128 +; RV32I-NEXT: bltu a4, s9, .LBB15_24 +; RV32I-NEXT: # %bb.23: +; RV32I-NEXT: sw zero, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: j .LBB15_25 +; RV32I-NEXT: .LBB15_24: +; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: or s2, a6, s1 +; RV32I-NEXT: or s4, a7, s3 +; RV32I-NEXT: .LBB15_25: +; RV32I-NEXT: sub ra, a1, a4 +; RV32I-NEXT: mv a7, t1 +; RV32I-NEXT: mv a6, t2 +; RV32I-NEXT: beqz a4, .LBB15_27 +; RV32I-NEXT: # %bb.26: +; RV32I-NEXT: mv a7, s2 +; RV32I-NEXT: mv a6, s4 +; RV32I-NEXT: .LBB15_27: +; RV32I-NEXT: neg s1, ra +; RV32I-NEXT: sll s2, t2, s1 +; RV32I-NEXT: bltu ra, t4, .LBB15_29 +; RV32I-NEXT: # %bb.28: +; RV32I-NEXT: srl a1, t2, ra +; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: bnez ra, .LBB15_30 +; RV32I-NEXT: j .LBB15_31 +; RV32I-NEXT: .LBB15_29: +; RV32I-NEXT: or a1, s0, s2 +; RV32I-NEXT: sw t1, 
40(sp) # 4-byte Folded Spill +; RV32I-NEXT: beqz ra, .LBB15_31 +; RV32I-NEXT: .LBB15_30: +; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: .LBB15_31: +; RV32I-NEXT: bltu ra, t4, .LBB15_33 +; RV32I-NEXT: # %bb.32: +; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: srl a1, a5, ra +; RV32I-NEXT: mv t5, t3 +; RV32I-NEXT: bnez ra, .LBB15_34 +; RV32I-NEXT: j .LBB15_35 +; RV32I-NEXT: .LBB15_33: +; RV32I-NEXT: srl a1, t2, s10 +; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: sll a1, a5, s1 +; RV32I-NEXT: or a1, t5, a1 +; RV32I-NEXT: mv t5, t3 +; RV32I-NEXT: beqz ra, .LBB15_35 +; RV32I-NEXT: .LBB15_34: +; RV32I-NEXT: mv t5, a1 +; RV32I-NEXT: .LBB15_35: +; RV32I-NEXT: sub s3, s9, ra +; RV32I-NEXT: bltu ra, t4, .LBB15_38 +; RV32I-NEXT: # %bb.36: +; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bgeu s3, t4, .LBB15_39 +; RV32I-NEXT: .LBB15_37: +; RV32I-NEXT: sll s1, t1, s1 +; RV32I-NEXT: neg a1, s3 +; RV32I-NEXT: srl a1, t1, a1 +; RV32I-NEXT: or a1, a1, s2 +; RV32I-NEXT: j .LBB15_40 +; RV32I-NEXT: .LBB15_38: +; RV32I-NEXT: srl a1, a5, s10 +; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s3, t4, .LBB15_37 +; RV32I-NEXT: .LBB15_39: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sll a1, t1, s3 +; RV32I-NEXT: .LBB15_40: +; RV32I-NEXT: addi s4, ra, -64 +; RV32I-NEXT: mv s2, t2 +; RV32I-NEXT: beqz s3, .LBB15_42 +; RV32I-NEXT: # %bb.41: +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: .LBB15_42: +; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s5, a7 +; RV32I-NEXT: bltu s4, t4, .LBB15_44 +; RV32I-NEXT: # %bb.43: +; RV32I-NEXT: srl t0, t2, s4 +; RV32I-NEXT: j .LBB15_45 +; RV32I-NEXT: .LBB15_44: +; RV32I-NEXT: srl a1, t1, ra +; RV32I-NEXT: neg t0, s4 +; RV32I-NEXT: sll t0, t2, t0 +; RV32I-NEXT: or t0, a1, t0 +; RV32I-NEXT: .LBB15_45: +; RV32I-NEXT: mv s0, s10 +; RV32I-NEXT: mv a7, a6 +; RV32I-NEXT: lbu s8, 19(a0) +; RV32I-NEXT: lbu a1, 23(a0) +; RV32I-NEXT: mv s3, t1 +; RV32I-NEXT: beqz s4, .LBB15_47 +; RV32I-NEXT: # %bb.46: +; RV32I-NEXT: mv s3, t0 +; RV32I-NEXT: .LBB15_47: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: lbu s10, 17(a0) +; RV32I-NEXT: lbu t0, 18(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu t6, 22(a0) +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: li a3, 64 +; RV32I-NEXT: bltu s4, t4, .LBB15_49 +; RV32I-NEXT: # %bb.48: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: j .LBB15_50 +; RV32I-NEXT: .LBB15_49: +; RV32I-NEXT: srl s4, t2, ra +; RV32I-NEXT: .LBB15_50: +; RV32I-NEXT: or s11, s8, t0 +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t6, a1, t6 +; RV32I-NEXT: bgeu ra, a3, .LBB15_52 +; RV32I-NEXT: # %bb.51: +; RV32I-NEXT: or s3, t5, s1 +; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s4, a1, s2 +; RV32I-NEXT: .LBB15_52: +; RV32I-NEXT: or a1, s10, t0 +; RV32I-NEXT: slli s11, s11, 16 +; RV32I-NEXT: or t0, s9, s8 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: mv t5, t3 +; RV32I-NEXT: mv s1, a5 +; RV32I-NEXT: mv a3, a6 +; RV32I-NEXT: beqz ra, .LBB15_54 +; RV32I-NEXT: # %bb.53: +; RV32I-NEXT: mv t5, s3 +; RV32I-NEXT: mv s1, s4 +; RV32I-NEXT: .LBB15_54: +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or s2, s11, a1 +; RV32I-NEXT: or s1, t6, t0 +; RV32I-NEXT: li a1, 64 +; RV32I-NEXT: mv a6, a7 +; RV32I-NEXT: mv a7, s0 +; RV32I-NEXT: bltu ra, a1, .LBB15_56 +; RV32I-NEXT: # %bb.55: +; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw zero, 36(sp) # 
4-byte Folded Spill +; RV32I-NEXT: .LBB15_56: +; RV32I-NEXT: srl s3, s2, a7 +; RV32I-NEXT: sll ra, s1, a4 +; RV32I-NEXT: mv a7, s5 +; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu a4, t4, .LBB15_58 +; RV32I-NEXT: # %bb.57: +; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: sll a1, s2, a4 +; RV32I-NEXT: j .LBB15_59 +; RV32I-NEXT: .LBB15_58: +; RV32I-NEXT: sll a1, s2, a4 +; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a1, s3, ra +; RV32I-NEXT: .LBB15_59: +; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: lbu t6, 31(a0) +; RV32I-NEXT: mv t5, s1 +; RV32I-NEXT: beqz a4, .LBB15_61 +; RV32I-NEXT: # %bb.60: +; RV32I-NEXT: mv t5, a1 +; RV32I-NEXT: .LBB15_61: +; RV32I-NEXT: lbu s8, 25(a0) +; RV32I-NEXT: lbu s4, 26(a0) +; RV32I-NEXT: lbu s11, 29(a0) +; RV32I-NEXT: lbu s10, 30(a0) +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: bltu s6, t4, .LBB15_63 +; RV32I-NEXT: # %bb.62: +; RV32I-NEXT: srl t0, s1, s6 +; RV32I-NEXT: j .LBB15_64 +; RV32I-NEXT: .LBB15_63: +; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: sll a1, s1, a1 +; RV32I-NEXT: or t0, s3, a1 +; RV32I-NEXT: .LBB15_64: +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: lbu s3, 24(a0) +; RV32I-NEXT: lbu a1, 28(a0) +; RV32I-NEXT: or s4, s9, s4 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t6, t6, s10 +; RV32I-NEXT: mv s9, s2 +; RV32I-NEXT: beqz s6, .LBB15_66 +; RV32I-NEXT: # %bb.65: +; RV32I-NEXT: mv s9, t0 +; RV32I-NEXT: .LBB15_66: +; RV32I-NEXT: or a0, s8, s3 +; RV32I-NEXT: slli t0, s4, 16 +; RV32I-NEXT: or a1, s11, a1 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: bltu s6, t4, .LBB15_68 +; RV32I-NEXT: # %bb.67: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: j .LBB15_69 +; RV32I-NEXT: .LBB15_68: +; RV32I-NEXT: srl s4, s1, s0 +; RV32I-NEXT: .LBB15_69: +; RV32I-NEXT: li s11, 64 +; RV32I-NEXT: or s6, t0, a0 +; RV32I-NEXT: or a0, t6, a1 +; RV32I-NEXT: bltu a4, t4, .LBB15_71 +; RV32I-NEXT: # %bb.70: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a1, s6, a4 +; RV32I-NEXT: mv s10, a0 +; RV32I-NEXT: bnez a4, .LBB15_72 +; RV32I-NEXT: j .LBB15_73 +; RV32I-NEXT: .LBB15_71: +; RV32I-NEXT: sll s3, s6, a4 +; RV32I-NEXT: srl a1, s6, s0 +; RV32I-NEXT: sll t0, a0, a4 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: mv s10, a0 +; RV32I-NEXT: beqz a4, .LBB15_73 +; RV32I-NEXT: .LBB15_72: +; RV32I-NEXT: mv s10, a1 +; RV32I-NEXT: .LBB15_73: +; RV32I-NEXT: bltu s7, t4, .LBB15_75 +; RV32I-NEXT: # %bb.74: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: sll a1, s2, s7 +; RV32I-NEXT: mv s0, s1 +; RV32I-NEXT: bnez s7, .LBB15_76 +; RV32I-NEXT: j .LBB15_77 +; RV32I-NEXT: .LBB15_75: +; RV32I-NEXT: sll s5, s2, a4 +; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: srl a1, s2, a1 +; RV32I-NEXT: or a1, a1, ra +; RV32I-NEXT: mv s0, s1 +; RV32I-NEXT: beqz s7, .LBB15_77 +; RV32I-NEXT: .LBB15_76: +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: .LBB15_77: +; RV32I-NEXT: bltu a4, s11, .LBB15_79 +; RV32I-NEXT: # %bb.78: +; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: li t5, 0 +; RV32I-NEXT: j .LBB15_80 +; RV32I-NEXT: .LBB15_79: +; RV32I-NEXT: or s5, s9, s3 +; RV32I-NEXT: or s0, s4, s10 +; RV32I-NEXT: .LBB15_80: +; RV32I-NEXT: addi s9, a4, -128 +; RV32I-NEXT: mv s7, s6 +; RV32I-NEXT: mv s8, a0 +; RV32I-NEXT: beqz a4, .LBB15_82 +; RV32I-NEXT: # %bb.81: +; RV32I-NEXT: mv s7, s5 +; RV32I-NEXT: mv s8, s0 +; RV32I-NEXT: .LBB15_82: +; RV32I-NEXT: neg s3, s9 +; RV32I-NEXT: srl s0, t3, s3 +; RV32I-NEXT: bltu s9, t4, .LBB15_84 +; RV32I-NEXT: # %bb.83: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: sll a1, t3, 
s9 +; RV32I-NEXT: j .LBB15_85 +; RV32I-NEXT: .LBB15_84: +; RV32I-NEXT: sll s5, t3, a4 +; RV32I-NEXT: lw a1, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a1, s0, a1 +; RV32I-NEXT: .LBB15_85: +; RV32I-NEXT: sub s4, s11, s9 +; RV32I-NEXT: mv t6, a5 +; RV32I-NEXT: beqz s9, .LBB15_87 +; RV32I-NEXT: # %bb.86: +; RV32I-NEXT: mv t6, a1 +; RV32I-NEXT: .LBB15_87: +; RV32I-NEXT: bltu s4, t4, .LBB15_89 +; RV32I-NEXT: # %bb.88: +; RV32I-NEXT: srl a1, a5, s4 +; RV32I-NEXT: mv s0, t3 +; RV32I-NEXT: bnez s4, .LBB15_90 +; RV32I-NEXT: j .LBB15_91 +; RV32I-NEXT: .LBB15_89: +; RV32I-NEXT: neg a1, s4 +; RV32I-NEXT: sll a1, a5, a1 +; RV32I-NEXT: or a1, s0, a1 +; RV32I-NEXT: mv s0, t3 +; RV32I-NEXT: beqz s4, .LBB15_91 +; RV32I-NEXT: .LBB15_90: +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: .LBB15_91: +; RV32I-NEXT: bltu s4, t4, .LBB15_94 +; RV32I-NEXT: # %bb.92: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: li ra, 64 +; RV32I-NEXT: bgeu s9, t4, .LBB15_95 +; RV32I-NEXT: .LBB15_93: +; RV32I-NEXT: sll s10, t1, a4 +; RV32I-NEXT: srl a1, t1, s3 +; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: j .LBB15_96 +; RV32I-NEXT: .LBB15_94: +; RV32I-NEXT: srl s4, a5, s3 +; RV32I-NEXT: li ra, 64 +; RV32I-NEXT: bltu s9, t4, .LBB15_93 +; RV32I-NEXT: .LBB15_95: +; RV32I-NEXT: li s10, 0 +; RV32I-NEXT: sll a1, t1, s9 +; RV32I-NEXT: .LBB15_96: +; RV32I-NEXT: addi s11, s9, -64 +; RV32I-NEXT: mv s3, t2 +; RV32I-NEXT: beqz s9, .LBB15_98 +; RV32I-NEXT: # %bb.97: +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: .LBB15_98: +; RV32I-NEXT: bltu s11, t4, .LBB15_100 +; RV32I-NEXT: # %bb.99: +; RV32I-NEXT: li t4, 0 +; RV32I-NEXT: sll a1, t3, s11 +; RV32I-NEXT: bnez s11, .LBB15_101 +; RV32I-NEXT: j .LBB15_102 +; RV32I-NEXT: .LBB15_100: +; RV32I-NEXT: sll t4, t3, s9 +; RV32I-NEXT: neg a1, s11 +; RV32I-NEXT: srl a1, t3, a1 +; RV32I-NEXT: sll t0, a5, s9 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: beqz s11, .LBB15_102 +; RV32I-NEXT: .LBB15_101: +; RV32I-NEXT: mv a5, a1 +; RV32I-NEXT: .LBB15_102: +; RV32I-NEXT: bltu s9, ra, .LBB15_104 +; RV32I-NEXT: # %bb.103: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: li a1, 128 +; RV32I-NEXT: bnez s9, .LBB15_105 +; RV32I-NEXT: j .LBB15_106 +; RV32I-NEXT: .LBB15_104: +; RV32I-NEXT: or t4, s0, s10 +; RV32I-NEXT: or a5, s4, s3 +; RV32I-NEXT: li a1, 128 +; RV32I-NEXT: beqz s9, .LBB15_106 +; RV32I-NEXT: .LBB15_105: +; RV32I-NEXT: mv t1, t4 +; RV32I-NEXT: mv t2, a5 +; RV32I-NEXT: .LBB15_106: +; RV32I-NEXT: bltu a4, a1, .LBB15_108 +; RV32I-NEXT: # %bb.107: +; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: bnez a4, .LBB15_109 +; RV32I-NEXT: j .LBB15_110 +; RV32I-NEXT: .LBB15_108: +; RV32I-NEXT: lw a1, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s5, a1, a5 +; RV32I-NEXT: lw a1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t6, a1, t5 +; RV32I-NEXT: lw a1, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t1, a1, s7 +; RV32I-NEXT: lw a1, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t2, a1, s8 +; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: beqz a4, .LBB15_110 +; RV32I-NEXT: .LBB15_109: +; RV32I-NEXT: mv s2, s5 +; RV32I-NEXT: mv s1, t6 +; RV32I-NEXT: mv s6, t1 +; RV32I-NEXT: mv a0, t2 +; RV32I-NEXT: .LBB15_110: +; RV32I-NEXT: srli a4, ra, 16 +; RV32I-NEXT: lui t2, 16 +; RV32I-NEXT: srli t1, ra, 24 +; RV32I-NEXT: srli a5, a3, 16 +; RV32I-NEXT: srli t4, a3, 24 +; RV32I-NEXT: srli t0, a7, 16 +; RV32I-NEXT: srli s0, a7, 24 +; RV32I-NEXT: srli t3, a6, 
16
+; RV32I-NEXT: srli s3, a6, 24
+; RV32I-NEXT: srli t6, s2, 16
+; RV32I-NEXT: srli a1, s2, 24
+; RV32I-NEXT: srli t5, s1, 16
+; RV32I-NEXT: srli s5, s1, 24
+; RV32I-NEXT: srli s4, s6, 16
+; RV32I-NEXT: srli s7, s6, 24
+; RV32I-NEXT: srli s8, a0, 16
+; RV32I-NEXT: srli s9, a0, 24
+; RV32I-NEXT: addi t2, t2, -1
+; RV32I-NEXT: and s10, ra, t2
+; RV32I-NEXT: and s11, a3, t2
+; RV32I-NEXT: srli s10, s10, 8
+; RV32I-NEXT: sb ra, 0(a2)
+; RV32I-NEXT: sb s10, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb t1, 3(a2)
+; RV32I-NEXT: and a4, a7, t2
+; RV32I-NEXT: srli t1, s11, 8
+; RV32I-NEXT: sb a3, 4(a2)
+; RV32I-NEXT: sb t1, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t4, 7(a2)
+; RV32I-NEXT: and a3, a6, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a7, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb t0, 10(a2)
+; RV32I-NEXT: sb s0, 11(a2)
+; RV32I-NEXT: and a4, s2, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a6, 12(a2)
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: sb t3, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a3, s1, t2
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s2, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb t6, 18(a2)
+; RV32I-NEXT: sb a1, 19(a2)
+; RV32I-NEXT: and a1, s6, t2
+; RV32I-NEXT: and a4, a0, t2
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb s1, 20(a2)
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: sb t5, 22(a2)
+; RV32I-NEXT: sb s5, 23(a2)
+; RV32I-NEXT: sb s6, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s4, 26(a2)
+; RV32I-NEXT: sb s7, 27(a2)
+; RV32I-NEXT: sb a0, 28(a2)
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb s8, 30(a2)
+; RV32I-NEXT: sb s9, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -112
+; RV64I-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu t2, 7(a0)
+; RV64I-NEXT: lbu t3, 8(a0)
+; RV64I-NEXT: lbu t4, 9(a0)
+; RV64I-NEXT: lbu t5, 10(a0)
+; RV64I-NEXT: lbu t6, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 12(a0)
+; RV64I-NEXT: lbu s0, 13(a0)
+; RV64I-NEXT: lbu s1, 14(a0)
+; RV64I-NEXT: lbu s2, 15(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: slli t2, t2, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or a6, t4, t3
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 1(a1)
+; RV64I-NEXT: lbu t2, 2(a1)
+; RV64I-NEXT: lbu t3, 3(a1)
+; RV64I-NEXT: slli t6, t6, 8
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: or t6, t6, t5
+; RV64I-NEXT: or s0, s0, a4
+; RV64I-NEXT: or s1, s2, s1
+; RV64I-NEXT: lbu a4, 4(a1)
+; RV64I-NEXT: lbu t4, 5(a1)
+; RV64I-NEXT: lbu t5, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t1, t1, 8
+; RV64I-NEXT: slli t3, t3, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: or t0, t1, t0
+; RV64I-NEXT: or t1, t3, t2
+; RV64I-NEXT: or t2, t4, a4
+; RV64I-NEXT: or a1, a1, t5
+; RV64I-NEXT: lbu t5, 19(a0)
+; RV64I-NEXT: lbu t4, 21(a0)
+; RV64I-NEXT: lbu a4, 22(a0)
+; RV64I-NEXT: lbu t3, 23(a0)
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t6, t6, 16
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: or s4, s3, a3
+; RV64I-NEXT: or a5, a7, a5
+; RV64I-NEXT: or a6, t6, a6
+; RV64I-NEXT: or s0, s1, s0
+; RV64I-NEXT: lbu s1, 27(a0)
+; RV64I-NEXT: lbu t6, 29(a0)
+; RV64I-NEXT: lbu a3, 30(a0)
+; RV64I-NEXT: lbu s2, 31(a0)
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: or s5, t1, t0
+; RV64I-NEXT: li a7, 128
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: li t0, 64
+; RV64I-NEXT: slli s3, t3, 8
+; RV64I-NEXT: slli s2, s2, 8
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: slli s0, s0, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or t1, a5, s4
+; RV64I-NEXT: or a5, s0, a6
+; RV64I-NEXT: or a6, a1, s5
+; RV64I-NEXT: slli a6, a6, 5
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: srl s0, t1, t3
+; RV64I-NEXT: bltu a6, t0, .LBB16_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: li a1, 0
+; RV64I-NEXT: sll s4, t1, t2
+; RV64I-NEXT: j .LBB16_3
+; RV64I-NEXT: .LBB16_2:
+; RV64I-NEXT: sll a1, t1, a6
+; RV64I-NEXT: sll s4, a5, a6
+; RV64I-NEXT: or s4, s0, s4
+; RV64I-NEXT: .LBB16_3:
+; RV64I-NEXT: slli t5, t5, 8
+; RV64I-NEXT: slli t4, t4, 8
+; RV64I-NEXT: or s3, s3, a4
+; RV64I-NEXT: lbu ra, 17(a0)
+; RV64I-NEXT: lbu s11, 18(a0)
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s5, 25(a0)
+; RV64I-NEXT: slli s1, s1, 8
+; RV64I-NEXT: lbu s7, 26(a0)
+; RV64I-NEXT: lbu s6, 28(a0)
+; RV64I-NEXT: slli s10, t6, 8
+; RV64I-NEXT: or s9, s2, a3
+; RV64I-NEXT: sub a4, a7, a6
+; RV64I-NEXT: mv a3, a5
+; RV64I-NEXT: beqz a6, .LBB16_5
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv a3, s4
+; RV64I-NEXT: .LBB16_5:
+; RV64I-NEXT: slli t6, ra, 8
+; RV64I-NEXT: or t5, t5, s11
+; RV64I-NEXT: or t4, t4, s8
+; RV64I-NEXT: slli s3, s3, 16
+; RV64I-NEXT: lbu s8, 16(a0)
+; RV64I-NEXT: lbu a0, 24(a0)
+; RV64I-NEXT: slli s5, s5, 8
+; RV64I-NEXT: or s2, s1, s7
+; RV64I-NEXT: or s1, s10, s6
+; RV64I-NEXT: slli s4, s9, 16
+; RV64I-NEXT: bltu a4, t0, .LBB16_7
+; RV64I-NEXT: # %bb.6:
+; RV64I-NEXT: sub s0, a4, t0
+; RV64I-NEXT: srl s0, a5, s0 +; RV64I-NEXT: j .LBB16_8 +; RV64I-NEXT: .LBB16_7: +; RV64I-NEXT: negw s6, a4 +; RV64I-NEXT: sll s6, a5, s6 +; RV64I-NEXT: or s0, s0, s6 +; RV64I-NEXT: .LBB16_8: +; RV64I-NEXT: or t6, t6, s8 +; RV64I-NEXT: slli s6, t5, 16 +; RV64I-NEXT: or s3, s3, t4 +; RV64I-NEXT: or t5, s5, a0 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: or s1, s4, s1 +; RV64I-NEXT: mv t4, t1 +; RV64I-NEXT: beqz a4, .LBB16_10 +; RV64I-NEXT: # %bb.9: +; RV64I-NEXT: mv t4, s0 +; RV64I-NEXT: .LBB16_10: +; RV64I-NEXT: or a0, s6, t6 +; RV64I-NEXT: slli s0, s3, 32 +; RV64I-NEXT: or t6, s2, t5 +; RV64I-NEXT: slli s1, s1, 32 +; RV64I-NEXT: bltu a4, t0, .LBB16_12 +; RV64I-NEXT: # %bb.11: +; RV64I-NEXT: li t5, 0 +; RV64I-NEXT: j .LBB16_13 +; RV64I-NEXT: .LBB16_12: +; RV64I-NEXT: srl t5, a5, t3 +; RV64I-NEXT: .LBB16_13: +; RV64I-NEXT: or a4, s0, a0 +; RV64I-NEXT: or a0, s1, t6 +; RV64I-NEXT: bltu a6, t0, .LBB16_15 +; RV64I-NEXT: # %bb.14: +; RV64I-NEXT: li t6, 0 +; RV64I-NEXT: sll t2, a4, t2 +; RV64I-NEXT: j .LBB16_16 +; RV64I-NEXT: .LBB16_15: +; RV64I-NEXT: sll t6, a4, a6 +; RV64I-NEXT: srl t2, a4, t3 +; RV64I-NEXT: sll t3, a0, a6 +; RV64I-NEXT: or t2, t2, t3 +; RV64I-NEXT: .LBB16_16: +; RV64I-NEXT: sub s0, a6, a7 +; RV64I-NEXT: mv t3, a0 +; RV64I-NEXT: beqz a6, .LBB16_18 +; RV64I-NEXT: # %bb.17: +; RV64I-NEXT: mv t3, t2 +; RV64I-NEXT: .LBB16_18: +; RV64I-NEXT: bltu s0, t0, .LBB16_20 +; RV64I-NEXT: # %bb.19: +; RV64I-NEXT: li t2, 0 +; RV64I-NEXT: sub t0, s0, t0 +; RV64I-NEXT: sll t0, t1, t0 +; RV64I-NEXT: bnez s0, .LBB16_21 +; RV64I-NEXT: j .LBB16_22 +; RV64I-NEXT: .LBB16_20: +; RV64I-NEXT: sll t2, t1, s0 +; RV64I-NEXT: negw t0, s0 +; RV64I-NEXT: srl t0, t1, t0 +; RV64I-NEXT: sll t1, a5, s0 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: beqz s0, .LBB16_22 +; RV64I-NEXT: .LBB16_21: +; RV64I-NEXT: mv a5, t0 +; RV64I-NEXT: .LBB16_22: +; RV64I-NEXT: bltu a6, a7, .LBB16_24 +; RV64I-NEXT: # %bb.23: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: li a3, 0 +; RV64I-NEXT: bnez a6, .LBB16_25 +; RV64I-NEXT: j .LBB16_26 +; RV64I-NEXT: .LBB16_24: +; RV64I-NEXT: or t2, t4, t6 +; RV64I-NEXT: or a5, t5, t3 +; RV64I-NEXT: beqz a6, .LBB16_26 +; RV64I-NEXT: .LBB16_25: +; RV64I-NEXT: mv a4, t2 +; RV64I-NEXT: mv a0, a5 +; RV64I-NEXT: .LBB16_26: +; RV64I-NEXT: srli a5, a1, 32 +; RV64I-NEXT: srliw a6, a1, 16 +; RV64I-NEXT: lui t2, 16 +; RV64I-NEXT: srliw t1, a1, 24 +; RV64I-NEXT: srli t0, a1, 48 +; RV64I-NEXT: srli t5, a1, 56 +; RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srliw t4, a3, 16 +; RV64I-NEXT: srliw s0, a3, 24 +; RV64I-NEXT: srli t6, a3, 48 +; RV64I-NEXT: srli s3, a3, 56 +; RV64I-NEXT: srli t3, a4, 32 +; RV64I-NEXT: srliw s2, a4, 16 +; RV64I-NEXT: srliw s6, a4, 24 +; RV64I-NEXT: srli s4, a4, 48 +; RV64I-NEXT: srli s7, a4, 56 +; RV64I-NEXT: srli s1, a0, 32 +; RV64I-NEXT: srliw s5, a0, 16 +; RV64I-NEXT: srliw s8, a0, 24 +; RV64I-NEXT: srli s9, a0, 48 +; RV64I-NEXT: srli s10, a0, 56 +; RV64I-NEXT: addi t2, t2, -1 +; RV64I-NEXT: and s11, a1, t2 +; RV64I-NEXT: srli s11, s11, 8 +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: sb s11, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb t1, 3(a2) +; RV64I-NEXT: and a1, a5, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: and a1, a3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb t4, 10(a2) +; RV64I-NEXT: sb s0, 11(a2) +; RV64I-NEXT: and a1, a7, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: 
sb a1, 13(a2) +; RV64I-NEXT: sb t6, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) +; RV64I-NEXT: and a1, a4, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: sb a1, 17(a2) +; RV64I-NEXT: sb s2, 18(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: and a1, t3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb t3, 20(a2) +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: sb s4, 22(a2) +; RV64I-NEXT: sb s7, 23(a2) +; RV64I-NEXT: and a1, a0, t2 +; RV64I-NEXT: and a3, s1, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 24(a2) +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: sb s5, 26(a2) +; RV64I-NEXT: sb s8, 27(a2) +; RV64I-NEXT: sb s1, 28(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb s9, 30(a2) +; RV64I-NEXT: sb s10, 31(a2) +; RV64I-NEXT: ld ra, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 112 +; RV64I-NEXT: ret +; +; RV32I-LABEL: shl_32bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -96 +; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; RV32I-NEXT: lbu t0, 4(a0) +; RV32I-NEXT: lbu t1, 5(a0) +; RV32I-NEXT: lbu t2, 6(a0) +; RV32I-NEXT: lbu t3, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a3 +; RV32I-NEXT: lbu a6, 11(a0) +; RV32I-NEXT: lbu a3, 15(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: lbu t1, 1(a1) +; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: lbu t3, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: li s9, 64 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, t3 +; RV32I-NEXT: li t4, 32 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or t3, a5, a4 +; RV32I-NEXT: or a5, t2, a7 +; RV32I-NEXT: or a4, a1, t0 +; RV32I-NEXT: slli a4, a4, 5 +; RV32I-NEXT: neg s10, a4 +; RV32I-NEXT: srl t5, t3, s10 +; RV32I-NEXT: sll s5, a5, a4 +; RV32I-NEXT: bltu a4, t4, .LBB16_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: sll a7, t3, a4 +; RV32I-NEXT: j .LBB16_3 +; RV32I-NEXT: .LBB16_2: +; RV32I-NEXT: 
sll s8, t3, a4 +; RV32I-NEXT: or a7, t5, s5 +; RV32I-NEXT: .LBB16_3: +; RV32I-NEXT: lbu t2, 9(a0) +; RV32I-NEXT: lbu a1, 10(a0) +; RV32I-NEXT: lbu t1, 13(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t6, a3, 8 +; RV32I-NEXT: sub s6, s9, a4 +; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: beqz a4, .LBB16_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: .LBB16_5: +; RV32I-NEXT: slli a7, t2, 8 +; RV32I-NEXT: or a6, a6, a1 +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu a1, 12(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t0, t6, t0 +; RV32I-NEXT: neg t6, s6 +; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s6, t4, .LBB16_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srl t6, a5, s6 +; RV32I-NEXT: j .LBB16_8 +; RV32I-NEXT: .LBB16_7: +; RV32I-NEXT: sll t6, a5, t6 +; RV32I-NEXT: or t6, t5, t6 +; RV32I-NEXT: .LBB16_8: +; RV32I-NEXT: or a7, a7, t2 +; RV32I-NEXT: slli t2, a6, 16 +; RV32I-NEXT: or a1, t1, a1 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: mv a6, t3 +; RV32I-NEXT: beqz s6, .LBB16_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a6, t6 +; RV32I-NEXT: .LBB16_10: +; RV32I-NEXT: or t1, t2, a7 +; RV32I-NEXT: or t2, t0, a1 +; RV32I-NEXT: bltu s6, t4, .LBB16_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: j .LBB16_13 +; RV32I-NEXT: .LBB16_12: +; RV32I-NEXT: srl a7, a5, s10 +; RV32I-NEXT: .LBB16_13: +; RV32I-NEXT: srl s0, t1, s10 +; RV32I-NEXT: sll a1, t2, a4 +; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu a4, t4, .LBB16_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sll a1, t1, a4 +; RV32I-NEXT: j .LBB16_16 +; RV32I-NEXT: .LBB16_15: +; RV32I-NEXT: sll s1, t1, a4 +; RV32I-NEXT: or a1, s0, a1 +; RV32I-NEXT: .LBB16_16: +; RV32I-NEXT: addi s7, a4, -64 +; RV32I-NEXT: mv s3, t2 +; RV32I-NEXT: beqz a4, .LBB16_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: .LBB16_18: +; RV32I-NEXT: neg a1, s7 +; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s7, t4, .LBB16_20 +; RV32I-NEXT: # %bb.19: +; RV32I-NEXT: li s2, 0 +; RV32I-NEXT: sll a1, t3, s7 +; RV32I-NEXT: mv s4, a5 +; RV32I-NEXT: bnez s7, .LBB16_21 +; RV32I-NEXT: j .LBB16_22 +; RV32I-NEXT: .LBB16_20: +; RV32I-NEXT: sll s2, t3, a4 +; RV32I-NEXT: srl a1, t3, a1 +; RV32I-NEXT: or a1, a1, s5 +; RV32I-NEXT: mv s4, a5 +; RV32I-NEXT: beqz s7, .LBB16_22 +; RV32I-NEXT: .LBB16_21: +; RV32I-NEXT: mv s4, a1 +; RV32I-NEXT: .LBB16_22: +; RV32I-NEXT: li a1, 128 +; RV32I-NEXT: bltu a4, s9, .LBB16_24 +; RV32I-NEXT: # %bb.23: +; RV32I-NEXT: sw zero, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: j .LBB16_25 +; RV32I-NEXT: .LBB16_24: +; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: or s2, a6, s1 +; RV32I-NEXT: or s4, a7, s3 +; RV32I-NEXT: .LBB16_25: +; RV32I-NEXT: sub ra, a1, a4 +; RV32I-NEXT: mv a7, t1 +; RV32I-NEXT: mv a6, t2 +; RV32I-NEXT: beqz a4, .LBB16_27 +; RV32I-NEXT: # %bb.26: +; RV32I-NEXT: mv a7, s2 +; RV32I-NEXT: mv a6, s4 +; RV32I-NEXT: .LBB16_27: +; RV32I-NEXT: neg s1, ra +; RV32I-NEXT: sll s2, t2, s1 +; RV32I-NEXT: bltu ra, t4, .LBB16_29 +; RV32I-NEXT: # %bb.28: +; RV32I-NEXT: srl a1, t2, ra +; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: bnez ra, .LBB16_30 +; RV32I-NEXT: j .LBB16_31 +; RV32I-NEXT: .LBB16_29: +; RV32I-NEXT: or a1, s0, s2 +; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: beqz ra, .LBB16_31 +; RV32I-NEXT: .LBB16_30: +; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: .LBB16_31: +; RV32I-NEXT: 
bltu ra, t4, .LBB16_33 +; RV32I-NEXT: # %bb.32: +; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: srl a1, a5, ra +; RV32I-NEXT: mv t5, t3 +; RV32I-NEXT: bnez ra, .LBB16_34 +; RV32I-NEXT: j .LBB16_35 +; RV32I-NEXT: .LBB16_33: +; RV32I-NEXT: srl a1, t2, s10 +; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: sll a1, a5, s1 +; RV32I-NEXT: or a1, t5, a1 +; RV32I-NEXT: mv t5, t3 +; RV32I-NEXT: beqz ra, .LBB16_35 +; RV32I-NEXT: .LBB16_34: +; RV32I-NEXT: mv t5, a1 +; RV32I-NEXT: .LBB16_35: +; RV32I-NEXT: sub s3, s9, ra +; RV32I-NEXT: bltu ra, t4, .LBB16_38 +; RV32I-NEXT: # %bb.36: +; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bgeu s3, t4, .LBB16_39 +; RV32I-NEXT: .LBB16_37: +; RV32I-NEXT: sll s1, t1, s1 +; RV32I-NEXT: neg a1, s3 +; RV32I-NEXT: srl a1, t1, a1 +; RV32I-NEXT: or a1, a1, s2 +; RV32I-NEXT: j .LBB16_40 +; RV32I-NEXT: .LBB16_38: +; RV32I-NEXT: srl a1, a5, s10 +; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s3, t4, .LBB16_37 +; RV32I-NEXT: .LBB16_39: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sll a1, t1, s3 +; RV32I-NEXT: .LBB16_40: +; RV32I-NEXT: addi s4, ra, -64 +; RV32I-NEXT: mv s2, t2 +; RV32I-NEXT: beqz s3, .LBB16_42 +; RV32I-NEXT: # %bb.41: +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: .LBB16_42: +; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s5, a7 +; RV32I-NEXT: bltu s4, t4, .LBB16_44 +; RV32I-NEXT: # %bb.43: +; RV32I-NEXT: srl t0, t2, s4 +; RV32I-NEXT: j .LBB16_45 +; RV32I-NEXT: .LBB16_44: +; RV32I-NEXT: srl a1, t1, ra +; RV32I-NEXT: neg t0, s4 +; RV32I-NEXT: sll t0, t2, t0 +; RV32I-NEXT: or t0, a1, t0 +; RV32I-NEXT: .LBB16_45: +; RV32I-NEXT: mv s0, s10 +; RV32I-NEXT: mv a7, a6 +; RV32I-NEXT: lbu s8, 19(a0) +; RV32I-NEXT: lbu a1, 23(a0) +; RV32I-NEXT: mv s3, t1 +; RV32I-NEXT: beqz s4, .LBB16_47 +; RV32I-NEXT: # %bb.46: +; RV32I-NEXT: mv s3, t0 +; RV32I-NEXT: .LBB16_47: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: lbu s10, 17(a0) +; RV32I-NEXT: lbu t0, 18(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu t6, 22(a0) +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: li a3, 64 +; RV32I-NEXT: bltu s4, t4, .LBB16_49 +; RV32I-NEXT: # %bb.48: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: j .LBB16_50 +; RV32I-NEXT: .LBB16_49: +; RV32I-NEXT: srl s4, t2, ra +; RV32I-NEXT: .LBB16_50: +; RV32I-NEXT: or s11, s8, t0 +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t6, a1, t6 +; RV32I-NEXT: bgeu ra, a3, .LBB16_52 +; RV32I-NEXT: # %bb.51: +; RV32I-NEXT: or s3, t5, s1 +; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s4, a1, s2 +; RV32I-NEXT: .LBB16_52: +; RV32I-NEXT: or a1, s10, t0 +; RV32I-NEXT: slli s11, s11, 16 +; RV32I-NEXT: or t0, s9, s8 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: mv t5, t3 +; RV32I-NEXT: mv s1, a5 +; RV32I-NEXT: mv a3, a6 +; RV32I-NEXT: beqz ra, .LBB16_54 +; RV32I-NEXT: # %bb.53: +; RV32I-NEXT: mv t5, s3 +; RV32I-NEXT: mv s1, s4 +; RV32I-NEXT: .LBB16_54: +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or s2, s11, a1 +; RV32I-NEXT: or s1, t6, t0 +; RV32I-NEXT: li a1, 64 +; RV32I-NEXT: mv a6, a7 +; RV32I-NEXT: mv a7, s0 +; RV32I-NEXT: bltu ra, a1, .LBB16_56 +; RV32I-NEXT: # %bb.55: +; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: .LBB16_56: +; RV32I-NEXT: srl s3, s2, a7 +; RV32I-NEXT: sll ra, s1, a4 +; RV32I-NEXT: mv a7, s5 +; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill 
+; RV32I-NEXT: bltu a4, t4, .LBB16_58 +; RV32I-NEXT: # %bb.57: +; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: sll a1, s2, a4 +; RV32I-NEXT: j .LBB16_59 +; RV32I-NEXT: .LBB16_58: +; RV32I-NEXT: sll a1, s2, a4 +; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a1, s3, ra +; RV32I-NEXT: .LBB16_59: +; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: lbu t6, 31(a0) +; RV32I-NEXT: mv t5, s1 +; RV32I-NEXT: beqz a4, .LBB16_61 +; RV32I-NEXT: # %bb.60: +; RV32I-NEXT: mv t5, a1 +; RV32I-NEXT: .LBB16_61: +; RV32I-NEXT: lbu s8, 25(a0) +; RV32I-NEXT: lbu s4, 26(a0) +; RV32I-NEXT: lbu s11, 29(a0) +; RV32I-NEXT: lbu s10, 30(a0) +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: bltu s6, t4, .LBB16_63 +; RV32I-NEXT: # %bb.62: +; RV32I-NEXT: srl t0, s1, s6 +; RV32I-NEXT: j .LBB16_64 +; RV32I-NEXT: .LBB16_63: +; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: sll a1, s1, a1 +; RV32I-NEXT: or t0, s3, a1 +; RV32I-NEXT: .LBB16_64: +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: lbu s3, 24(a0) +; RV32I-NEXT: lbu a1, 28(a0) +; RV32I-NEXT: or s4, s9, s4 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t6, t6, s10 +; RV32I-NEXT: mv s9, s2 +; RV32I-NEXT: beqz s6, .LBB16_66 +; RV32I-NEXT: # %bb.65: +; RV32I-NEXT: mv s9, t0 +; RV32I-NEXT: .LBB16_66: +; RV32I-NEXT: or a0, s8, s3 +; RV32I-NEXT: slli t0, s4, 16 +; RV32I-NEXT: or a1, s11, a1 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: bltu s6, t4, .LBB16_68 +; RV32I-NEXT: # %bb.67: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: j .LBB16_69 +; RV32I-NEXT: .LBB16_68: +; RV32I-NEXT: srl s4, s1, s0 +; RV32I-NEXT: .LBB16_69: +; RV32I-NEXT: li s11, 64 +; RV32I-NEXT: or s6, t0, a0 +; RV32I-NEXT: or a0, t6, a1 +; RV32I-NEXT: bltu a4, t4, .LBB16_71 +; RV32I-NEXT: # %bb.70: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a1, s6, a4 +; RV32I-NEXT: mv s10, a0 +; RV32I-NEXT: bnez a4, .LBB16_72 +; RV32I-NEXT: j .LBB16_73 +; RV32I-NEXT: .LBB16_71: +; RV32I-NEXT: sll s3, s6, a4 +; RV32I-NEXT: srl a1, s6, s0 +; RV32I-NEXT: sll t0, a0, a4 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: mv s10, a0 +; RV32I-NEXT: beqz a4, .LBB16_73 +; RV32I-NEXT: .LBB16_72: +; RV32I-NEXT: mv s10, a1 +; RV32I-NEXT: .LBB16_73: +; RV32I-NEXT: bltu s7, t4, .LBB16_75 +; RV32I-NEXT: # %bb.74: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: sll a1, s2, s7 +; RV32I-NEXT: mv s0, s1 +; RV32I-NEXT: bnez s7, .LBB16_76 +; RV32I-NEXT: j .LBB16_77 +; RV32I-NEXT: .LBB16_75: +; RV32I-NEXT: sll s5, s2, a4 +; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: srl a1, s2, a1 +; RV32I-NEXT: or a1, a1, ra +; RV32I-NEXT: mv s0, s1 +; RV32I-NEXT: beqz s7, .LBB16_77 +; RV32I-NEXT: .LBB16_76: +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: .LBB16_77: +; RV32I-NEXT: bltu a4, s11, .LBB16_79 +; RV32I-NEXT: # %bb.78: +; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: li t5, 0 +; RV32I-NEXT: j .LBB16_80 +; RV32I-NEXT: .LBB16_79: +; RV32I-NEXT: or s5, s9, s3 +; RV32I-NEXT: or s0, s4, s10 +; RV32I-NEXT: .LBB16_80: +; RV32I-NEXT: addi s9, a4, -128 +; RV32I-NEXT: mv s7, s6 +; RV32I-NEXT: mv s8, a0 +; RV32I-NEXT: beqz a4, .LBB16_82 +; RV32I-NEXT: # %bb.81: +; RV32I-NEXT: mv s7, s5 +; RV32I-NEXT: mv s8, s0 +; RV32I-NEXT: .LBB16_82: +; RV32I-NEXT: neg s3, s9 +; RV32I-NEXT: srl s0, t3, s3 +; RV32I-NEXT: bltu s9, t4, .LBB16_84 +; RV32I-NEXT: # %bb.83: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: sll a1, t3, s9 +; RV32I-NEXT: j .LBB16_85 +; RV32I-NEXT: .LBB16_84: +; RV32I-NEXT: sll s5, t3, a4 +; RV32I-NEXT: lw a1, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a1, s0, a1 +; RV32I-NEXT: 
.LBB16_85: +; RV32I-NEXT: sub s4, s11, s9 +; RV32I-NEXT: mv t6, a5 +; RV32I-NEXT: beqz s9, .LBB16_87 +; RV32I-NEXT: # %bb.86: +; RV32I-NEXT: mv t6, a1 +; RV32I-NEXT: .LBB16_87: +; RV32I-NEXT: bltu s4, t4, .LBB16_89 +; RV32I-NEXT: # %bb.88: +; RV32I-NEXT: srl a1, a5, s4 +; RV32I-NEXT: mv s0, t3 +; RV32I-NEXT: bnez s4, .LBB16_90 +; RV32I-NEXT: j .LBB16_91 +; RV32I-NEXT: .LBB16_89: +; RV32I-NEXT: neg a1, s4 +; RV32I-NEXT: sll a1, a5, a1 +; RV32I-NEXT: or a1, s0, a1 +; RV32I-NEXT: mv s0, t3 +; RV32I-NEXT: beqz s4, .LBB16_91 +; RV32I-NEXT: .LBB16_90: +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: .LBB16_91: +; RV32I-NEXT: bltu s4, t4, .LBB16_94 +; RV32I-NEXT: # %bb.92: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: li ra, 64 +; RV32I-NEXT: bgeu s9, t4, .LBB16_95 +; RV32I-NEXT: .LBB16_93: +; RV32I-NEXT: sll s10, t1, a4 +; RV32I-NEXT: srl a1, t1, s3 +; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: j .LBB16_96 +; RV32I-NEXT: .LBB16_94: +; RV32I-NEXT: srl s4, a5, s3 +; RV32I-NEXT: li ra, 64 +; RV32I-NEXT: bltu s9, t4, .LBB16_93 +; RV32I-NEXT: .LBB16_95: +; RV32I-NEXT: li s10, 0 +; RV32I-NEXT: sll a1, t1, s9 +; RV32I-NEXT: .LBB16_96: +; RV32I-NEXT: addi s11, s9, -64 +; RV32I-NEXT: mv s3, t2 +; RV32I-NEXT: beqz s9, .LBB16_98 +; RV32I-NEXT: # %bb.97: +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: .LBB16_98: +; RV32I-NEXT: bltu s11, t4, .LBB16_100 +; RV32I-NEXT: # %bb.99: +; RV32I-NEXT: li t4, 0 +; RV32I-NEXT: sll a1, t3, s11 +; RV32I-NEXT: bnez s11, .LBB16_101 +; RV32I-NEXT: j .LBB16_102 +; RV32I-NEXT: .LBB16_100: +; RV32I-NEXT: sll t4, t3, s9 +; RV32I-NEXT: neg a1, s11 +; RV32I-NEXT: srl a1, t3, a1 +; RV32I-NEXT: sll t0, a5, s9 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: beqz s11, .LBB16_102 +; RV32I-NEXT: .LBB16_101: +; RV32I-NEXT: mv a5, a1 +; RV32I-NEXT: .LBB16_102: +; RV32I-NEXT: bltu s9, ra, .LBB16_104 +; RV32I-NEXT: # %bb.103: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: li a1, 128 +; RV32I-NEXT: bnez s9, .LBB16_105 +; RV32I-NEXT: j .LBB16_106 +; RV32I-NEXT: .LBB16_104: +; RV32I-NEXT: or t4, s0, s10 +; RV32I-NEXT: or a5, s4, s3 +; RV32I-NEXT: li a1, 128 +; RV32I-NEXT: beqz s9, .LBB16_106 +; RV32I-NEXT: .LBB16_105: +; RV32I-NEXT: mv t1, t4 +; RV32I-NEXT: mv t2, a5 +; RV32I-NEXT: .LBB16_106: +; RV32I-NEXT: bltu a4, a1, .LBB16_108 +; RV32I-NEXT: # %bb.107: +; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: bnez a4, .LBB16_109 +; RV32I-NEXT: j .LBB16_110 +; RV32I-NEXT: .LBB16_108: +; RV32I-NEXT: lw a1, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s5, a1, a5 +; RV32I-NEXT: lw a1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t6, a1, t5 +; RV32I-NEXT: lw a1, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t1, a1, s7 +; RV32I-NEXT: lw a1, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t2, a1, s8 +; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: beqz a4, .LBB16_110 +; RV32I-NEXT: .LBB16_109: +; RV32I-NEXT: mv s2, s5 +; RV32I-NEXT: mv s1, t6 +; RV32I-NEXT: mv s6, t1 +; RV32I-NEXT: mv a0, t2 +; RV32I-NEXT: .LBB16_110: +; RV32I-NEXT: srli a4, ra, 16 +; RV32I-NEXT: lui t2, 16 +; RV32I-NEXT: srli t1, ra, 24 +; RV32I-NEXT: srli a5, a3, 16 +; RV32I-NEXT: srli t4, a3, 24 +; RV32I-NEXT: srli t0, a7, 16 +; RV32I-NEXT: srli s0, a7, 24 +; RV32I-NEXT: srli t3, a6, 16 +; RV32I-NEXT: srli s3, a6, 24 +; RV32I-NEXT: srli t6, s2, 16 +; RV32I-NEXT: srli a1, s2, 24 +; RV32I-NEXT: srli t5, s1, 16 +; RV32I-NEXT: srli s5, s1, 24 +; RV32I-NEXT: srli s4, 
s6, 16 +; RV32I-NEXT: srli s7, s6, 24 +; RV32I-NEXT: srli s8, a0, 16 +; RV32I-NEXT: srli s9, a0, 24 +; RV32I-NEXT: addi t2, t2, -1 +; RV32I-NEXT: and s10, ra, t2 +; RV32I-NEXT: and s11, a3, t2 +; RV32I-NEXT: srli s10, s10, 8 +; RV32I-NEXT: sb ra, 0(a2) +; RV32I-NEXT: sb s10, 1(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb t1, 3(a2) +; RV32I-NEXT: and a4, a7, t2 +; RV32I-NEXT: srli t1, s11, 8 +; RV32I-NEXT: sb a3, 4(a2) +; RV32I-NEXT: sb t1, 5(a2) +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb t4, 7(a2) +; RV32I-NEXT: and a3, a6, t2 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a7, 8(a2) +; RV32I-NEXT: sb a4, 9(a2) +; RV32I-NEXT: sb t0, 10(a2) +; RV32I-NEXT: sb s0, 11(a2) +; RV32I-NEXT: and a4, s2, t2 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a6, 12(a2) +; RV32I-NEXT: sb a3, 13(a2) +; RV32I-NEXT: sb t3, 14(a2) +; RV32I-NEXT: sb s3, 15(a2) +; RV32I-NEXT: and a3, s1, t2 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb s2, 16(a2) +; RV32I-NEXT: sb a4, 17(a2) +; RV32I-NEXT: sb t6, 18(a2) +; RV32I-NEXT: sb a1, 19(a2) +; RV32I-NEXT: and a1, s6, t2 +; RV32I-NEXT: and a4, a0, t2 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb s1, 20(a2) +; RV32I-NEXT: sb a3, 21(a2) +; RV32I-NEXT: sb t5, 22(a2) +; RV32I-NEXT: sb s5, 23(a2) +; RV32I-NEXT: sb s6, 24(a2) +; RV32I-NEXT: sb a1, 25(a2) +; RV32I-NEXT: sb s4, 26(a2) +; RV32I-NEXT: sb s7, 27(a2) +; RV32I-NEXT: sb a0, 28(a2) +; RV32I-NEXT: sb a4, 29(a2) +; RV32I-NEXT: sb s8, 30(a2) +; RV32I-NEXT: sb s9, 31(a2) +; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 96 +; RV32I-NEXT: ret + %src = load i256, ptr %src.ptr, align 1 + %wordOff = load i256, ptr %wordOff.ptr, align 1 + %bitOff = shl i256 %wordOff, 5 + %res = shl i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: shl_32bytes_dwordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -112 +; RV64I-NEXT: sd ra, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 56(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; 
RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 12(a0) +; RV64I-NEXT: lbu s0, 13(a0) +; RV64I-NEXT: lbu s1, 14(a0) +; RV64I-NEXT: lbu s2, 15(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or s3, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: or a6, t4, t3 +; RV64I-NEXT: lbu t0, 0(a1) +; RV64I-NEXT: lbu t1, 1(a1) +; RV64I-NEXT: lbu t2, 2(a1) +; RV64I-NEXT: lbu t3, 3(a1) +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s2, s2, 8 +; RV64I-NEXT: or t6, t6, t5 +; RV64I-NEXT: or s0, s0, a4 +; RV64I-NEXT: or s1, s2, s1 +; RV64I-NEXT: lbu a4, 4(a1) +; RV64I-NEXT: lbu t4, 5(a1) +; RV64I-NEXT: lbu t5, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: or t1, t3, t2 +; RV64I-NEXT: or t2, t4, a4 +; RV64I-NEXT: or a1, a1, t5 +; RV64I-NEXT: lbu t5, 19(a0) +; RV64I-NEXT: lbu t4, 21(a0) +; RV64I-NEXT: lbu a4, 22(a0) +; RV64I-NEXT: lbu t3, 23(a0) +; RV64I-NEXT: slli s3, s3, 16 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t6, t6, 16 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: or s4, s3, a3 +; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a6, t6, a6 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: lbu s1, 27(a0) +; RV64I-NEXT: lbu t6, 29(a0) +; RV64I-NEXT: lbu a3, 30(a0) +; RV64I-NEXT: lbu s2, 31(a0) +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or s5, t1, t0 +; RV64I-NEXT: li a7, 128 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: li t0, 64 +; RV64I-NEXT: slli s3, t3, 8 +; RV64I-NEXT: slli s2, s2, 8 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: slli s0, s0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or t1, a5, s4 +; RV64I-NEXT: or a5, s0, a6 +; RV64I-NEXT: or a6, a1, s5 +; RV64I-NEXT: slli a6, a6, 6 +; RV64I-NEXT: sub t2, a6, t0 +; RV64I-NEXT: negw t3, a6 +; RV64I-NEXT: srl s0, t1, t3 +; RV64I-NEXT: bltu a6, t0, .LBB17_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: sll s4, t1, t2 +; RV64I-NEXT: j .LBB17_3 +; RV64I-NEXT: .LBB17_2: +; RV64I-NEXT: sll a1, t1, a6 +; RV64I-NEXT: sll s4, a5, a6 +; RV64I-NEXT: or s4, s0, s4 +; RV64I-NEXT: .LBB17_3: +; RV64I-NEXT: slli t5, t5, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or s3, s3, a4 +; RV64I-NEXT: lbu ra, 17(a0) +; RV64I-NEXT: lbu s11, 18(a0) +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s5, 25(a0) +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: lbu s7, 26(a0) +; RV64I-NEXT: lbu s6, 28(a0) +; RV64I-NEXT: slli s10, t6, 8 +; RV64I-NEXT: or s9, s2, a3 +; RV64I-NEXT: sub a4, a7, a6 +; RV64I-NEXT: mv a3, a5 +; RV64I-NEXT: beqz a6, .LBB17_5 +; RV64I-NEXT: # %bb.4: +; RV64I-NEXT: mv a3, s4 +; RV64I-NEXT: .LBB17_5: +; RV64I-NEXT: slli t6, ra, 8 +; RV64I-NEXT: or t5, t5, s11 +; RV64I-NEXT: or t4, t4, s8 +; RV64I-NEXT: slli s3, s3, 16 +; RV64I-NEXT: lbu s8, 16(a0) +; RV64I-NEXT: lbu a0, 24(a0) +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: or s2, s1, s7 +; RV64I-NEXT: or s1, s10, s6 +; RV64I-NEXT: slli s4, s9, 16 +; RV64I-NEXT: bltu a4, t0, .LBB17_7 +; RV64I-NEXT: # %bb.6: +; RV64I-NEXT: sub s0, a4, t0 +; RV64I-NEXT: srl s0, a5, s0 +; RV64I-NEXT: j .LBB17_8 +; RV64I-NEXT: .LBB17_7: +; RV64I-NEXT: negw s6, a4 +; RV64I-NEXT: sll s6, a5, s6 +; RV64I-NEXT: or s0, s0, s6 +; 
RV64I-NEXT: .LBB17_8: +; RV64I-NEXT: or t6, t6, s8 +; RV64I-NEXT: slli s6, t5, 16 +; RV64I-NEXT: or s3, s3, t4 +; RV64I-NEXT: or t5, s5, a0 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: or s1, s4, s1 +; RV64I-NEXT: mv t4, t1 +; RV64I-NEXT: beqz a4, .LBB17_10 +; RV64I-NEXT: # %bb.9: +; RV64I-NEXT: mv t4, s0 +; RV64I-NEXT: .LBB17_10: +; RV64I-NEXT: or a0, s6, t6 +; RV64I-NEXT: slli s0, s3, 32 +; RV64I-NEXT: or t6, s2, t5 +; RV64I-NEXT: slli s1, s1, 32 +; RV64I-NEXT: bltu a4, t0, .LBB17_12 +; RV64I-NEXT: # %bb.11: +; RV64I-NEXT: li t5, 0 +; RV64I-NEXT: j .LBB17_13 +; RV64I-NEXT: .LBB17_12: +; RV64I-NEXT: srl t5, a5, t3 +; RV64I-NEXT: .LBB17_13: +; RV64I-NEXT: or a4, s0, a0 +; RV64I-NEXT: or a0, s1, t6 +; RV64I-NEXT: bltu a6, t0, .LBB17_15 +; RV64I-NEXT: # %bb.14: +; RV64I-NEXT: li t6, 0 +; RV64I-NEXT: sll t2, a4, t2 +; RV64I-NEXT: j .LBB17_16 +; RV64I-NEXT: .LBB17_15: +; RV64I-NEXT: sll t6, a4, a6 +; RV64I-NEXT: srl t2, a4, t3 +; RV64I-NEXT: sll t3, a0, a6 +; RV64I-NEXT: or t2, t2, t3 +; RV64I-NEXT: .LBB17_16: +; RV64I-NEXT: sub s0, a6, a7 +; RV64I-NEXT: mv t3, a0 +; RV64I-NEXT: beqz a6, .LBB17_18 +; RV64I-NEXT: # %bb.17: +; RV64I-NEXT: mv t3, t2 +; RV64I-NEXT: .LBB17_18: +; RV64I-NEXT: bltu s0, t0, .LBB17_20 +; RV64I-NEXT: # %bb.19: +; RV64I-NEXT: li t2, 0 +; RV64I-NEXT: sub t0, s0, t0 +; RV64I-NEXT: sll t0, t1, t0 +; RV64I-NEXT: bnez s0, .LBB17_21 +; RV64I-NEXT: j .LBB17_22 +; RV64I-NEXT: .LBB17_20: +; RV64I-NEXT: sll t2, t1, s0 +; RV64I-NEXT: negw t0, s0 +; RV64I-NEXT: srl t0, t1, t0 +; RV64I-NEXT: sll t1, a5, s0 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: beqz s0, .LBB17_22 +; RV64I-NEXT: .LBB17_21: +; RV64I-NEXT: mv a5, t0 +; RV64I-NEXT: .LBB17_22: +; RV64I-NEXT: bltu a6, a7, .LBB17_24 +; RV64I-NEXT: # %bb.23: +; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: li a3, 0 +; RV64I-NEXT: bnez a6, .LBB17_25 +; RV64I-NEXT: j .LBB17_26 +; RV64I-NEXT: .LBB17_24: +; RV64I-NEXT: or t2, t4, t6 +; RV64I-NEXT: or a5, t5, t3 +; RV64I-NEXT: beqz a6, .LBB17_26 +; RV64I-NEXT: .LBB17_25: +; RV64I-NEXT: mv a4, t2 +; RV64I-NEXT: mv a0, a5 +; RV64I-NEXT: .LBB17_26: +; RV64I-NEXT: srli a5, a1, 32 +; RV64I-NEXT: srliw a6, a1, 16 +; RV64I-NEXT: lui t2, 16 +; RV64I-NEXT: srliw t1, a1, 24 +; RV64I-NEXT: srli t0, a1, 48 +; RV64I-NEXT: srli t5, a1, 56 +; RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srliw t4, a3, 16 +; RV64I-NEXT: srliw s0, a3, 24 +; RV64I-NEXT: srli t6, a3, 48 +; RV64I-NEXT: srli s3, a3, 56 +; RV64I-NEXT: srli t3, a4, 32 +; RV64I-NEXT: srliw s2, a4, 16 +; RV64I-NEXT: srliw s6, a4, 24 +; RV64I-NEXT: srli s4, a4, 48 +; RV64I-NEXT: srli s7, a4, 56 +; RV64I-NEXT: srli s1, a0, 32 +; RV64I-NEXT: srliw s5, a0, 16 +; RV64I-NEXT: srliw s8, a0, 24 +; RV64I-NEXT: srli s9, a0, 48 +; RV64I-NEXT: srli s10, a0, 56 +; RV64I-NEXT: addi t2, t2, -1 +; RV64I-NEXT: and s11, a1, t2 +; RV64I-NEXT: srli s11, s11, 8 +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: sb s11, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb t1, 3(a2) +; RV64I-NEXT: and a1, a5, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: and a1, a3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb t4, 10(a2) +; RV64I-NEXT: sb s0, 11(a2) +; RV64I-NEXT: and a1, a7, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: sb t6, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) +; RV64I-NEXT: and a1, a4, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a4, 16(a2) +; 
RV64I-NEXT: sb a1, 17(a2) +; RV64I-NEXT: sb s2, 18(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: and a1, t3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb t3, 20(a2) +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: sb s4, 22(a2) +; RV64I-NEXT: sb s7, 23(a2) +; RV64I-NEXT: and a1, a0, t2 +; RV64I-NEXT: and a3, s1, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 24(a2) +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: sb s5, 26(a2) +; RV64I-NEXT: sb s8, 27(a2) +; RV64I-NEXT: sb s1, 28(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb s9, 30(a2) +; RV64I-NEXT: sb s10, 31(a2) +; RV64I-NEXT: ld ra, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 112 +; RV64I-NEXT: ret +; +; RV32I-LABEL: shl_32bytes_dwordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -96 +; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; RV32I-NEXT: lbu t0, 4(a0) +; RV32I-NEXT: lbu t1, 5(a0) +; RV32I-NEXT: lbu t2, 6(a0) +; RV32I-NEXT: lbu t3, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a3 +; RV32I-NEXT: lbu a6, 11(a0) +; RV32I-NEXT: lbu a3, 15(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: lbu t1, 1(a1) +; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: lbu t3, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: li s9, 64 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, t3 +; RV32I-NEXT: li t4, 32 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or t3, a5, a4 +; RV32I-NEXT: or a5, t2, a7 +; RV32I-NEXT: or a4, a1, t0 +; RV32I-NEXT: slli a4, a4, 6 +; RV32I-NEXT: neg s10, a4 +; RV32I-NEXT: srl t5, t3, s10 +; RV32I-NEXT: sll s5, a5, a4 +; RV32I-NEXT: bltu a4, t4, .LBB17_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: sll a7, t3, a4 +; RV32I-NEXT: j .LBB17_3 +; RV32I-NEXT: .LBB17_2: +; RV32I-NEXT: sll s8, t3, a4 +; RV32I-NEXT: or a7, t5, s5 +; RV32I-NEXT: .LBB17_3: +; RV32I-NEXT: lbu t2, 9(a0) +; RV32I-NEXT: lbu a1, 10(a0) +; RV32I-NEXT: lbu t1, 13(a0) +; 
RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t6, a3, 8 +; RV32I-NEXT: sub s6, s9, a4 +; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: beqz a4, .LBB17_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: .LBB17_5: +; RV32I-NEXT: slli a7, t2, 8 +; RV32I-NEXT: or a6, a6, a1 +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu a1, 12(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t0, t6, t0 +; RV32I-NEXT: neg t6, s6 +; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s6, t4, .LBB17_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srl t6, a5, s6 +; RV32I-NEXT: j .LBB17_8 +; RV32I-NEXT: .LBB17_7: +; RV32I-NEXT: sll t6, a5, t6 +; RV32I-NEXT: or t6, t5, t6 +; RV32I-NEXT: .LBB17_8: +; RV32I-NEXT: or a7, a7, t2 +; RV32I-NEXT: slli t2, a6, 16 +; RV32I-NEXT: or a1, t1, a1 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: mv a6, t3 +; RV32I-NEXT: beqz s6, .LBB17_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a6, t6 +; RV32I-NEXT: .LBB17_10: +; RV32I-NEXT: or t1, t2, a7 +; RV32I-NEXT: or t2, t0, a1 +; RV32I-NEXT: bltu s6, t4, .LBB17_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: j .LBB17_13 +; RV32I-NEXT: .LBB17_12: +; RV32I-NEXT: srl a7, a5, s10 +; RV32I-NEXT: .LBB17_13: +; RV32I-NEXT: srl s0, t1, s10 +; RV32I-NEXT: sll a1, t2, a4 +; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu a4, t4, .LBB17_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sll a1, t1, a4 +; RV32I-NEXT: j .LBB17_16 +; RV32I-NEXT: .LBB17_15: +; RV32I-NEXT: sll s1, t1, a4 +; RV32I-NEXT: or a1, s0, a1 +; RV32I-NEXT: .LBB17_16: +; RV32I-NEXT: addi s7, a4, -64 +; RV32I-NEXT: mv s3, t2 +; RV32I-NEXT: beqz a4, .LBB17_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: .LBB17_18: +; RV32I-NEXT: neg a1, s7 +; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s7, t4, .LBB17_20 +; RV32I-NEXT: # %bb.19: +; RV32I-NEXT: li s2, 0 +; RV32I-NEXT: sll a1, t3, s7 +; RV32I-NEXT: mv s4, a5 +; RV32I-NEXT: bnez s7, .LBB17_21 +; RV32I-NEXT: j .LBB17_22 +; RV32I-NEXT: .LBB17_20: +; RV32I-NEXT: sll s2, t3, a4 +; RV32I-NEXT: srl a1, t3, a1 +; RV32I-NEXT: or a1, a1, s5 +; RV32I-NEXT: mv s4, a5 +; RV32I-NEXT: beqz s7, .LBB17_22 +; RV32I-NEXT: .LBB17_21: +; RV32I-NEXT: mv s4, a1 +; RV32I-NEXT: .LBB17_22: +; RV32I-NEXT: li a1, 128 +; RV32I-NEXT: bltu a4, s9, .LBB17_24 +; RV32I-NEXT: # %bb.23: +; RV32I-NEXT: sw zero, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: j .LBB17_25 +; RV32I-NEXT: .LBB17_24: +; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: or s2, a6, s1 +; RV32I-NEXT: or s4, a7, s3 +; RV32I-NEXT: .LBB17_25: +; RV32I-NEXT: sub ra, a1, a4 +; RV32I-NEXT: mv a7, t1 +; RV32I-NEXT: mv a6, t2 +; RV32I-NEXT: beqz a4, .LBB17_27 +; RV32I-NEXT: # %bb.26: +; RV32I-NEXT: mv a7, s2 +; RV32I-NEXT: mv a6, s4 +; RV32I-NEXT: .LBB17_27: +; RV32I-NEXT: neg s1, ra +; RV32I-NEXT: sll s2, t2, s1 +; RV32I-NEXT: bltu ra, t4, .LBB17_29 +; RV32I-NEXT: # %bb.28: +; RV32I-NEXT: srl a1, t2, ra +; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: bnez ra, .LBB17_30 +; RV32I-NEXT: j .LBB17_31 +; RV32I-NEXT: .LBB17_29: +; RV32I-NEXT: or a1, s0, s2 +; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: beqz ra, .LBB17_31 +; RV32I-NEXT: .LBB17_30: +; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: .LBB17_31: +; RV32I-NEXT: bltu ra, t4, .LBB17_33 +; RV32I-NEXT: # %bb.32: +; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: srl a1, a5, ra +; RV32I-NEXT: mv t5, t3 +; 
RV32I-NEXT: bnez ra, .LBB17_34 +; RV32I-NEXT: j .LBB17_35 +; RV32I-NEXT: .LBB17_33: +; RV32I-NEXT: srl a1, t2, s10 +; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: sll a1, a5, s1 +; RV32I-NEXT: or a1, t5, a1 +; RV32I-NEXT: mv t5, t3 +; RV32I-NEXT: beqz ra, .LBB17_35 +; RV32I-NEXT: .LBB17_34: +; RV32I-NEXT: mv t5, a1 +; RV32I-NEXT: .LBB17_35: +; RV32I-NEXT: sub s3, s9, ra +; RV32I-NEXT: bltu ra, t4, .LBB17_38 +; RV32I-NEXT: # %bb.36: +; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bgeu s3, t4, .LBB17_39 +; RV32I-NEXT: .LBB17_37: +; RV32I-NEXT: sll s1, t1, s1 +; RV32I-NEXT: neg a1, s3 +; RV32I-NEXT: srl a1, t1, a1 +; RV32I-NEXT: or a1, a1, s2 +; RV32I-NEXT: j .LBB17_40 +; RV32I-NEXT: .LBB17_38: +; RV32I-NEXT: srl a1, a5, s10 +; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s3, t4, .LBB17_37 +; RV32I-NEXT: .LBB17_39: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sll a1, t1, s3 +; RV32I-NEXT: .LBB17_40: +; RV32I-NEXT: addi s4, ra, -64 +; RV32I-NEXT: mv s2, t2 +; RV32I-NEXT: beqz s3, .LBB17_42 +; RV32I-NEXT: # %bb.41: +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: .LBB17_42: +; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s5, a7 +; RV32I-NEXT: bltu s4, t4, .LBB17_44 +; RV32I-NEXT: # %bb.43: +; RV32I-NEXT: srl t0, t2, s4 +; RV32I-NEXT: j .LBB17_45 +; RV32I-NEXT: .LBB17_44: +; RV32I-NEXT: srl a1, t1, ra +; RV32I-NEXT: neg t0, s4 +; RV32I-NEXT: sll t0, t2, t0 +; RV32I-NEXT: or t0, a1, t0 +; RV32I-NEXT: .LBB17_45: +; RV32I-NEXT: mv s0, s10 +; RV32I-NEXT: mv a7, a6 +; RV32I-NEXT: lbu s8, 19(a0) +; RV32I-NEXT: lbu a1, 23(a0) +; RV32I-NEXT: mv s3, t1 +; RV32I-NEXT: beqz s4, .LBB17_47 +; RV32I-NEXT: # %bb.46: +; RV32I-NEXT: mv s3, t0 +; RV32I-NEXT: .LBB17_47: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: lbu s10, 17(a0) +; RV32I-NEXT: lbu t0, 18(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu t6, 22(a0) +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: li a3, 64 +; RV32I-NEXT: bltu s4, t4, .LBB17_49 +; RV32I-NEXT: # %bb.48: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: j .LBB17_50 +; RV32I-NEXT: .LBB17_49: +; RV32I-NEXT: srl s4, t2, ra +; RV32I-NEXT: .LBB17_50: +; RV32I-NEXT: or s11, s8, t0 +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: or t6, a1, t6 +; RV32I-NEXT: bgeu ra, a3, .LBB17_52 +; RV32I-NEXT: # %bb.51: +; RV32I-NEXT: or s3, t5, s1 +; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s4, a1, s2 +; RV32I-NEXT: .LBB17_52: +; RV32I-NEXT: or a1, s10, t0 +; RV32I-NEXT: slli s11, s11, 16 +; RV32I-NEXT: or t0, s9, s8 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: mv t5, t3 +; RV32I-NEXT: mv s1, a5 +; RV32I-NEXT: mv a3, a6 +; RV32I-NEXT: beqz ra, .LBB17_54 +; RV32I-NEXT: # %bb.53: +; RV32I-NEXT: mv t5, s3 +; RV32I-NEXT: mv s1, s4 +; RV32I-NEXT: .LBB17_54: +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: or s2, s11, a1 +; RV32I-NEXT: or s1, t6, t0 +; RV32I-NEXT: li a1, 64 +; RV32I-NEXT: mv a6, a7 +; RV32I-NEXT: mv a7, s0 +; RV32I-NEXT: bltu ra, a1, .LBB17_56 +; RV32I-NEXT: # %bb.55: +; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: .LBB17_56: +; RV32I-NEXT: srl s3, s2, a7 +; RV32I-NEXT: sll ra, s1, a4 +; RV32I-NEXT: mv a7, s5 +; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu a4, t4, .LBB17_58 +; RV32I-NEXT: # %bb.57: +; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: sll a1, s2, a4 +; 
RV32I-NEXT: j .LBB17_59 +; RV32I-NEXT: .LBB17_58: +; RV32I-NEXT: sll a1, s2, a4 +; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: or a1, s3, ra +; RV32I-NEXT: .LBB17_59: +; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: lbu t6, 31(a0) +; RV32I-NEXT: mv t5, s1 +; RV32I-NEXT: beqz a4, .LBB17_61 +; RV32I-NEXT: # %bb.60: +; RV32I-NEXT: mv t5, a1 +; RV32I-NEXT: .LBB17_61: +; RV32I-NEXT: lbu s8, 25(a0) +; RV32I-NEXT: lbu s4, 26(a0) +; RV32I-NEXT: lbu s11, 29(a0) +; RV32I-NEXT: lbu s10, 30(a0) +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: bltu s6, t4, .LBB17_63 +; RV32I-NEXT: # %bb.62: +; RV32I-NEXT: srl t0, s1, s6 +; RV32I-NEXT: j .LBB17_64 +; RV32I-NEXT: .LBB17_63: +; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: sll a1, s1, a1 +; RV32I-NEXT: or t0, s3, a1 +; RV32I-NEXT: .LBB17_64: +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: lbu s3, 24(a0) +; RV32I-NEXT: lbu a1, 28(a0) +; RV32I-NEXT: or s4, s9, s4 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t6, t6, s10 +; RV32I-NEXT: mv s9, s2 +; RV32I-NEXT: beqz s6, .LBB17_66 +; RV32I-NEXT: # %bb.65: +; RV32I-NEXT: mv s9, t0 +; RV32I-NEXT: .LBB17_66: +; RV32I-NEXT: or a0, s8, s3 +; RV32I-NEXT: slli t0, s4, 16 +; RV32I-NEXT: or a1, s11, a1 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: bltu s6, t4, .LBB17_68 +; RV32I-NEXT: # %bb.67: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: j .LBB17_69 +; RV32I-NEXT: .LBB17_68: +; RV32I-NEXT: srl s4, s1, s0 +; RV32I-NEXT: .LBB17_69: +; RV32I-NEXT: li s11, 64 +; RV32I-NEXT: or s6, t0, a0 +; RV32I-NEXT: or a0, t6, a1 +; RV32I-NEXT: bltu a4, t4, .LBB17_71 +; RV32I-NEXT: # %bb.70: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a1, s6, a4 +; RV32I-NEXT: mv s10, a0 +; RV32I-NEXT: bnez a4, .LBB17_72 +; RV32I-NEXT: j .LBB17_73 +; RV32I-NEXT: .LBB17_71: +; RV32I-NEXT: sll s3, s6, a4 +; RV32I-NEXT: srl a1, s6, s0 +; RV32I-NEXT: sll t0, a0, a4 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: mv s10, a0 +; RV32I-NEXT: beqz a4, .LBB17_73 +; RV32I-NEXT: .LBB17_72: +; RV32I-NEXT: mv s10, a1 +; RV32I-NEXT: .LBB17_73: +; RV32I-NEXT: bltu s7, t4, .LBB17_75 +; RV32I-NEXT: # %bb.74: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: sll a1, s2, s7 +; RV32I-NEXT: mv s0, s1 +; RV32I-NEXT: bnez s7, .LBB17_76 +; RV32I-NEXT: j .LBB17_77 +; RV32I-NEXT: .LBB17_75: +; RV32I-NEXT: sll s5, s2, a4 +; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: srl a1, s2, a1 +; RV32I-NEXT: or a1, a1, ra +; RV32I-NEXT: mv s0, s1 +; RV32I-NEXT: beqz s7, .LBB17_77 +; RV32I-NEXT: .LBB17_76: +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: .LBB17_77: +; RV32I-NEXT: bltu a4, s11, .LBB17_79 +; RV32I-NEXT: # %bb.78: +; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: li t5, 0 +; RV32I-NEXT: j .LBB17_80 +; RV32I-NEXT: .LBB17_79: +; RV32I-NEXT: or s5, s9, s3 +; RV32I-NEXT: or s0, s4, s10 +; RV32I-NEXT: .LBB17_80: +; RV32I-NEXT: addi s9, a4, -128 +; RV32I-NEXT: mv s7, s6 +; RV32I-NEXT: mv s8, a0 +; RV32I-NEXT: beqz a4, .LBB17_82 +; RV32I-NEXT: # %bb.81: +; RV32I-NEXT: mv s7, s5 +; RV32I-NEXT: mv s8, s0 +; RV32I-NEXT: .LBB17_82: +; RV32I-NEXT: neg s3, s9 +; RV32I-NEXT: srl s0, t3, s3 +; RV32I-NEXT: bltu s9, t4, .LBB17_84 +; RV32I-NEXT: # %bb.83: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: sll a1, t3, s9 +; RV32I-NEXT: j .LBB17_85 +; RV32I-NEXT: .LBB17_84: +; RV32I-NEXT: sll s5, t3, a4 +; RV32I-NEXT: lw a1, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a1, s0, a1 +; RV32I-NEXT: .LBB17_85: +; RV32I-NEXT: sub s4, s11, s9 +; RV32I-NEXT: mv t6, a5 +; RV32I-NEXT: beqz s9, .LBB17_87 +; RV32I-NEXT: # %bb.86: +; RV32I-NEXT: mv t6, 
a1 +; RV32I-NEXT: .LBB17_87: +; RV32I-NEXT: bltu s4, t4, .LBB17_89 +; RV32I-NEXT: # %bb.88: +; RV32I-NEXT: srl a1, a5, s4 +; RV32I-NEXT: mv s0, t3 +; RV32I-NEXT: bnez s4, .LBB17_90 +; RV32I-NEXT: j .LBB17_91 +; RV32I-NEXT: .LBB17_89: +; RV32I-NEXT: neg a1, s4 +; RV32I-NEXT: sll a1, a5, a1 +; RV32I-NEXT: or a1, s0, a1 +; RV32I-NEXT: mv s0, t3 +; RV32I-NEXT: beqz s4, .LBB17_91 +; RV32I-NEXT: .LBB17_90: +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: .LBB17_91: +; RV32I-NEXT: bltu s4, t4, .LBB17_94 +; RV32I-NEXT: # %bb.92: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: li ra, 64 +; RV32I-NEXT: bgeu s9, t4, .LBB17_95 +; RV32I-NEXT: .LBB17_93: +; RV32I-NEXT: sll s10, t1, a4 +; RV32I-NEXT: srl a1, t1, s3 +; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: j .LBB17_96 +; RV32I-NEXT: .LBB17_94: +; RV32I-NEXT: srl s4, a5, s3 +; RV32I-NEXT: li ra, 64 +; RV32I-NEXT: bltu s9, t4, .LBB17_93 +; RV32I-NEXT: .LBB17_95: +; RV32I-NEXT: li s10, 0 +; RV32I-NEXT: sll a1, t1, s9 +; RV32I-NEXT: .LBB17_96: +; RV32I-NEXT: addi s11, s9, -64 +; RV32I-NEXT: mv s3, t2 +; RV32I-NEXT: beqz s9, .LBB17_98 +; RV32I-NEXT: # %bb.97: +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: .LBB17_98: +; RV32I-NEXT: bltu s11, t4, .LBB17_100 +; RV32I-NEXT: # %bb.99: +; RV32I-NEXT: li t4, 0 +; RV32I-NEXT: sll a1, t3, s11 +; RV32I-NEXT: bnez s11, .LBB17_101 +; RV32I-NEXT: j .LBB17_102 +; RV32I-NEXT: .LBB17_100: +; RV32I-NEXT: sll t4, t3, s9 +; RV32I-NEXT: neg a1, s11 +; RV32I-NEXT: srl a1, t3, a1 +; RV32I-NEXT: sll t0, a5, s9 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: beqz s11, .LBB17_102 +; RV32I-NEXT: .LBB17_101: +; RV32I-NEXT: mv a5, a1 +; RV32I-NEXT: .LBB17_102: +; RV32I-NEXT: bltu s9, ra, .LBB17_104 +; RV32I-NEXT: # %bb.103: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: li t6, 0 +; RV32I-NEXT: li a1, 128 +; RV32I-NEXT: bnez s9, .LBB17_105 +; RV32I-NEXT: j .LBB17_106 +; RV32I-NEXT: .LBB17_104: +; RV32I-NEXT: or t4, s0, s10 +; RV32I-NEXT: or a5, s4, s3 +; RV32I-NEXT: li a1, 128 +; RV32I-NEXT: beqz s9, .LBB17_106 +; RV32I-NEXT: .LBB17_105: +; RV32I-NEXT: mv t1, t4 +; RV32I-NEXT: mv t2, a5 +; RV32I-NEXT: .LBB17_106: +; RV32I-NEXT: bltu a4, a1, .LBB17_108 +; RV32I-NEXT: # %bb.107: +; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: bnez a4, .LBB17_109 +; RV32I-NEXT: j .LBB17_110 +; RV32I-NEXT: .LBB17_108: +; RV32I-NEXT: lw a1, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a5, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s5, a1, a5 +; RV32I-NEXT: lw a1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t6, a1, t5 +; RV32I-NEXT: lw a1, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t1, a1, s7 +; RV32I-NEXT: lw a1, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t2, a1, s8 +; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: beqz a4, .LBB17_110 +; RV32I-NEXT: .LBB17_109: +; RV32I-NEXT: mv s2, s5 +; RV32I-NEXT: mv s1, t6 +; RV32I-NEXT: mv s6, t1 +; RV32I-NEXT: mv a0, t2 +; RV32I-NEXT: .LBB17_110: +; RV32I-NEXT: srli a4, ra, 16 +; RV32I-NEXT: lui t2, 16 +; RV32I-NEXT: srli t1, ra, 24 +; RV32I-NEXT: srli a5, a3, 16 +; RV32I-NEXT: srli t4, a3, 24 +; RV32I-NEXT: srli t0, a7, 16 +; RV32I-NEXT: srli s0, a7, 24 +; RV32I-NEXT: srli t3, a6, 16 +; RV32I-NEXT: srli s3, a6, 24 +; RV32I-NEXT: srli t6, s2, 16 +; RV32I-NEXT: srli a1, s2, 24 +; RV32I-NEXT: srli t5, s1, 16 +; RV32I-NEXT: srli s5, s1, 24 +; RV32I-NEXT: srli s4, s6, 16 +; RV32I-NEXT: srli s7, s6, 24 +; RV32I-NEXT: srli s8, a0, 16 +; RV32I-NEXT: srli s9, a0, 24 +; RV32I-NEXT: addi t2, t2, -1 +; RV32I-NEXT: 
and s10, ra, t2 +; RV32I-NEXT: and s11, a3, t2 +; RV32I-NEXT: srli s10, s10, 8 +; RV32I-NEXT: sb ra, 0(a2) +; RV32I-NEXT: sb s10, 1(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb t1, 3(a2) +; RV32I-NEXT: and a4, a7, t2 +; RV32I-NEXT: srli t1, s11, 8 +; RV32I-NEXT: sb a3, 4(a2) +; RV32I-NEXT: sb t1, 5(a2) +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb t4, 7(a2) +; RV32I-NEXT: and a3, a6, t2 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a7, 8(a2) +; RV32I-NEXT: sb a4, 9(a2) +; RV32I-NEXT: sb t0, 10(a2) +; RV32I-NEXT: sb s0, 11(a2) +; RV32I-NEXT: and a4, s2, t2 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a6, 12(a2) +; RV32I-NEXT: sb a3, 13(a2) +; RV32I-NEXT: sb t3, 14(a2) +; RV32I-NEXT: sb s3, 15(a2) +; RV32I-NEXT: and a3, s1, t2 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb s2, 16(a2) +; RV32I-NEXT: sb a4, 17(a2) +; RV32I-NEXT: sb t6, 18(a2) +; RV32I-NEXT: sb a1, 19(a2) +; RV32I-NEXT: and a1, s6, t2 +; RV32I-NEXT: and a4, a0, t2 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb s1, 20(a2) +; RV32I-NEXT: sb a3, 21(a2) +; RV32I-NEXT: sb t5, 22(a2) +; RV32I-NEXT: sb s5, 23(a2) +; RV32I-NEXT: sb s6, 24(a2) +; RV32I-NEXT: sb a1, 25(a2) +; RV32I-NEXT: sb s4, 26(a2) +; RV32I-NEXT: sb s7, 27(a2) +; RV32I-NEXT: sb a0, 28(a2) +; RV32I-NEXT: sb a4, 29(a2) +; RV32I-NEXT: sb s8, 30(a2) +; RV32I-NEXT: sb s9, 31(a2) +; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 96 +; RV32I-NEXT: ret + %src = load i256, ptr %src.ptr, align 1 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 6 + %res = shl i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: ashr_32bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -96 +; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; 
RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a4, t4, t3 +; RV64I-NEXT: or a6, t6, t5 +; RV64I-NEXT: or t0, s1, s0 +; RV64I-NEXT: lbu t5, 24(a0) +; RV64I-NEXT: lbu t6, 25(a0) +; RV64I-NEXT: lbu s0, 26(a0) +; RV64I-NEXT: lbu s1, 27(a0) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or t4, s3, s2 +; RV64I-NEXT: or t2, s5, s4 +; RV64I-NEXT: or t3, s7, s6 +; RV64I-NEXT: lbu s2, 28(a0) +; RV64I-NEXT: lbu s3, 29(a0) +; RV64I-NEXT: lbu s4, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: lbu t6, 0(a1) +; RV64I-NEXT: lbu s1, 1(a1) +; RV64I-NEXT: lbu s7, 2(a1) +; RV64I-NEXT: lbu s8, 3(a1) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, a0, s4 +; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: lbu a0, 4(a1) +; RV64I-NEXT: lbu s1, 5(a1) +; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: or s7, s8, s7 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or s4, a1, s4 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a1, t1, a7 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or a0, t4, t0 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: or t1, s6, s5 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: or t4, s0, t5 +; RV64I-NEXT: slli s3, s3, 16 +; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: or t6, s7, t6 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: li t0, 64 +; RV64I-NEXT: slli t3, a5, 16 +; RV64I-NEXT: slli t2, a6, 16 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli t5, t5, 32 +; RV64I-NEXT: slli s0, s0, 32 +; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: or a5, t5, t4 +; RV64I-NEXT: or a6, s0, t6 +; RV64I-NEXT: slli a6, a6, 3 +; RV64I-NEXT: sub t1, a6, t0 +; RV64I-NEXT: negw t5, a6 +; RV64I-NEXT: sll t4, a5, t5 +; RV64I-NEXT: bltu a6, t0, .LBB18_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sra t6, a5, t1 +; RV64I-NEXT: j .LBB18_3 +; RV64I-NEXT: .LBB18_2: +; RV64I-NEXT: srl t6, a7, a6 +; RV64I-NEXT: or t6, t6, t4 +; RV64I-NEXT: .LBB18_3: +; RV64I-NEXT: or a3, t3, a3 +; RV64I-NEXT: slli t3, a1, 32 +; RV64I-NEXT: or t2, t2, a4 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: mv a1, a7 +; RV64I-NEXT: beqz a6, .LBB18_5 +; RV64I-NEXT: # %bb.4: +; RV64I-NEXT: mv a1, t6 +; RV64I-NEXT: .LBB18_5: +; RV64I-NEXT: or a4, t3, a3 +; RV64I-NEXT: or a3, a0, t2 +; RV64I-NEXT: bltu a6, t0, .LBB18_7 +; RV64I-NEXT: # %bb.6: +; RV64I-NEXT: srai a0, a5, 63 +; RV64I-NEXT: srl t3, a3, t1 +; RV64I-NEXT: j .LBB18_8 +; RV64I-NEXT: .LBB18_7: +; RV64I-NEXT: sra a0, a5, a6 +; RV64I-NEXT: srl t1, a4, 
a6 +; RV64I-NEXT: sll t2, a3, t5 +; RV64I-NEXT: or t3, t1, t2 +; RV64I-NEXT: .LBB18_8: +; RV64I-NEXT: li t1, 128 +; RV64I-NEXT: mv t2, a4 +; RV64I-NEXT: beqz a6, .LBB18_10 +; RV64I-NEXT: # %bb.9: +; RV64I-NEXT: mv t2, t3 +; RV64I-NEXT: .LBB18_10: +; RV64I-NEXT: sub t6, t1, a6 +; RV64I-NEXT: bltu a6, t0, .LBB18_13 +; RV64I-NEXT: # %bb.11: +; RV64I-NEXT: li t3, 0 +; RV64I-NEXT: bgeu t6, t0, .LBB18_14 +; RV64I-NEXT: .LBB18_12: +; RV64I-NEXT: sll t5, a7, t5 +; RV64I-NEXT: negw s0, t6 +; RV64I-NEXT: srl s0, a7, s0 +; RV64I-NEXT: or s1, s0, t4 +; RV64I-NEXT: j .LBB18_15 +; RV64I-NEXT: .LBB18_13: +; RV64I-NEXT: srl t3, a3, a6 +; RV64I-NEXT: bltu t6, t0, .LBB18_12 +; RV64I-NEXT: .LBB18_14: +; RV64I-NEXT: li t5, 0 +; RV64I-NEXT: sub t4, t6, t0 +; RV64I-NEXT: sll s1, a7, t4 +; RV64I-NEXT: .LBB18_15: +; RV64I-NEXT: sub s0, a6, t1 +; RV64I-NEXT: mv t4, a5 +; RV64I-NEXT: beqz t6, .LBB18_17 +; RV64I-NEXT: # %bb.16: +; RV64I-NEXT: mv t4, s1 +; RV64I-NEXT: .LBB18_17: +; RV64I-NEXT: bltu s0, t0, .LBB18_19 +; RV64I-NEXT: # %bb.18: +; RV64I-NEXT: sub t6, s0, t0 +; RV64I-NEXT: sra t6, a5, t6 +; RV64I-NEXT: bnez s0, .LBB18_20 +; RV64I-NEXT: j .LBB18_21 +; RV64I-NEXT: .LBB18_19: +; RV64I-NEXT: srl t6, a7, s0 +; RV64I-NEXT: negw s1, s0 +; RV64I-NEXT: sll s1, a5, s1 +; RV64I-NEXT: or t6, t6, s1 +; RV64I-NEXT: beqz s0, .LBB18_21 +; RV64I-NEXT: .LBB18_20: +; RV64I-NEXT: mv a7, t6 +; RV64I-NEXT: .LBB18_21: +; RV64I-NEXT: bltu s0, t0, .LBB18_23 +; RV64I-NEXT: # %bb.22: +; RV64I-NEXT: srai t0, a5, 63 +; RV64I-NEXT: bltu a6, t1, .LBB18_24 +; RV64I-NEXT: j .LBB18_25 +; RV64I-NEXT: .LBB18_23: +; RV64I-NEXT: sra t0, a5, s0 +; RV64I-NEXT: bgeu a6, t1, .LBB18_25 +; RV64I-NEXT: .LBB18_24: +; RV64I-NEXT: or a7, t2, t5 +; RV64I-NEXT: or t0, t3, t4 +; RV64I-NEXT: .LBB18_25: +; RV64I-NEXT: bnez a6, .LBB18_29 +; RV64I-NEXT: # %bb.26: +; RV64I-NEXT: bltu a6, t1, .LBB18_28 +; RV64I-NEXT: .LBB18_27: +; RV64I-NEXT: srai a1, a5, 63 +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB18_28: +; RV64I-NEXT: srli a5, a4, 32 +; RV64I-NEXT: srliw a6, a4, 16 +; RV64I-NEXT: lui t2, 16 +; RV64I-NEXT: srliw t1, a4, 24 +; RV64I-NEXT: srli t0, a4, 48 +; RV64I-NEXT: srli t5, a4, 56 +; RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srliw t4, a3, 16 +; RV64I-NEXT: srliw s0, a3, 24 +; RV64I-NEXT: srli t6, a3, 48 +; RV64I-NEXT: srli s3, a3, 56 +; RV64I-NEXT: srli t3, a1, 32 +; RV64I-NEXT: srliw s2, a1, 16 +; RV64I-NEXT: srliw s6, a1, 24 +; RV64I-NEXT: srli s4, a1, 48 +; RV64I-NEXT: srli s7, a1, 56 +; RV64I-NEXT: srli s1, a0, 32 +; RV64I-NEXT: srliw s5, a0, 16 +; RV64I-NEXT: srliw s8, a0, 24 +; RV64I-NEXT: srli s9, a0, 48 +; RV64I-NEXT: srli s10, a0, 56 +; RV64I-NEXT: addi t2, t2, -1 +; RV64I-NEXT: and s11, a4, t2 +; RV64I-NEXT: srli s11, s11, 8 +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb s11, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb t1, 3(a2) +; RV64I-NEXT: and a4, a5, t2 +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb a4, 5(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: and a4, a3, t2 +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: sb a4, 9(a2) +; RV64I-NEXT: sb t4, 10(a2) +; RV64I-NEXT: sb s0, 11(a2) +; RV64I-NEXT: and a3, a7, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: sb t6, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) +; RV64I-NEXT: and a3, a1, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a1, 16(a2) +; RV64I-NEXT: sb a3, 17(a2) +; RV64I-NEXT: sb s2, 18(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: and a1, 
t3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb t3, 20(a2) +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: sb s4, 22(a2) +; RV64I-NEXT: sb s7, 23(a2) +; RV64I-NEXT: and a1, a0, t2 +; RV64I-NEXT: and a3, s1, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 24(a2) +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: sb s5, 26(a2) +; RV64I-NEXT: sb s8, 27(a2) +; RV64I-NEXT: sb s1, 28(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb s9, 30(a2) +; RV64I-NEXT: sb s10, 31(a2) +; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 96 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB18_29: +; RV64I-NEXT: mv a4, a7 +; RV64I-NEXT: mv a3, t0 +; RV64I-NEXT: bgeu a6, t1, .LBB18_27 +; RV64I-NEXT: j .LBB18_28 +; +; RV32I-LABEL: ashr_32bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -96 +; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu a4, 17(a0) +; RV32I-NEXT: lbu a5, 18(a0) +; RV32I-NEXT: lbu a6, 19(a0) +; RV32I-NEXT: lbu a7, 20(a0) +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 22(a0) +; RV32I-NEXT: lbu t2, 23(a0) +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu t4, 25(a0) +; RV32I-NEXT: lbu t5, 26(a0) +; RV32I-NEXT: lbu t6, 27(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, t2, t1 +; RV32I-NEXT: lbu a7, 28(a0) +; RV32I-NEXT: lbu t0, 29(a0) +; RV32I-NEXT: lbu t1, 30(a0) +; RV32I-NEXT: lbu t2, 31(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or t4, t6, t5 +; RV32I-NEXT: or t0, t0, a7 +; RV32I-NEXT: lbu a7, 0(a1) +; RV32I-NEXT: lbu t5, 1(a1) +; RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: or s0, t5, a7 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: li t5, 32 +; RV32I-NEXT: slli a7, a4, 16 +; RV32I-NEXT: slli a1, a5, 16 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli a5, t2, 16 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or a4, t1, t0 +; RV32I-NEXT: or a5, a5, s0 +; RV32I-NEXT: slli a5, a5, 3 
+; RV32I-NEXT: srl s0, t2, a5 +; RV32I-NEXT: neg s6, a5 +; RV32I-NEXT: sll s1, a4, s6 +; RV32I-NEXT: bltu a5, t5, .LBB18_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sra t0, a4, a5 +; RV32I-NEXT: j .LBB18_3 +; RV32I-NEXT: .LBB18_2: +; RV32I-NEXT: or t0, s0, s1 +; RV32I-NEXT: .LBB18_3: +; RV32I-NEXT: or t1, a7, a3 +; RV32I-NEXT: or a7, a1, a6 +; RV32I-NEXT: mv t3, t2 +; RV32I-NEXT: beqz a5, .LBB18_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv t3, t0 +; RV32I-NEXT: .LBB18_5: +; RV32I-NEXT: srl a3, t1, a5 +; RV32I-NEXT: sll a1, a7, s6 +; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu a5, t5, .LBB18_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srai t4, a4, 31 +; RV32I-NEXT: srl a1, a7, a5 +; RV32I-NEXT: j .LBB18_8 +; RV32I-NEXT: .LBB18_7: +; RV32I-NEXT: sra t4, a4, a5 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: .LBB18_8: +; RV32I-NEXT: li t6, 64 +; RV32I-NEXT: mv t0, t1 +; RV32I-NEXT: beqz a5, .LBB18_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv t0, a1 +; RV32I-NEXT: .LBB18_10: +; RV32I-NEXT: sub s7, t6, a5 +; RV32I-NEXT: bltu a5, t5, .LBB18_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: j .LBB18_13 +; RV32I-NEXT: .LBB18_12: +; RV32I-NEXT: srl a1, a7, a5 +; RV32I-NEXT: .LBB18_13: +; RV32I-NEXT: sw a3, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: neg s10, s7 +; RV32I-NEXT: bltu s7, t5, .LBB18_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: sll a3, t2, s7 +; RV32I-NEXT: j .LBB18_16 +; RV32I-NEXT: .LBB18_15: +; RV32I-NEXT: sll a6, t2, s6 +; RV32I-NEXT: srl a3, t2, s10 +; RV32I-NEXT: or a3, a3, s1 +; RV32I-NEXT: .LBB18_16: +; RV32I-NEXT: sw t3, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi s9, a5, -64 +; RV32I-NEXT: mv t3, a4 +; RV32I-NEXT: beqz s7, .LBB18_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: mv t3, a3 +; RV32I-NEXT: .LBB18_18: +; RV32I-NEXT: neg s11, s9 +; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s9, t5, .LBB18_20 +; RV32I-NEXT: # %bb.19: +; RV32I-NEXT: sra s0, a4, s9 +; RV32I-NEXT: j .LBB18_21 +; RV32I-NEXT: .LBB18_20: +; RV32I-NEXT: sll a3, a4, s11 +; RV32I-NEXT: or s0, s0, a3 +; RV32I-NEXT: .LBB18_21: +; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s3, 11(a0) +; RV32I-NEXT: lbu a3, 15(a0) +; RV32I-NEXT: mv t4, t2 +; RV32I-NEXT: beqz s9, .LBB18_23 +; RV32I-NEXT: # %bb.22: +; RV32I-NEXT: mv t4, s0 +; RV32I-NEXT: .LBB18_23: +; RV32I-NEXT: lbu s2, 9(a0) +; RV32I-NEXT: lbu s1, 10(a0) +; RV32I-NEXT: lbu s8, 13(a0) +; RV32I-NEXT: lbu ra, 14(a0) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: bltu s9, t5, .LBB18_25 +; RV32I-NEXT: # %bb.24: +; RV32I-NEXT: srai s0, a4, 31 +; RV32I-NEXT: j .LBB18_26 +; RV32I-NEXT: .LBB18_25: +; RV32I-NEXT: sra s0, a4, a5 +; RV32I-NEXT: .LBB18_26: +; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s3, 12(a0) +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s4, s8, 8 +; RV32I-NEXT: or s8, a3, ra +; RV32I-NEXT: bgeu a5, t6, .LBB18_28 +; RV32I-NEXT: # %bb.27: +; RV32I-NEXT: or t4, t0, a6 +; RV32I-NEXT: or s0, a1, t3 +; RV32I-NEXT: .LBB18_28: +; RV32I-NEXT: lbu a3, 3(a0) +; RV32I-NEXT: lbu t3, 7(a0) +; RV32I-NEXT: or a6, s2, s5 +; RV32I-NEXT: slli s2, s1, 16 +; RV32I-NEXT: or s1, s4, s3 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: mv t0, a7 +; RV32I-NEXT: beqz a5, .LBB18_30 +; RV32I-NEXT: # %bb.29: +; RV32I-NEXT: mv a1, t4 +; RV32I-NEXT: mv t0, s0 +; RV32I-NEXT: .LBB18_30: +; RV32I-NEXT: slli s5, a3, 8 +; RV32I-NEXT: lbu ra, 1(a0) +; 
RV32I-NEXT: lbu a3, 2(a0) +; RV32I-NEXT: lbu s3, 5(a0) +; RV32I-NEXT: lbu s0, 6(a0) +; RV32I-NEXT: slli s4, t3, 8 +; RV32I-NEXT: or t4, s2, a6 +; RV32I-NEXT: or t3, s8, s1 +; RV32I-NEXT: bltu a5, t6, .LBB18_32 +; RV32I-NEXT: # %bb.31: +; RV32I-NEXT: srai a6, a4, 31 +; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: .LBB18_32: +; RV32I-NEXT: slli a6, ra, 8 +; RV32I-NEXT: or a3, s5, a3 +; RV32I-NEXT: lbu s1, 0(a0) +; RV32I-NEXT: lbu a0, 4(a0) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s0, s4, s0 +; RV32I-NEXT: srl s2, t4, a5 +; RV32I-NEXT: sll ra, t3, s6 +; RV32I-NEXT: bltu a5, t5, .LBB18_34 +; RV32I-NEXT: # %bb.33: +; RV32I-NEXT: srl s4, t3, a5 +; RV32I-NEXT: j .LBB18_35 +; RV32I-NEXT: .LBB18_34: +; RV32I-NEXT: or s4, s2, ra +; RV32I-NEXT: .LBB18_35: +; RV32I-NEXT: or a6, a6, s1 +; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: or a0, s3, a0 +; RV32I-NEXT: slli s1, s0, 16 +; RV32I-NEXT: mv s5, t4 +; RV32I-NEXT: beqz a5, .LBB18_37 +; RV32I-NEXT: # %bb.36: +; RV32I-NEXT: mv s5, s4 +; RV32I-NEXT: .LBB18_37: +; RV32I-NEXT: or s0, a3, a6 +; RV32I-NEXT: or a0, s1, a0 +; RV32I-NEXT: bltu a5, t5, .LBB18_39 +; RV32I-NEXT: # %bb.38: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: srl a3, a0, a5 +; RV32I-NEXT: mv a6, s0 +; RV32I-NEXT: bnez a5, .LBB18_40 +; RV32I-NEXT: j .LBB18_41 +; RV32I-NEXT: .LBB18_39: +; RV32I-NEXT: srl s4, t3, a5 +; RV32I-NEXT: srl a3, s0, a5 +; RV32I-NEXT: sll a6, a0, s6 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: mv a6, s0 +; RV32I-NEXT: beqz a5, .LBB18_41 +; RV32I-NEXT: .LBB18_40: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: .LBB18_41: +; RV32I-NEXT: bltu a5, t5, .LBB18_44 +; RV32I-NEXT: # %bb.42: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: bgeu s7, t5, .LBB18_45 +; RV32I-NEXT: .LBB18_43: +; RV32I-NEXT: sll s3, t4, s6 +; RV32I-NEXT: srl a3, t4, s10 +; RV32I-NEXT: or a3, a3, ra +; RV32I-NEXT: mv s10, t3 +; RV32I-NEXT: bnez s7, .LBB18_46 +; RV32I-NEXT: j .LBB18_47 +; RV32I-NEXT: .LBB18_44: +; RV32I-NEXT: srl s1, a0, a5 +; RV32I-NEXT: bltu s7, t5, .LBB18_43 +; RV32I-NEXT: .LBB18_45: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t4, s7 +; RV32I-NEXT: mv s10, t3 +; RV32I-NEXT: beqz s7, .LBB18_47 +; RV32I-NEXT: .LBB18_46: +; RV32I-NEXT: mv s10, a3 +; RV32I-NEXT: .LBB18_47: +; RV32I-NEXT: bltu s9, t5, .LBB18_49 +; RV32I-NEXT: # %bb.48: +; RV32I-NEXT: srl a3, t3, s9 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: bnez s9, .LBB18_50 +; RV32I-NEXT: j .LBB18_51 +; RV32I-NEXT: .LBB18_49: +; RV32I-NEXT: sll a3, t3, s11 +; RV32I-NEXT: or a3, s2, a3 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: beqz s9, .LBB18_51 +; RV32I-NEXT: .LBB18_50: +; RV32I-NEXT: mv s2, a3 +; RV32I-NEXT: .LBB18_51: +; RV32I-NEXT: bltu s9, t5, .LBB18_53 +; RV32I-NEXT: # %bb.52: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: bltu a5, t6, .LBB18_54 +; RV32I-NEXT: j .LBB18_55 +; RV32I-NEXT: .LBB18_53: +; RV32I-NEXT: srl s7, t3, a5 +; RV32I-NEXT: bgeu a5, t6, .LBB18_55 +; RV32I-NEXT: .LBB18_54: +; RV32I-NEXT: or s2, a6, s3 +; RV32I-NEXT: or s7, s1, s10 +; RV32I-NEXT: .LBB18_55: +; RV32I-NEXT: li a3, 128 +; RV32I-NEXT: mv a6, s0 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: beqz a5, .LBB18_57 +; RV32I-NEXT: # %bb.56: +; RV32I-NEXT: mv a6, s2 +; RV32I-NEXT: mv s1, s7 +; RV32I-NEXT: .LBB18_57: +; RV32I-NEXT: sw a6, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sub s2, a3, a5 +; RV32I-NEXT: bltu a5, t6, .LBB18_59 +; RV32I-NEXT: # %bb.58: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: .LBB18_59: +; RV32I-NEXT: neg s3, s2 +; RV32I-NEXT: srl a6, t1, s3 +; RV32I-NEXT: sw s4, 20(sp) # 4-byte Folded 
Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t5, .LBB18_61 +; RV32I-NEXT: # %bb.60: +; RV32I-NEXT: li s11, 0 +; RV32I-NEXT: sll a3, t1, s2 +; RV32I-NEXT: j .LBB18_62 +; RV32I-NEXT: .LBB18_61: +; RV32I-NEXT: sll s11, t1, s6 +; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: .LBB18_62: +; RV32I-NEXT: sub s1, t6, s2 +; RV32I-NEXT: mv s8, a7 +; RV32I-NEXT: beqz s2, .LBB18_64 +; RV32I-NEXT: # %bb.63: +; RV32I-NEXT: mv s8, a3 +; RV32I-NEXT: .LBB18_64: +; RV32I-NEXT: bltu s1, t5, .LBB18_66 +; RV32I-NEXT: # %bb.65: +; RV32I-NEXT: srl a3, a7, s1 +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: bnez s1, .LBB18_67 +; RV32I-NEXT: j .LBB18_68 +; RV32I-NEXT: .LBB18_66: +; RV32I-NEXT: neg a3, s1 +; RV32I-NEXT: sll a3, a7, a3 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: beqz s1, .LBB18_68 +; RV32I-NEXT: .LBB18_67: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: .LBB18_68: +; RV32I-NEXT: bltu s1, t5, .LBB18_71 +; RV32I-NEXT: # %bb.69: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: bgeu s2, t5, .LBB18_72 +; RV32I-NEXT: .LBB18_70: +; RV32I-NEXT: sll s6, t2, s6 +; RV32I-NEXT: srl a3, t2, s3 +; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: j .LBB18_73 +; RV32I-NEXT: .LBB18_71: +; RV32I-NEXT: srl s1, a7, s3 +; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t5, .LBB18_70 +; RV32I-NEXT: .LBB18_72: +; RV32I-NEXT: li s6, 0 +; RV32I-NEXT: sll a3, t2, s2 +; RV32I-NEXT: .LBB18_73: +; RV32I-NEXT: addi s9, s2, -64 +; RV32I-NEXT: mv s5, a4 +; RV32I-NEXT: beqz s2, .LBB18_75 +; RV32I-NEXT: # %bb.74: +; RV32I-NEXT: mv s5, a3 +; RV32I-NEXT: .LBB18_75: +; RV32I-NEXT: bltu s9, t5, .LBB18_77 +; RV32I-NEXT: # %bb.76: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t1, s9 +; RV32I-NEXT: mv s7, a7 +; RV32I-NEXT: bnez s9, .LBB18_78 +; RV32I-NEXT: j .LBB18_79 +; RV32I-NEXT: .LBB18_77: +; RV32I-NEXT: sll s3, t1, s2 +; RV32I-NEXT: neg a3, s9 +; RV32I-NEXT: srl a3, t1, a3 +; RV32I-NEXT: sll s4, a7, s2 +; RV32I-NEXT: or a3, a3, s4 +; RV32I-NEXT: mv s7, a7 +; RV32I-NEXT: beqz s9, .LBB18_79 +; RV32I-NEXT: .LBB18_78: +; RV32I-NEXT: mv s7, a3 +; RV32I-NEXT: .LBB18_79: +; RV32I-NEXT: bltu s2, t6, .LBB18_81 +; RV32I-NEXT: # %bb.80: +; RV32I-NEXT: li s11, 0 +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: j .LBB18_82 +; RV32I-NEXT: .LBB18_81: +; RV32I-NEXT: or s3, a6, s6 +; RV32I-NEXT: or s7, s1, s5 +; RV32I-NEXT: .LBB18_82: +; RV32I-NEXT: addi ra, a5, -128 +; RV32I-NEXT: mv s4, t2 +; RV32I-NEXT: mv s6, a4 +; RV32I-NEXT: beqz s2, .LBB18_84 +; RV32I-NEXT: # %bb.83: +; RV32I-NEXT: mv s4, s3 +; RV32I-NEXT: mv s6, s7 +; RV32I-NEXT: .LBB18_84: +; RV32I-NEXT: neg s9, ra +; RV32I-NEXT: sll s3, a4, s9 +; RV32I-NEXT: bltu ra, t5, .LBB18_86 +; RV32I-NEXT: # %bb.85: +; RV32I-NEXT: sra a3, a4, ra +; RV32I-NEXT: mv s1, t2 +; RV32I-NEXT: bnez ra, .LBB18_87 +; RV32I-NEXT: j .LBB18_88 +; RV32I-NEXT: .LBB18_86: +; RV32I-NEXT: lw a3, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: mv s1, t2 +; RV32I-NEXT: beqz ra, .LBB18_88 +; RV32I-NEXT: .LBB18_87: +; RV32I-NEXT: mv s1, a3 +; RV32I-NEXT: .LBB18_88: +; RV32I-NEXT: bltu ra, t5, .LBB18_90 +; RV32I-NEXT: # %bb.89: +; RV32I-NEXT: srai s2, a4, 31 +; RV32I-NEXT: srl a3, a7, ra +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: bnez ra, .LBB18_91 +; RV32I-NEXT: j .LBB18_92 +; RV32I-NEXT: .LBB18_90: +; RV32I-NEXT: sra s2, a4, a5 +; RV32I-NEXT: sll a3, a7, s9 +; RV32I-NEXT: lw a6, 28(sp) # 4-byte Folded Reload +; 
RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: beqz ra, .LBB18_92 +; RV32I-NEXT: .LBB18_91: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: .LBB18_92: +; RV32I-NEXT: mv s5, t0 +; RV32I-NEXT: sub s10, t6, ra +; RV32I-NEXT: li t0, 64 +; RV32I-NEXT: bltu ra, t5, .LBB18_94 +; RV32I-NEXT: # %bb.93: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: j .LBB18_95 +; RV32I-NEXT: .LBB18_94: +; RV32I-NEXT: srl s7, a7, a5 +; RV32I-NEXT: .LBB18_95: +; RV32I-NEXT: mv t6, s8 +; RV32I-NEXT: mv s8, s11 +; RV32I-NEXT: bltu s10, t5, .LBB18_97 +; RV32I-NEXT: # %bb.96: +; RV32I-NEXT: li s9, 0 +; RV32I-NEXT: sll a3, t2, s10 +; RV32I-NEXT: j .LBB18_98 +; RV32I-NEXT: .LBB18_97: +; RV32I-NEXT: sll s9, t2, s9 +; RV32I-NEXT: neg a3, s10 +; RV32I-NEXT: srl a3, t2, a3 +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: .LBB18_98: +; RV32I-NEXT: addi s11, ra, -64 +; RV32I-NEXT: mv s3, a4 +; RV32I-NEXT: beqz s10, .LBB18_100 +; RV32I-NEXT: # %bb.99: +; RV32I-NEXT: mv s3, a3 +; RV32I-NEXT: .LBB18_100: +; RV32I-NEXT: bltu s11, t5, .LBB18_102 +; RV32I-NEXT: # %bb.101: +; RV32I-NEXT: sra a3, a4, s11 +; RV32I-NEXT: bnez s11, .LBB18_103 +; RV32I-NEXT: j .LBB18_104 +; RV32I-NEXT: .LBB18_102: +; RV32I-NEXT: srl a3, t2, ra +; RV32I-NEXT: mv s10, s4 +; RV32I-NEXT: neg s4, s11 +; RV32I-NEXT: sll s4, a4, s4 +; RV32I-NEXT: or a3, a3, s4 +; RV32I-NEXT: mv s4, s10 +; RV32I-NEXT: beqz s11, .LBB18_104 +; RV32I-NEXT: .LBB18_103: +; RV32I-NEXT: mv t2, a3 +; RV32I-NEXT: .LBB18_104: +; RV32I-NEXT: bltu s11, t5, .LBB18_106 +; RV32I-NEXT: # %bb.105: +; RV32I-NEXT: srai t5, a4, 31 +; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: bltu ra, t0, .LBB18_107 +; RV32I-NEXT: j .LBB18_108 +; RV32I-NEXT: .LBB18_106: +; RV32I-NEXT: sra t5, a4, ra +; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: bgeu ra, t0, .LBB18_108 +; RV32I-NEXT: .LBB18_107: +; RV32I-NEXT: or t2, a6, s9 +; RV32I-NEXT: or t5, s7, s3 +; RV32I-NEXT: .LBB18_108: +; RV32I-NEXT: li a6, 128 +; RV32I-NEXT: bnez ra, .LBB18_117 +; RV32I-NEXT: # %bb.109: +; RV32I-NEXT: bgeu ra, t0, .LBB18_118 +; RV32I-NEXT: .LBB18_110: +; RV32I-NEXT: bgeu a5, a6, .LBB18_112 +; RV32I-NEXT: .LBB18_111: +; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t1, a3, s8 +; RV32I-NEXT: lw a3, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a7, a3, t6 +; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s1, a3, s4 +; RV32I-NEXT: lw a3, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s2, a3, s6 +; RV32I-NEXT: .LBB18_112: +; RV32I-NEXT: lw ra, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: mv t0, s5 +; RV32I-NEXT: beqz a5, .LBB18_114 +; RV32I-NEXT: # %bb.113: +; RV32I-NEXT: mv s0, t1 +; RV32I-NEXT: mv a0, a7 +; RV32I-NEXT: mv t4, s1 +; RV32I-NEXT: mv t3, s2 +; RV32I-NEXT: .LBB18_114: +; RV32I-NEXT: bltu a5, a6, .LBB18_116 +; RV32I-NEXT: # %bb.115: +; RV32I-NEXT: srai a1, a4, 31 +; RV32I-NEXT: mv t0, a1 +; RV32I-NEXT: mv s11, a1 +; RV32I-NEXT: mv ra, a1 +; RV32I-NEXT: .LBB18_116: +; RV32I-NEXT: srli a4, s0, 16 +; RV32I-NEXT: lui t1, 16 +; RV32I-NEXT: srli a7, s0, 24 +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: srli t5, a0, 24 +; RV32I-NEXT: srli a6, t4, 16 +; RV32I-NEXT: srli s2, t4, 24 +; RV32I-NEXT: srli t2, t3, 16 +; RV32I-NEXT: srli s3, t3, 24 +; RV32I-NEXT: srli s1, a1, 16 +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: srli t6, t0, 16 +; RV32I-NEXT: srli s6, t0, 24 +; RV32I-NEXT: srli s5, s11, 16 +; RV32I-NEXT: srli s4, s11, 24 +; RV32I-NEXT: srli s7, ra, 16 +; RV32I-NEXT: srli s8, ra, 24 +; RV32I-NEXT: addi t1, t1, -1 +; RV32I-NEXT: and s9, s0, t1 +; 
RV32I-NEXT: and s10, a0, t1
+; RV32I-NEXT: srli s9, s9, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s9, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli a7, s10, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t3, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, a1, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t3, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, t0, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a1, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a1, s11, t1
+; RV32I-NEXT: and a3, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb s11, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s4, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a3, 29(a2)
+; RV32I-NEXT: sb s7, 30(a2)
+; RV32I-NEXT: sb s8, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB18_117:
+; RV32I-NEXT: mv t1, t2
+; RV32I-NEXT: mv a7, t5
+; RV32I-NEXT: bltu ra, t0, .LBB18_110
+; RV32I-NEXT: .LBB18_118:
+; RV32I-NEXT: srai s1, a4, 31
+; RV32I-NEXT: mv s2, s1
+; RV32I-NEXT: bltu a5, a6, .LBB18_111
+; RV32I-NEXT: j .LBB18_112
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lbu a3, 0(a0)
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 5(a0)
+; RV64I-NEXT:
lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a4, t4, t3 +; RV64I-NEXT: or a6, t6, t5 +; RV64I-NEXT: or t0, s1, s0 +; RV64I-NEXT: lbu t5, 24(a0) +; RV64I-NEXT: lbu t6, 25(a0) +; RV64I-NEXT: lbu s0, 26(a0) +; RV64I-NEXT: lbu s1, 27(a0) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or t4, s3, s2 +; RV64I-NEXT: or t2, s5, s4 +; RV64I-NEXT: or t3, s7, s6 +; RV64I-NEXT: lbu s2, 28(a0) +; RV64I-NEXT: lbu s3, 29(a0) +; RV64I-NEXT: lbu s4, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: lbu t6, 0(a1) +; RV64I-NEXT: lbu s1, 1(a1) +; RV64I-NEXT: lbu s7, 2(a1) +; RV64I-NEXT: lbu s8, 3(a1) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, a0, s4 +; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: lbu a0, 4(a1) +; RV64I-NEXT: lbu s1, 5(a1) +; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: or s7, s8, s7 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or s4, a1, s4 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a1, t1, a7 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or a0, t4, t0 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: or t1, s6, s5 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: or t4, s0, t5 +; RV64I-NEXT: slli s3, s3, 16 +; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: or t6, s7, t6 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: li t0, 64 +; RV64I-NEXT: slli t3, a5, 16 +; RV64I-NEXT: slli t2, a6, 16 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli t5, t5, 32 +; RV64I-NEXT: slli s0, s0, 32 +; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: or a5, t5, t4 +; RV64I-NEXT: or a6, s0, t6 +; RV64I-NEXT: slli a6, a6, 5 +; RV64I-NEXT: sub t1, a6, t0 +; RV64I-NEXT: negw t5, a6 +; RV64I-NEXT: sll t4, a5, t5 +; RV64I-NEXT: bltu a6, t0, .LBB19_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sra t6, a5, t1 +; RV64I-NEXT: j .LBB19_3 +; RV64I-NEXT: .LBB19_2: +; RV64I-NEXT: srl t6, a7, a6 +; RV64I-NEXT: or t6, t6, t4 +; RV64I-NEXT: .LBB19_3: +; RV64I-NEXT: or a3, t3, a3 +; RV64I-NEXT: slli t3, a1, 32 +; RV64I-NEXT: or t2, t2, a4 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: mv a1, a7 +; RV64I-NEXT: beqz a6, .LBB19_5 +; RV64I-NEXT: # %bb.4: +; RV64I-NEXT: mv a1, t6 +; RV64I-NEXT: .LBB19_5: +; RV64I-NEXT: or a4, t3, a3 +; 
RV64I-NEXT: or a3, a0, t2 +; RV64I-NEXT: bltu a6, t0, .LBB19_7 +; RV64I-NEXT: # %bb.6: +; RV64I-NEXT: srai a0, a5, 63 +; RV64I-NEXT: srl t3, a3, t1 +; RV64I-NEXT: j .LBB19_8 +; RV64I-NEXT: .LBB19_7: +; RV64I-NEXT: sra a0, a5, a6 +; RV64I-NEXT: srl t1, a4, a6 +; RV64I-NEXT: sll t2, a3, t5 +; RV64I-NEXT: or t3, t1, t2 +; RV64I-NEXT: .LBB19_8: +; RV64I-NEXT: li t1, 128 +; RV64I-NEXT: mv t2, a4 +; RV64I-NEXT: beqz a6, .LBB19_10 +; RV64I-NEXT: # %bb.9: +; RV64I-NEXT: mv t2, t3 +; RV64I-NEXT: .LBB19_10: +; RV64I-NEXT: sub t6, t1, a6 +; RV64I-NEXT: bltu a6, t0, .LBB19_13 +; RV64I-NEXT: # %bb.11: +; RV64I-NEXT: li t3, 0 +; RV64I-NEXT: bgeu t6, t0, .LBB19_14 +; RV64I-NEXT: .LBB19_12: +; RV64I-NEXT: sll t5, a7, t5 +; RV64I-NEXT: negw s0, t6 +; RV64I-NEXT: srl s0, a7, s0 +; RV64I-NEXT: or s1, s0, t4 +; RV64I-NEXT: j .LBB19_15 +; RV64I-NEXT: .LBB19_13: +; RV64I-NEXT: srl t3, a3, a6 +; RV64I-NEXT: bltu t6, t0, .LBB19_12 +; RV64I-NEXT: .LBB19_14: +; RV64I-NEXT: li t5, 0 +; RV64I-NEXT: sub t4, t6, t0 +; RV64I-NEXT: sll s1, a7, t4 +; RV64I-NEXT: .LBB19_15: +; RV64I-NEXT: sub s0, a6, t1 +; RV64I-NEXT: mv t4, a5 +; RV64I-NEXT: beqz t6, .LBB19_17 +; RV64I-NEXT: # %bb.16: +; RV64I-NEXT: mv t4, s1 +; RV64I-NEXT: .LBB19_17: +; RV64I-NEXT: bltu s0, t0, .LBB19_19 +; RV64I-NEXT: # %bb.18: +; RV64I-NEXT: sub t6, s0, t0 +; RV64I-NEXT: sra t6, a5, t6 +; RV64I-NEXT: bnez s0, .LBB19_20 +; RV64I-NEXT: j .LBB19_21 +; RV64I-NEXT: .LBB19_19: +; RV64I-NEXT: srl t6, a7, s0 +; RV64I-NEXT: negw s1, s0 +; RV64I-NEXT: sll s1, a5, s1 +; RV64I-NEXT: or t6, t6, s1 +; RV64I-NEXT: beqz s0, .LBB19_21 +; RV64I-NEXT: .LBB19_20: +; RV64I-NEXT: mv a7, t6 +; RV64I-NEXT: .LBB19_21: +; RV64I-NEXT: bltu s0, t0, .LBB19_23 +; RV64I-NEXT: # %bb.22: +; RV64I-NEXT: srai t0, a5, 63 +; RV64I-NEXT: bltu a6, t1, .LBB19_24 +; RV64I-NEXT: j .LBB19_25 +; RV64I-NEXT: .LBB19_23: +; RV64I-NEXT: sra t0, a5, s0 +; RV64I-NEXT: bgeu a6, t1, .LBB19_25 +; RV64I-NEXT: .LBB19_24: +; RV64I-NEXT: or a7, t2, t5 +; RV64I-NEXT: or t0, t3, t4 +; RV64I-NEXT: .LBB19_25: +; RV64I-NEXT: bnez a6, .LBB19_29 +; RV64I-NEXT: # %bb.26: +; RV64I-NEXT: bltu a6, t1, .LBB19_28 +; RV64I-NEXT: .LBB19_27: +; RV64I-NEXT: srai a1, a5, 63 +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB19_28: +; RV64I-NEXT: srli a5, a4, 32 +; RV64I-NEXT: srliw a6, a4, 16 +; RV64I-NEXT: lui t2, 16 +; RV64I-NEXT: srliw t1, a4, 24 +; RV64I-NEXT: srli t0, a4, 48 +; RV64I-NEXT: srli t5, a4, 56 +; RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srliw t4, a3, 16 +; RV64I-NEXT: srliw s0, a3, 24 +; RV64I-NEXT: srli t6, a3, 48 +; RV64I-NEXT: srli s3, a3, 56 +; RV64I-NEXT: srli t3, a1, 32 +; RV64I-NEXT: srliw s2, a1, 16 +; RV64I-NEXT: srliw s6, a1, 24 +; RV64I-NEXT: srli s4, a1, 48 +; RV64I-NEXT: srli s7, a1, 56 +; RV64I-NEXT: srli s1, a0, 32 +; RV64I-NEXT: srliw s5, a0, 16 +; RV64I-NEXT: srliw s8, a0, 24 +; RV64I-NEXT: srli s9, a0, 48 +; RV64I-NEXT: srli s10, a0, 56 +; RV64I-NEXT: addi t2, t2, -1 +; RV64I-NEXT: and s11, a4, t2 +; RV64I-NEXT: srli s11, s11, 8 +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb s11, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb t1, 3(a2) +; RV64I-NEXT: and a4, a5, t2 +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb a4, 5(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: and a4, a3, t2 +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: sb a4, 9(a2) +; RV64I-NEXT: sb t4, 10(a2) +; RV64I-NEXT: sb s0, 11(a2) +; RV64I-NEXT: and a3, a7, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) 
+; RV64I-NEXT: sb t6, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) +; RV64I-NEXT: and a3, a1, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a1, 16(a2) +; RV64I-NEXT: sb a3, 17(a2) +; RV64I-NEXT: sb s2, 18(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: and a1, t3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb t3, 20(a2) +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: sb s4, 22(a2) +; RV64I-NEXT: sb s7, 23(a2) +; RV64I-NEXT: and a1, a0, t2 +; RV64I-NEXT: and a3, s1, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 24(a2) +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: sb s5, 26(a2) +; RV64I-NEXT: sb s8, 27(a2) +; RV64I-NEXT: sb s1, 28(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb s9, 30(a2) +; RV64I-NEXT: sb s10, 31(a2) +; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 96 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB19_29: +; RV64I-NEXT: mv a4, a7 +; RV64I-NEXT: mv a3, t0 +; RV64I-NEXT: bgeu a6, t1, .LBB19_27 +; RV64I-NEXT: j .LBB19_28 +; +; RV32I-LABEL: ashr_32bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -96 +; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu a4, 17(a0) +; RV32I-NEXT: lbu a5, 18(a0) +; RV32I-NEXT: lbu a6, 19(a0) +; RV32I-NEXT: lbu a7, 20(a0) +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 22(a0) +; RV32I-NEXT: lbu t2, 23(a0) +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu t4, 25(a0) +; RV32I-NEXT: lbu t5, 26(a0) +; RV32I-NEXT: lbu t6, 27(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, t2, t1 +; RV32I-NEXT: lbu a7, 28(a0) +; RV32I-NEXT: lbu t0, 29(a0) +; RV32I-NEXT: lbu t1, 30(a0) +; RV32I-NEXT: lbu t2, 31(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or t4, t6, t5 +; RV32I-NEXT: or t0, t0, a7 +; RV32I-NEXT: lbu a7, 0(a1) +; RV32I-NEXT: lbu t5, 1(a1) +; RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: or s0, t5, a7 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: li t5, 32 +; 
RV32I-NEXT: slli a7, a4, 16 +; RV32I-NEXT: slli a1, a5, 16 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli a5, t2, 16 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or a4, t1, t0 +; RV32I-NEXT: or a5, a5, s0 +; RV32I-NEXT: slli a5, a5, 5 +; RV32I-NEXT: srl s0, t2, a5 +; RV32I-NEXT: neg s6, a5 +; RV32I-NEXT: sll s1, a4, s6 +; RV32I-NEXT: bltu a5, t5, .LBB19_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sra t0, a4, a5 +; RV32I-NEXT: j .LBB19_3 +; RV32I-NEXT: .LBB19_2: +; RV32I-NEXT: or t0, s0, s1 +; RV32I-NEXT: .LBB19_3: +; RV32I-NEXT: or t1, a7, a3 +; RV32I-NEXT: or a7, a1, a6 +; RV32I-NEXT: mv t3, t2 +; RV32I-NEXT: beqz a5, .LBB19_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv t3, t0 +; RV32I-NEXT: .LBB19_5: +; RV32I-NEXT: srl a3, t1, a5 +; RV32I-NEXT: sll a1, a7, s6 +; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu a5, t5, .LBB19_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srai t4, a4, 31 +; RV32I-NEXT: srl a1, a7, a5 +; RV32I-NEXT: j .LBB19_8 +; RV32I-NEXT: .LBB19_7: +; RV32I-NEXT: sra t4, a4, a5 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: .LBB19_8: +; RV32I-NEXT: li t6, 64 +; RV32I-NEXT: mv t0, t1 +; RV32I-NEXT: beqz a5, .LBB19_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv t0, a1 +; RV32I-NEXT: .LBB19_10: +; RV32I-NEXT: sub s7, t6, a5 +; RV32I-NEXT: bltu a5, t5, .LBB19_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: j .LBB19_13 +; RV32I-NEXT: .LBB19_12: +; RV32I-NEXT: srl a1, a7, a5 +; RV32I-NEXT: .LBB19_13: +; RV32I-NEXT: sw a3, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: neg s10, s7 +; RV32I-NEXT: bltu s7, t5, .LBB19_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: sll a3, t2, s7 +; RV32I-NEXT: j .LBB19_16 +; RV32I-NEXT: .LBB19_15: +; RV32I-NEXT: sll a6, t2, s6 +; RV32I-NEXT: srl a3, t2, s10 +; RV32I-NEXT: or a3, a3, s1 +; RV32I-NEXT: .LBB19_16: +; RV32I-NEXT: sw t3, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi s9, a5, -64 +; RV32I-NEXT: mv t3, a4 +; RV32I-NEXT: beqz s7, .LBB19_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: mv t3, a3 +; RV32I-NEXT: .LBB19_18: +; RV32I-NEXT: neg s11, s9 +; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s9, t5, .LBB19_20 +; RV32I-NEXT: # %bb.19: +; RV32I-NEXT: sra s0, a4, s9 +; RV32I-NEXT: j .LBB19_21 +; RV32I-NEXT: .LBB19_20: +; RV32I-NEXT: sll a3, a4, s11 +; RV32I-NEXT: or s0, s0, a3 +; RV32I-NEXT: .LBB19_21: +; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s3, 11(a0) +; RV32I-NEXT: lbu a3, 15(a0) +; RV32I-NEXT: mv t4, t2 +; RV32I-NEXT: beqz s9, .LBB19_23 +; RV32I-NEXT: # %bb.22: +; RV32I-NEXT: mv t4, s0 +; RV32I-NEXT: .LBB19_23: +; RV32I-NEXT: lbu s2, 9(a0) +; RV32I-NEXT: lbu s1, 10(a0) +; RV32I-NEXT: lbu s8, 13(a0) +; RV32I-NEXT: lbu ra, 14(a0) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: bltu s9, t5, .LBB19_25 +; RV32I-NEXT: # %bb.24: +; RV32I-NEXT: srai s0, a4, 31 +; RV32I-NEXT: j .LBB19_26 +; RV32I-NEXT: .LBB19_25: +; RV32I-NEXT: sra s0, a4, a5 +; RV32I-NEXT: .LBB19_26: +; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s3, 12(a0) +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s4, s8, 8 +; RV32I-NEXT: or s8, a3, ra +; RV32I-NEXT: bgeu a5, t6, .LBB19_28 +; RV32I-NEXT: # %bb.27: +; RV32I-NEXT: or t4, t0, a6 +; RV32I-NEXT: or s0, a1, t3 +; RV32I-NEXT: .LBB19_28: +; RV32I-NEXT: lbu a3, 3(a0) +; RV32I-NEXT: lbu t3, 7(a0) +; RV32I-NEXT: or a6, s2, s5 +; RV32I-NEXT: slli s2, s1, 16 +; RV32I-NEXT: or s1, s4, s3 +; RV32I-NEXT: 
slli s8, s8, 16 +; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: mv t0, a7 +; RV32I-NEXT: beqz a5, .LBB19_30 +; RV32I-NEXT: # %bb.29: +; RV32I-NEXT: mv a1, t4 +; RV32I-NEXT: mv t0, s0 +; RV32I-NEXT: .LBB19_30: +; RV32I-NEXT: slli s5, a3, 8 +; RV32I-NEXT: lbu ra, 1(a0) +; RV32I-NEXT: lbu a3, 2(a0) +; RV32I-NEXT: lbu s3, 5(a0) +; RV32I-NEXT: lbu s0, 6(a0) +; RV32I-NEXT: slli s4, t3, 8 +; RV32I-NEXT: or t4, s2, a6 +; RV32I-NEXT: or t3, s8, s1 +; RV32I-NEXT: bltu a5, t6, .LBB19_32 +; RV32I-NEXT: # %bb.31: +; RV32I-NEXT: srai a6, a4, 31 +; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: .LBB19_32: +; RV32I-NEXT: slli a6, ra, 8 +; RV32I-NEXT: or a3, s5, a3 +; RV32I-NEXT: lbu s1, 0(a0) +; RV32I-NEXT: lbu a0, 4(a0) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s0, s4, s0 +; RV32I-NEXT: srl s2, t4, a5 +; RV32I-NEXT: sll ra, t3, s6 +; RV32I-NEXT: bltu a5, t5, .LBB19_34 +; RV32I-NEXT: # %bb.33: +; RV32I-NEXT: srl s4, t3, a5 +; RV32I-NEXT: j .LBB19_35 +; RV32I-NEXT: .LBB19_34: +; RV32I-NEXT: or s4, s2, ra +; RV32I-NEXT: .LBB19_35: +; RV32I-NEXT: or a6, a6, s1 +; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: or a0, s3, a0 +; RV32I-NEXT: slli s1, s0, 16 +; RV32I-NEXT: mv s5, t4 +; RV32I-NEXT: beqz a5, .LBB19_37 +; RV32I-NEXT: # %bb.36: +; RV32I-NEXT: mv s5, s4 +; RV32I-NEXT: .LBB19_37: +; RV32I-NEXT: or s0, a3, a6 +; RV32I-NEXT: or a0, s1, a0 +; RV32I-NEXT: bltu a5, t5, .LBB19_39 +; RV32I-NEXT: # %bb.38: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: srl a3, a0, a5 +; RV32I-NEXT: mv a6, s0 +; RV32I-NEXT: bnez a5, .LBB19_40 +; RV32I-NEXT: j .LBB19_41 +; RV32I-NEXT: .LBB19_39: +; RV32I-NEXT: srl s4, t3, a5 +; RV32I-NEXT: srl a3, s0, a5 +; RV32I-NEXT: sll a6, a0, s6 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: mv a6, s0 +; RV32I-NEXT: beqz a5, .LBB19_41 +; RV32I-NEXT: .LBB19_40: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: .LBB19_41: +; RV32I-NEXT: bltu a5, t5, .LBB19_44 +; RV32I-NEXT: # %bb.42: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: bgeu s7, t5, .LBB19_45 +; RV32I-NEXT: .LBB19_43: +; RV32I-NEXT: sll s3, t4, s6 +; RV32I-NEXT: srl a3, t4, s10 +; RV32I-NEXT: or a3, a3, ra +; RV32I-NEXT: mv s10, t3 +; RV32I-NEXT: bnez s7, .LBB19_46 +; RV32I-NEXT: j .LBB19_47 +; RV32I-NEXT: .LBB19_44: +; RV32I-NEXT: srl s1, a0, a5 +; RV32I-NEXT: bltu s7, t5, .LBB19_43 +; RV32I-NEXT: .LBB19_45: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t4, s7 +; RV32I-NEXT: mv s10, t3 +; RV32I-NEXT: beqz s7, .LBB19_47 +; RV32I-NEXT: .LBB19_46: +; RV32I-NEXT: mv s10, a3 +; RV32I-NEXT: .LBB19_47: +; RV32I-NEXT: bltu s9, t5, .LBB19_49 +; RV32I-NEXT: # %bb.48: +; RV32I-NEXT: srl a3, t3, s9 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: bnez s9, .LBB19_50 +; RV32I-NEXT: j .LBB19_51 +; RV32I-NEXT: .LBB19_49: +; RV32I-NEXT: sll a3, t3, s11 +; RV32I-NEXT: or a3, s2, a3 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: beqz s9, .LBB19_51 +; RV32I-NEXT: .LBB19_50: +; RV32I-NEXT: mv s2, a3 +; RV32I-NEXT: .LBB19_51: +; RV32I-NEXT: bltu s9, t5, .LBB19_53 +; RV32I-NEXT: # %bb.52: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: bltu a5, t6, .LBB19_54 +; RV32I-NEXT: j .LBB19_55 +; RV32I-NEXT: .LBB19_53: +; RV32I-NEXT: srl s7, t3, a5 +; RV32I-NEXT: bgeu a5, t6, .LBB19_55 +; RV32I-NEXT: .LBB19_54: +; RV32I-NEXT: or s2, a6, s3 +; RV32I-NEXT: or s7, s1, s10 +; RV32I-NEXT: .LBB19_55: +; RV32I-NEXT: li a3, 128 +; RV32I-NEXT: mv a6, s0 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: beqz a5, .LBB19_57 +; RV32I-NEXT: # %bb.56: +; RV32I-NEXT: mv a6, s2 +; RV32I-NEXT: mv s1, s7 +; RV32I-NEXT: .LBB19_57: +; RV32I-NEXT: sw a6, 8(sp) # 4-byte Folded Spill +; 
RV32I-NEXT: sub s2, a3, a5 +; RV32I-NEXT: bltu a5, t6, .LBB19_59 +; RV32I-NEXT: # %bb.58: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: .LBB19_59: +; RV32I-NEXT: neg s3, s2 +; RV32I-NEXT: srl a6, t1, s3 +; RV32I-NEXT: sw s4, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t5, .LBB19_61 +; RV32I-NEXT: # %bb.60: +; RV32I-NEXT: li s11, 0 +; RV32I-NEXT: sll a3, t1, s2 +; RV32I-NEXT: j .LBB19_62 +; RV32I-NEXT: .LBB19_61: +; RV32I-NEXT: sll s11, t1, s6 +; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: .LBB19_62: +; RV32I-NEXT: sub s1, t6, s2 +; RV32I-NEXT: mv s8, a7 +; RV32I-NEXT: beqz s2, .LBB19_64 +; RV32I-NEXT: # %bb.63: +; RV32I-NEXT: mv s8, a3 +; RV32I-NEXT: .LBB19_64: +; RV32I-NEXT: bltu s1, t5, .LBB19_66 +; RV32I-NEXT: # %bb.65: +; RV32I-NEXT: srl a3, a7, s1 +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: bnez s1, .LBB19_67 +; RV32I-NEXT: j .LBB19_68 +; RV32I-NEXT: .LBB19_66: +; RV32I-NEXT: neg a3, s1 +; RV32I-NEXT: sll a3, a7, a3 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: beqz s1, .LBB19_68 +; RV32I-NEXT: .LBB19_67: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: .LBB19_68: +; RV32I-NEXT: bltu s1, t5, .LBB19_71 +; RV32I-NEXT: # %bb.69: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: bgeu s2, t5, .LBB19_72 +; RV32I-NEXT: .LBB19_70: +; RV32I-NEXT: sll s6, t2, s6 +; RV32I-NEXT: srl a3, t2, s3 +; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: j .LBB19_73 +; RV32I-NEXT: .LBB19_71: +; RV32I-NEXT: srl s1, a7, s3 +; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t5, .LBB19_70 +; RV32I-NEXT: .LBB19_72: +; RV32I-NEXT: li s6, 0 +; RV32I-NEXT: sll a3, t2, s2 +; RV32I-NEXT: .LBB19_73: +; RV32I-NEXT: addi s9, s2, -64 +; RV32I-NEXT: mv s5, a4 +; RV32I-NEXT: beqz s2, .LBB19_75 +; RV32I-NEXT: # %bb.74: +; RV32I-NEXT: mv s5, a3 +; RV32I-NEXT: .LBB19_75: +; RV32I-NEXT: bltu s9, t5, .LBB19_77 +; RV32I-NEXT: # %bb.76: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t1, s9 +; RV32I-NEXT: mv s7, a7 +; RV32I-NEXT: bnez s9, .LBB19_78 +; RV32I-NEXT: j .LBB19_79 +; RV32I-NEXT: .LBB19_77: +; RV32I-NEXT: sll s3, t1, s2 +; RV32I-NEXT: neg a3, s9 +; RV32I-NEXT: srl a3, t1, a3 +; RV32I-NEXT: sll s4, a7, s2 +; RV32I-NEXT: or a3, a3, s4 +; RV32I-NEXT: mv s7, a7 +; RV32I-NEXT: beqz s9, .LBB19_79 +; RV32I-NEXT: .LBB19_78: +; RV32I-NEXT: mv s7, a3 +; RV32I-NEXT: .LBB19_79: +; RV32I-NEXT: bltu s2, t6, .LBB19_81 +; RV32I-NEXT: # %bb.80: +; RV32I-NEXT: li s11, 0 +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: j .LBB19_82 +; RV32I-NEXT: .LBB19_81: +; RV32I-NEXT: or s3, a6, s6 +; RV32I-NEXT: or s7, s1, s5 +; RV32I-NEXT: .LBB19_82: +; RV32I-NEXT: addi ra, a5, -128 +; RV32I-NEXT: mv s4, t2 +; RV32I-NEXT: mv s6, a4 +; RV32I-NEXT: beqz s2, .LBB19_84 +; RV32I-NEXT: # %bb.83: +; RV32I-NEXT: mv s4, s3 +; RV32I-NEXT: mv s6, s7 +; RV32I-NEXT: .LBB19_84: +; RV32I-NEXT: neg s9, ra +; RV32I-NEXT: sll s3, a4, s9 +; RV32I-NEXT: bltu ra, t5, .LBB19_86 +; RV32I-NEXT: # %bb.85: +; RV32I-NEXT: sra a3, a4, ra +; RV32I-NEXT: mv s1, t2 +; RV32I-NEXT: bnez ra, .LBB19_87 +; RV32I-NEXT: j .LBB19_88 +; RV32I-NEXT: .LBB19_86: +; RV32I-NEXT: lw a3, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: mv s1, t2 +; RV32I-NEXT: beqz ra, .LBB19_88 +; RV32I-NEXT: .LBB19_87: +; RV32I-NEXT: mv s1, a3 +; RV32I-NEXT: .LBB19_88: +; RV32I-NEXT: bltu ra, t5, .LBB19_90 +; RV32I-NEXT: # %bb.89: +; RV32I-NEXT: srai s2, a4, 
31 +; RV32I-NEXT: srl a3, a7, ra +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: bnez ra, .LBB19_91 +; RV32I-NEXT: j .LBB19_92 +; RV32I-NEXT: .LBB19_90: +; RV32I-NEXT: sra s2, a4, a5 +; RV32I-NEXT: sll a3, a7, s9 +; RV32I-NEXT: lw a6, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: beqz ra, .LBB19_92 +; RV32I-NEXT: .LBB19_91: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: .LBB19_92: +; RV32I-NEXT: mv s5, t0 +; RV32I-NEXT: sub s10, t6, ra +; RV32I-NEXT: li t0, 64 +; RV32I-NEXT: bltu ra, t5, .LBB19_94 +; RV32I-NEXT: # %bb.93: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: j .LBB19_95 +; RV32I-NEXT: .LBB19_94: +; RV32I-NEXT: srl s7, a7, a5 +; RV32I-NEXT: .LBB19_95: +; RV32I-NEXT: mv t6, s8 +; RV32I-NEXT: mv s8, s11 +; RV32I-NEXT: bltu s10, t5, .LBB19_97 +; RV32I-NEXT: # %bb.96: +; RV32I-NEXT: li s9, 0 +; RV32I-NEXT: sll a3, t2, s10 +; RV32I-NEXT: j .LBB19_98 +; RV32I-NEXT: .LBB19_97: +; RV32I-NEXT: sll s9, t2, s9 +; RV32I-NEXT: neg a3, s10 +; RV32I-NEXT: srl a3, t2, a3 +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: .LBB19_98: +; RV32I-NEXT: addi s11, ra, -64 +; RV32I-NEXT: mv s3, a4 +; RV32I-NEXT: beqz s10, .LBB19_100 +; RV32I-NEXT: # %bb.99: +; RV32I-NEXT: mv s3, a3 +; RV32I-NEXT: .LBB19_100: +; RV32I-NEXT: bltu s11, t5, .LBB19_102 +; RV32I-NEXT: # %bb.101: +; RV32I-NEXT: sra a3, a4, s11 +; RV32I-NEXT: bnez s11, .LBB19_103 +; RV32I-NEXT: j .LBB19_104 +; RV32I-NEXT: .LBB19_102: +; RV32I-NEXT: srl a3, t2, ra +; RV32I-NEXT: mv s10, s4 +; RV32I-NEXT: neg s4, s11 +; RV32I-NEXT: sll s4, a4, s4 +; RV32I-NEXT: or a3, a3, s4 +; RV32I-NEXT: mv s4, s10 +; RV32I-NEXT: beqz s11, .LBB19_104 +; RV32I-NEXT: .LBB19_103: +; RV32I-NEXT: mv t2, a3 +; RV32I-NEXT: .LBB19_104: +; RV32I-NEXT: bltu s11, t5, .LBB19_106 +; RV32I-NEXT: # %bb.105: +; RV32I-NEXT: srai t5, a4, 31 +; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: bltu ra, t0, .LBB19_107 +; RV32I-NEXT: j .LBB19_108 +; RV32I-NEXT: .LBB19_106: +; RV32I-NEXT: sra t5, a4, ra +; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: bgeu ra, t0, .LBB19_108 +; RV32I-NEXT: .LBB19_107: +; RV32I-NEXT: or t2, a6, s9 +; RV32I-NEXT: or t5, s7, s3 +; RV32I-NEXT: .LBB19_108: +; RV32I-NEXT: li a6, 128 +; RV32I-NEXT: bnez ra, .LBB19_117 +; RV32I-NEXT: # %bb.109: +; RV32I-NEXT: bgeu ra, t0, .LBB19_118 +; RV32I-NEXT: .LBB19_110: +; RV32I-NEXT: bgeu a5, a6, .LBB19_112 +; RV32I-NEXT: .LBB19_111: +; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t1, a3, s8 +; RV32I-NEXT: lw a3, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a7, a3, t6 +; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s1, a3, s4 +; RV32I-NEXT: lw a3, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s2, a3, s6 +; RV32I-NEXT: .LBB19_112: +; RV32I-NEXT: lw ra, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: mv t0, s5 +; RV32I-NEXT: beqz a5, .LBB19_114 +; RV32I-NEXT: # %bb.113: +; RV32I-NEXT: mv s0, t1 +; RV32I-NEXT: mv a0, a7 +; RV32I-NEXT: mv t4, s1 +; RV32I-NEXT: mv t3, s2 +; RV32I-NEXT: .LBB19_114: +; RV32I-NEXT: bltu a5, a6, .LBB19_116 +; RV32I-NEXT: # %bb.115: +; RV32I-NEXT: srai a1, a4, 31 +; RV32I-NEXT: mv t0, a1 +; RV32I-NEXT: mv s11, a1 +; RV32I-NEXT: mv ra, a1 +; RV32I-NEXT: .LBB19_116: +; RV32I-NEXT: srli a4, s0, 16 +; RV32I-NEXT: lui t1, 16 +; RV32I-NEXT: srli a7, s0, 24 +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: srli t5, a0, 24 +; RV32I-NEXT: srli a6, t4, 16 +; RV32I-NEXT: srli s2, t4, 24 +; RV32I-NEXT: srli t2, t3, 16 +; RV32I-NEXT: srli s3, t3, 24 +; RV32I-NEXT: srli s1, a1, 16 +; RV32I-NEXT: srli a3, a1, 24 +; 
RV32I-NEXT: srli t6, t0, 16
+; RV32I-NEXT: srli s6, t0, 24
+; RV32I-NEXT: srli s5, s11, 16
+; RV32I-NEXT: srli s4, s11, 24
+; RV32I-NEXT: srli s7, ra, 16
+; RV32I-NEXT: srli s8, ra, 24
+; RV32I-NEXT: addi t1, t1, -1
+; RV32I-NEXT: and s9, s0, t1
+; RV32I-NEXT: and s10, a0, t1
+; RV32I-NEXT: srli s9, s9, 8
+; RV32I-NEXT: sb s0, 0(a2)
+; RV32I-NEXT: sb s9, 1(a2)
+; RV32I-NEXT: sb a4, 2(a2)
+; RV32I-NEXT: sb a7, 3(a2)
+; RV32I-NEXT: and a4, t4, t1
+; RV32I-NEXT: srli a7, s10, 8
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: sb a7, 5(a2)
+; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: sb t5, 7(a2)
+; RV32I-NEXT: and a0, t3, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb t4, 8(a2)
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: sb a6, 10(a2)
+; RV32I-NEXT: sb s2, 11(a2)
+; RV32I-NEXT: and a4, a1, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb t3, 12(a2)
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb t2, 14(a2)
+; RV32I-NEXT: sb s3, 15(a2)
+; RV32I-NEXT: and a0, t0, t1
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a1, 16(a2)
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: sb s1, 18(a2)
+; RV32I-NEXT: sb a3, 19(a2)
+; RV32I-NEXT: and a1, s11, t1
+; RV32I-NEXT: and a3, ra, t1
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a0, 21(a2)
+; RV32I-NEXT: sb t6, 22(a2)
+; RV32I-NEXT: sb s6, 23(a2)
+; RV32I-NEXT: sb s11, 24(a2)
+; RV32I-NEXT: sb a1, 25(a2)
+; RV32I-NEXT: sb s5, 26(a2)
+; RV32I-NEXT: sb s4, 27(a2)
+; RV32I-NEXT: sb ra, 28(a2)
+; RV32I-NEXT: sb a3, 29(a2)
+; RV32I-NEXT: sb s7, 30(a2)
+; RV32I-NEXT: sb s8, 31(a2)
+; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 96
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB19_117:
+; RV32I-NEXT: mv t1, t2
+; RV32I-NEXT: mv a7, t5
+; RV32I-NEXT: bltu ra, t0, .LBB19_110
+; RV32I-NEXT: .LBB19_118:
+; RV32I-NEXT: srai s1, a4, 31
+; RV32I-NEXT: mv s2, s1
+; RV32I-NEXT: bltu a5, a6, .LBB19_111
+; RV32I-NEXT: j .LBB19_112
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -96
+; RV64I-NEXT: sd s0, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte
Folded Spill +; RV64I-NEXT: sd s11, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: lbu t3, 8(a0) +; RV64I-NEXT: lbu t4, 9(a0) +; RV64I-NEXT: lbu t5, 10(a0) +; RV64I-NEXT: lbu t6, 11(a0) +; RV64I-NEXT: lbu s0, 12(a0) +; RV64I-NEXT: lbu s1, 13(a0) +; RV64I-NEXT: lbu s2, 14(a0) +; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: lbu s4, 16(a0) +; RV64I-NEXT: lbu s5, 17(a0) +; RV64I-NEXT: lbu s6, 18(a0) +; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a4, t4, t3 +; RV64I-NEXT: or a6, t6, t5 +; RV64I-NEXT: or t0, s1, s0 +; RV64I-NEXT: lbu t5, 24(a0) +; RV64I-NEXT: lbu t6, 25(a0) +; RV64I-NEXT: lbu s0, 26(a0) +; RV64I-NEXT: lbu s1, 27(a0) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or t4, s3, s2 +; RV64I-NEXT: or t2, s5, s4 +; RV64I-NEXT: or t3, s7, s6 +; RV64I-NEXT: lbu s2, 28(a0) +; RV64I-NEXT: lbu s3, 29(a0) +; RV64I-NEXT: lbu s4, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: or s6, s11, s10 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: lbu t6, 0(a1) +; RV64I-NEXT: lbu s1, 1(a1) +; RV64I-NEXT: lbu s7, 2(a1) +; RV64I-NEXT: lbu s8, 3(a1) +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, a0, s4 +; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: lbu a0, 4(a1) +; RV64I-NEXT: lbu s1, 5(a1) +; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: or s7, s8, s7 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or s4, a1, s4 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a1, t1, a7 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or a0, t4, t0 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: or t1, s6, s5 +; RV64I-NEXT: slli s0, s0, 16 +; RV64I-NEXT: or t4, s0, t5 +; RV64I-NEXT: slli s3, s3, 16 +; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: or t6, s7, t6 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: li t0, 64 +; RV64I-NEXT: slli t3, a5, 16 +; RV64I-NEXT: slli t2, a6, 16 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: slli t5, t5, 32 +; RV64I-NEXT: slli s0, s0, 32 +; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: or a5, t5, t4 +; RV64I-NEXT: or a6, s0, t6 +; RV64I-NEXT: slli a6, a6, 6 +; RV64I-NEXT: sub t1, a6, t0 +; RV64I-NEXT: negw t5, a6 +; RV64I-NEXT: sll t4, a5, t5 +; RV64I-NEXT: bltu a6, t0, .LBB20_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sra t6, a5, t1 +; RV64I-NEXT: j .LBB20_3 +; RV64I-NEXT: .LBB20_2: +; RV64I-NEXT: srl t6, a7, a6 +; RV64I-NEXT: or t6, t6, t4 +; RV64I-NEXT: .LBB20_3: +; RV64I-NEXT: or a3, t3, a3 +; 
RV64I-NEXT: slli t3, a1, 32 +; RV64I-NEXT: or t2, t2, a4 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: mv a1, a7 +; RV64I-NEXT: beqz a6, .LBB20_5 +; RV64I-NEXT: # %bb.4: +; RV64I-NEXT: mv a1, t6 +; RV64I-NEXT: .LBB20_5: +; RV64I-NEXT: or a4, t3, a3 +; RV64I-NEXT: or a3, a0, t2 +; RV64I-NEXT: bltu a6, t0, .LBB20_7 +; RV64I-NEXT: # %bb.6: +; RV64I-NEXT: srai a0, a5, 63 +; RV64I-NEXT: srl t3, a3, t1 +; RV64I-NEXT: j .LBB20_8 +; RV64I-NEXT: .LBB20_7: +; RV64I-NEXT: sra a0, a5, a6 +; RV64I-NEXT: srl t1, a4, a6 +; RV64I-NEXT: sll t2, a3, t5 +; RV64I-NEXT: or t3, t1, t2 +; RV64I-NEXT: .LBB20_8: +; RV64I-NEXT: li t1, 128 +; RV64I-NEXT: mv t2, a4 +; RV64I-NEXT: beqz a6, .LBB20_10 +; RV64I-NEXT: # %bb.9: +; RV64I-NEXT: mv t2, t3 +; RV64I-NEXT: .LBB20_10: +; RV64I-NEXT: sub t6, t1, a6 +; RV64I-NEXT: bltu a6, t0, .LBB20_13 +; RV64I-NEXT: # %bb.11: +; RV64I-NEXT: li t3, 0 +; RV64I-NEXT: bgeu t6, t0, .LBB20_14 +; RV64I-NEXT: .LBB20_12: +; RV64I-NEXT: sll t5, a7, t5 +; RV64I-NEXT: negw s0, t6 +; RV64I-NEXT: srl s0, a7, s0 +; RV64I-NEXT: or s1, s0, t4 +; RV64I-NEXT: j .LBB20_15 +; RV64I-NEXT: .LBB20_13: +; RV64I-NEXT: srl t3, a3, a6 +; RV64I-NEXT: bltu t6, t0, .LBB20_12 +; RV64I-NEXT: .LBB20_14: +; RV64I-NEXT: li t5, 0 +; RV64I-NEXT: sub t4, t6, t0 +; RV64I-NEXT: sll s1, a7, t4 +; RV64I-NEXT: .LBB20_15: +; RV64I-NEXT: sub s0, a6, t1 +; RV64I-NEXT: mv t4, a5 +; RV64I-NEXT: beqz t6, .LBB20_17 +; RV64I-NEXT: # %bb.16: +; RV64I-NEXT: mv t4, s1 +; RV64I-NEXT: .LBB20_17: +; RV64I-NEXT: bltu s0, t0, .LBB20_19 +; RV64I-NEXT: # %bb.18: +; RV64I-NEXT: sub t6, s0, t0 +; RV64I-NEXT: sra t6, a5, t6 +; RV64I-NEXT: bnez s0, .LBB20_20 +; RV64I-NEXT: j .LBB20_21 +; RV64I-NEXT: .LBB20_19: +; RV64I-NEXT: srl t6, a7, s0 +; RV64I-NEXT: negw s1, s0 +; RV64I-NEXT: sll s1, a5, s1 +; RV64I-NEXT: or t6, t6, s1 +; RV64I-NEXT: beqz s0, .LBB20_21 +; RV64I-NEXT: .LBB20_20: +; RV64I-NEXT: mv a7, t6 +; RV64I-NEXT: .LBB20_21: +; RV64I-NEXT: bltu s0, t0, .LBB20_23 +; RV64I-NEXT: # %bb.22: +; RV64I-NEXT: srai t0, a5, 63 +; RV64I-NEXT: bltu a6, t1, .LBB20_24 +; RV64I-NEXT: j .LBB20_25 +; RV64I-NEXT: .LBB20_23: +; RV64I-NEXT: sra t0, a5, s0 +; RV64I-NEXT: bgeu a6, t1, .LBB20_25 +; RV64I-NEXT: .LBB20_24: +; RV64I-NEXT: or a7, t2, t5 +; RV64I-NEXT: or t0, t3, t4 +; RV64I-NEXT: .LBB20_25: +; RV64I-NEXT: bnez a6, .LBB20_29 +; RV64I-NEXT: # %bb.26: +; RV64I-NEXT: bltu a6, t1, .LBB20_28 +; RV64I-NEXT: .LBB20_27: +; RV64I-NEXT: srai a1, a5, 63 +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB20_28: +; RV64I-NEXT: srli a5, a4, 32 +; RV64I-NEXT: srliw a6, a4, 16 +; RV64I-NEXT: lui t2, 16 +; RV64I-NEXT: srliw t1, a4, 24 +; RV64I-NEXT: srli t0, a4, 48 +; RV64I-NEXT: srli t5, a4, 56 +; RV64I-NEXT: srli a7, a3, 32 +; RV64I-NEXT: srliw t4, a3, 16 +; RV64I-NEXT: srliw s0, a3, 24 +; RV64I-NEXT: srli t6, a3, 48 +; RV64I-NEXT: srli s3, a3, 56 +; RV64I-NEXT: srli t3, a1, 32 +; RV64I-NEXT: srliw s2, a1, 16 +; RV64I-NEXT: srliw s6, a1, 24 +; RV64I-NEXT: srli s4, a1, 48 +; RV64I-NEXT: srli s7, a1, 56 +; RV64I-NEXT: srli s1, a0, 32 +; RV64I-NEXT: srliw s5, a0, 16 +; RV64I-NEXT: srliw s8, a0, 24 +; RV64I-NEXT: srli s9, a0, 48 +; RV64I-NEXT: srli s10, a0, 56 +; RV64I-NEXT: addi t2, t2, -1 +; RV64I-NEXT: and s11, a4, t2 +; RV64I-NEXT: srli s11, s11, 8 +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb s11, 1(a2) +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: sb t1, 3(a2) +; RV64I-NEXT: and a4, a5, t2 +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb a4, 5(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: and a4, a3, t2 +; 
RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: sb a4, 9(a2) +; RV64I-NEXT: sb t4, 10(a2) +; RV64I-NEXT: sb s0, 11(a2) +; RV64I-NEXT: and a3, a7, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: sb t6, 14(a2) +; RV64I-NEXT: sb s3, 15(a2) +; RV64I-NEXT: and a3, a1, t2 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a1, 16(a2) +; RV64I-NEXT: sb a3, 17(a2) +; RV64I-NEXT: sb s2, 18(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: and a1, t3, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb t3, 20(a2) +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: sb s4, 22(a2) +; RV64I-NEXT: sb s7, 23(a2) +; RV64I-NEXT: and a1, a0, t2 +; RV64I-NEXT: and a3, s1, t2 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a0, 24(a2) +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: sb s5, 26(a2) +; RV64I-NEXT: sb s8, 27(a2) +; RV64I-NEXT: sb s1, 28(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb s9, 30(a2) +; RV64I-NEXT: sb s10, 31(a2) +; RV64I-NEXT: ld s0, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 96 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB20_29: +; RV64I-NEXT: mv a4, a7 +; RV64I-NEXT: mv a3, t0 +; RV64I-NEXT: bgeu a6, t1, .LBB20_27 +; RV64I-NEXT: j .LBB20_28 +; +; RV32I-LABEL: ashr_32bytes_dwordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -96 +; RV32I-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 84(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 80(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 64(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 60(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 56(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s9, 52(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s10, 48(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s11, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 16(a0) +; RV32I-NEXT: lbu a4, 17(a0) +; RV32I-NEXT: lbu a5, 18(a0) +; RV32I-NEXT: lbu a6, 19(a0) +; RV32I-NEXT: lbu a7, 20(a0) +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 22(a0) +; RV32I-NEXT: lbu t2, 23(a0) +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu t4, 25(a0) +; RV32I-NEXT: lbu t5, 26(a0) +; RV32I-NEXT: lbu t6, 27(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, t2, t1 +; RV32I-NEXT: lbu a7, 28(a0) +; RV32I-NEXT: lbu t0, 29(a0) +; RV32I-NEXT: lbu t1, 30(a0) +; RV32I-NEXT: lbu t2, 31(a0) +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or t4, t6, t5 +; RV32I-NEXT: or t0, t0, a7 +; RV32I-NEXT: lbu a7, 0(a1) +; RV32I-NEXT: lbu t5, 1(a1) +; 
RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: or s0, t5, a7 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: li t5, 32 +; RV32I-NEXT: slli a7, a4, 16 +; RV32I-NEXT: slli a1, a5, 16 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli a5, t2, 16 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or a4, t1, t0 +; RV32I-NEXT: or a5, a5, s0 +; RV32I-NEXT: slli a5, a5, 6 +; RV32I-NEXT: srl s0, t2, a5 +; RV32I-NEXT: neg s6, a5 +; RV32I-NEXT: sll s1, a4, s6 +; RV32I-NEXT: bltu a5, t5, .LBB20_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sra t0, a4, a5 +; RV32I-NEXT: j .LBB20_3 +; RV32I-NEXT: .LBB20_2: +; RV32I-NEXT: or t0, s0, s1 +; RV32I-NEXT: .LBB20_3: +; RV32I-NEXT: or t1, a7, a3 +; RV32I-NEXT: or a7, a1, a6 +; RV32I-NEXT: mv t3, t2 +; RV32I-NEXT: beqz a5, .LBB20_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv t3, t0 +; RV32I-NEXT: .LBB20_5: +; RV32I-NEXT: srl a3, t1, a5 +; RV32I-NEXT: sll a1, a7, s6 +; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu a5, t5, .LBB20_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: srai t4, a4, 31 +; RV32I-NEXT: srl a1, a7, a5 +; RV32I-NEXT: j .LBB20_8 +; RV32I-NEXT: .LBB20_7: +; RV32I-NEXT: sra t4, a4, a5 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: .LBB20_8: +; RV32I-NEXT: li t6, 64 +; RV32I-NEXT: mv t0, t1 +; RV32I-NEXT: beqz a5, .LBB20_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv t0, a1 +; RV32I-NEXT: .LBB20_10: +; RV32I-NEXT: sub s7, t6, a5 +; RV32I-NEXT: bltu a5, t5, .LBB20_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: j .LBB20_13 +; RV32I-NEXT: .LBB20_12: +; RV32I-NEXT: srl a1, a7, a5 +; RV32I-NEXT: .LBB20_13: +; RV32I-NEXT: sw a3, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: neg s10, s7 +; RV32I-NEXT: bltu s7, t5, .LBB20_15 +; RV32I-NEXT: # %bb.14: +; RV32I-NEXT: li a6, 0 +; RV32I-NEXT: sll a3, t2, s7 +; RV32I-NEXT: j .LBB20_16 +; RV32I-NEXT: .LBB20_15: +; RV32I-NEXT: sll a6, t2, s6 +; RV32I-NEXT: srl a3, t2, s10 +; RV32I-NEXT: or a3, a3, s1 +; RV32I-NEXT: .LBB20_16: +; RV32I-NEXT: sw t3, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi s9, a5, -64 +; RV32I-NEXT: mv t3, a4 +; RV32I-NEXT: beqz s7, .LBB20_18 +; RV32I-NEXT: # %bb.17: +; RV32I-NEXT: mv t3, a3 +; RV32I-NEXT: .LBB20_18: +; RV32I-NEXT: neg s11, s9 +; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s9, t5, .LBB20_20 +; RV32I-NEXT: # %bb.19: +; RV32I-NEXT: sra s0, a4, s9 +; RV32I-NEXT: j .LBB20_21 +; RV32I-NEXT: .LBB20_20: +; RV32I-NEXT: sll a3, a4, s11 +; RV32I-NEXT: or s0, s0, a3 +; RV32I-NEXT: .LBB20_21: +; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s3, 11(a0) +; RV32I-NEXT: lbu a3, 15(a0) +; RV32I-NEXT: mv t4, t2 +; RV32I-NEXT: beqz s9, .LBB20_23 +; RV32I-NEXT: # %bb.22: +; RV32I-NEXT: mv t4, s0 +; RV32I-NEXT: .LBB20_23: +; RV32I-NEXT: lbu s2, 9(a0) +; RV32I-NEXT: lbu s1, 10(a0) +; RV32I-NEXT: lbu s8, 13(a0) +; RV32I-NEXT: lbu ra, 14(a0) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: bltu s9, t5, .LBB20_25 +; RV32I-NEXT: # %bb.24: +; RV32I-NEXT: srai s0, a4, 31 +; RV32I-NEXT: j .LBB20_26 +; RV32I-NEXT: .LBB20_25: +; RV32I-NEXT: sra s0, a4, a5 +; RV32I-NEXT: .LBB20_26: +; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s3, 12(a0) +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s4, s8, 8 +; RV32I-NEXT: or s8, a3, ra +; RV32I-NEXT: bgeu a5, t6, .LBB20_28 +; RV32I-NEXT: 
# %bb.27: +; RV32I-NEXT: or t4, t0, a6 +; RV32I-NEXT: or s0, a1, t3 +; RV32I-NEXT: .LBB20_28: +; RV32I-NEXT: lbu a3, 3(a0) +; RV32I-NEXT: lbu t3, 7(a0) +; RV32I-NEXT: or a6, s2, s5 +; RV32I-NEXT: slli s2, s1, 16 +; RV32I-NEXT: or s1, s4, s3 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: mv t0, a7 +; RV32I-NEXT: beqz a5, .LBB20_30 +; RV32I-NEXT: # %bb.29: +; RV32I-NEXT: mv a1, t4 +; RV32I-NEXT: mv t0, s0 +; RV32I-NEXT: .LBB20_30: +; RV32I-NEXT: slli s5, a3, 8 +; RV32I-NEXT: lbu ra, 1(a0) +; RV32I-NEXT: lbu a3, 2(a0) +; RV32I-NEXT: lbu s3, 5(a0) +; RV32I-NEXT: lbu s0, 6(a0) +; RV32I-NEXT: slli s4, t3, 8 +; RV32I-NEXT: or t4, s2, a6 +; RV32I-NEXT: or t3, s8, s1 +; RV32I-NEXT: bltu a5, t6, .LBB20_32 +; RV32I-NEXT: # %bb.31: +; RV32I-NEXT: srai a6, a4, 31 +; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: .LBB20_32: +; RV32I-NEXT: slli a6, ra, 8 +; RV32I-NEXT: or a3, s5, a3 +; RV32I-NEXT: lbu s1, 0(a0) +; RV32I-NEXT: lbu a0, 4(a0) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s0, s4, s0 +; RV32I-NEXT: srl s2, t4, a5 +; RV32I-NEXT: sll ra, t3, s6 +; RV32I-NEXT: bltu a5, t5, .LBB20_34 +; RV32I-NEXT: # %bb.33: +; RV32I-NEXT: srl s4, t3, a5 +; RV32I-NEXT: j .LBB20_35 +; RV32I-NEXT: .LBB20_34: +; RV32I-NEXT: or s4, s2, ra +; RV32I-NEXT: .LBB20_35: +; RV32I-NEXT: or a6, a6, s1 +; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: or a0, s3, a0 +; RV32I-NEXT: slli s1, s0, 16 +; RV32I-NEXT: mv s5, t4 +; RV32I-NEXT: beqz a5, .LBB20_37 +; RV32I-NEXT: # %bb.36: +; RV32I-NEXT: mv s5, s4 +; RV32I-NEXT: .LBB20_37: +; RV32I-NEXT: or s0, a3, a6 +; RV32I-NEXT: or a0, s1, a0 +; RV32I-NEXT: bltu a5, t5, .LBB20_39 +; RV32I-NEXT: # %bb.38: +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: srl a3, a0, a5 +; RV32I-NEXT: mv a6, s0 +; RV32I-NEXT: bnez a5, .LBB20_40 +; RV32I-NEXT: j .LBB20_41 +; RV32I-NEXT: .LBB20_39: +; RV32I-NEXT: srl s4, t3, a5 +; RV32I-NEXT: srl a3, s0, a5 +; RV32I-NEXT: sll a6, a0, s6 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: mv a6, s0 +; RV32I-NEXT: beqz a5, .LBB20_41 +; RV32I-NEXT: .LBB20_40: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: .LBB20_41: +; RV32I-NEXT: bltu a5, t5, .LBB20_44 +; RV32I-NEXT: # %bb.42: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: bgeu s7, t5, .LBB20_45 +; RV32I-NEXT: .LBB20_43: +; RV32I-NEXT: sll s3, t4, s6 +; RV32I-NEXT: srl a3, t4, s10 +; RV32I-NEXT: or a3, a3, ra +; RV32I-NEXT: mv s10, t3 +; RV32I-NEXT: bnez s7, .LBB20_46 +; RV32I-NEXT: j .LBB20_47 +; RV32I-NEXT: .LBB20_44: +; RV32I-NEXT: srl s1, a0, a5 +; RV32I-NEXT: bltu s7, t5, .LBB20_43 +; RV32I-NEXT: .LBB20_45: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t4, s7 +; RV32I-NEXT: mv s10, t3 +; RV32I-NEXT: beqz s7, .LBB20_47 +; RV32I-NEXT: .LBB20_46: +; RV32I-NEXT: mv s10, a3 +; RV32I-NEXT: .LBB20_47: +; RV32I-NEXT: bltu s9, t5, .LBB20_49 +; RV32I-NEXT: # %bb.48: +; RV32I-NEXT: srl a3, t3, s9 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: bnez s9, .LBB20_50 +; RV32I-NEXT: j .LBB20_51 +; RV32I-NEXT: .LBB20_49: +; RV32I-NEXT: sll a3, t3, s11 +; RV32I-NEXT: or a3, s2, a3 +; RV32I-NEXT: mv s2, t4 +; RV32I-NEXT: beqz s9, .LBB20_51 +; RV32I-NEXT: .LBB20_50: +; RV32I-NEXT: mv s2, a3 +; RV32I-NEXT: .LBB20_51: +; RV32I-NEXT: bltu s9, t5, .LBB20_53 +; RV32I-NEXT: # %bb.52: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: bltu a5, t6, .LBB20_54 +; RV32I-NEXT: j .LBB20_55 +; RV32I-NEXT: .LBB20_53: +; RV32I-NEXT: srl s7, t3, a5 +; RV32I-NEXT: bgeu a5, t6, .LBB20_55 +; RV32I-NEXT: .LBB20_54: +; RV32I-NEXT: or s2, a6, s3 +; RV32I-NEXT: or s7, s1, s10 +; RV32I-NEXT: .LBB20_55: +; 
RV32I-NEXT: li a3, 128 +; RV32I-NEXT: mv a6, s0 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: beqz a5, .LBB20_57 +; RV32I-NEXT: # %bb.56: +; RV32I-NEXT: mv a6, s2 +; RV32I-NEXT: mv s1, s7 +; RV32I-NEXT: .LBB20_57: +; RV32I-NEXT: sw a6, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sub s2, a3, a5 +; RV32I-NEXT: bltu a5, t6, .LBB20_59 +; RV32I-NEXT: # %bb.58: +; RV32I-NEXT: li s5, 0 +; RV32I-NEXT: li s4, 0 +; RV32I-NEXT: .LBB20_59: +; RV32I-NEXT: neg s3, s2 +; RV32I-NEXT: srl a6, t1, s3 +; RV32I-NEXT: sw s4, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t5, .LBB20_61 +; RV32I-NEXT: # %bb.60: +; RV32I-NEXT: li s11, 0 +; RV32I-NEXT: sll a3, t1, s2 +; RV32I-NEXT: j .LBB20_62 +; RV32I-NEXT: .LBB20_61: +; RV32I-NEXT: sll s11, t1, s6 +; RV32I-NEXT: lw a3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: .LBB20_62: +; RV32I-NEXT: sub s1, t6, s2 +; RV32I-NEXT: mv s8, a7 +; RV32I-NEXT: beqz s2, .LBB20_64 +; RV32I-NEXT: # %bb.63: +; RV32I-NEXT: mv s8, a3 +; RV32I-NEXT: .LBB20_64: +; RV32I-NEXT: bltu s1, t5, .LBB20_66 +; RV32I-NEXT: # %bb.65: +; RV32I-NEXT: srl a3, a7, s1 +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: bnez s1, .LBB20_67 +; RV32I-NEXT: j .LBB20_68 +; RV32I-NEXT: .LBB20_66: +; RV32I-NEXT: neg a3, s1 +; RV32I-NEXT: sll a3, a7, a3 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: beqz s1, .LBB20_68 +; RV32I-NEXT: .LBB20_67: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: .LBB20_68: +; RV32I-NEXT: bltu s1, t5, .LBB20_71 +; RV32I-NEXT: # %bb.69: +; RV32I-NEXT: li s1, 0 +; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: bgeu s2, t5, .LBB20_72 +; RV32I-NEXT: .LBB20_70: +; RV32I-NEXT: sll s6, t2, s6 +; RV32I-NEXT: srl a3, t2, s3 +; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: j .LBB20_73 +; RV32I-NEXT: .LBB20_71: +; RV32I-NEXT: srl s1, a7, s3 +; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: bltu s2, t5, .LBB20_70 +; RV32I-NEXT: .LBB20_72: +; RV32I-NEXT: li s6, 0 +; RV32I-NEXT: sll a3, t2, s2 +; RV32I-NEXT: .LBB20_73: +; RV32I-NEXT: addi s9, s2, -64 +; RV32I-NEXT: mv s5, a4 +; RV32I-NEXT: beqz s2, .LBB20_75 +; RV32I-NEXT: # %bb.74: +; RV32I-NEXT: mv s5, a3 +; RV32I-NEXT: .LBB20_75: +; RV32I-NEXT: bltu s9, t5, .LBB20_77 +; RV32I-NEXT: # %bb.76: +; RV32I-NEXT: li s3, 0 +; RV32I-NEXT: sll a3, t1, s9 +; RV32I-NEXT: mv s7, a7 +; RV32I-NEXT: bnez s9, .LBB20_78 +; RV32I-NEXT: j .LBB20_79 +; RV32I-NEXT: .LBB20_77: +; RV32I-NEXT: sll s3, t1, s2 +; RV32I-NEXT: neg a3, s9 +; RV32I-NEXT: srl a3, t1, a3 +; RV32I-NEXT: sll s4, a7, s2 +; RV32I-NEXT: or a3, a3, s4 +; RV32I-NEXT: mv s7, a7 +; RV32I-NEXT: beqz s9, .LBB20_79 +; RV32I-NEXT: .LBB20_78: +; RV32I-NEXT: mv s7, a3 +; RV32I-NEXT: .LBB20_79: +; RV32I-NEXT: bltu s2, t6, .LBB20_81 +; RV32I-NEXT: # %bb.80: +; RV32I-NEXT: li s11, 0 +; RV32I-NEXT: li s8, 0 +; RV32I-NEXT: j .LBB20_82 +; RV32I-NEXT: .LBB20_81: +; RV32I-NEXT: or s3, a6, s6 +; RV32I-NEXT: or s7, s1, s5 +; RV32I-NEXT: .LBB20_82: +; RV32I-NEXT: addi ra, a5, -128 +; RV32I-NEXT: mv s4, t2 +; RV32I-NEXT: mv s6, a4 +; RV32I-NEXT: beqz s2, .LBB20_84 +; RV32I-NEXT: # %bb.83: +; RV32I-NEXT: mv s4, s3 +; RV32I-NEXT: mv s6, s7 +; RV32I-NEXT: .LBB20_84: +; RV32I-NEXT: neg s9, ra +; RV32I-NEXT: sll s3, a4, s9 +; RV32I-NEXT: bltu ra, t5, .LBB20_86 +; RV32I-NEXT: # %bb.85: +; RV32I-NEXT: sra a3, a4, ra +; RV32I-NEXT: mv s1, t2 +; RV32I-NEXT: bnez ra, .LBB20_87 +; RV32I-NEXT: j .LBB20_88 +; RV32I-NEXT: .LBB20_86: +; RV32I-NEXT: lw a3, 32(sp) # 4-byte Folded 
Reload +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: mv s1, t2 +; RV32I-NEXT: beqz ra, .LBB20_88 +; RV32I-NEXT: .LBB20_87: +; RV32I-NEXT: mv s1, a3 +; RV32I-NEXT: .LBB20_88: +; RV32I-NEXT: bltu ra, t5, .LBB20_90 +; RV32I-NEXT: # %bb.89: +; RV32I-NEXT: srai s2, a4, 31 +; RV32I-NEXT: srl a3, a7, ra +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: bnez ra, .LBB20_91 +; RV32I-NEXT: j .LBB20_92 +; RV32I-NEXT: .LBB20_90: +; RV32I-NEXT: sra s2, a4, a5 +; RV32I-NEXT: sll a3, a7, s9 +; RV32I-NEXT: lw a6, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: mv a6, t1 +; RV32I-NEXT: beqz ra, .LBB20_92 +; RV32I-NEXT: .LBB20_91: +; RV32I-NEXT: mv a6, a3 +; RV32I-NEXT: .LBB20_92: +; RV32I-NEXT: mv s5, t0 +; RV32I-NEXT: sub s10, t6, ra +; RV32I-NEXT: li t0, 64 +; RV32I-NEXT: bltu ra, t5, .LBB20_94 +; RV32I-NEXT: # %bb.93: +; RV32I-NEXT: li s7, 0 +; RV32I-NEXT: j .LBB20_95 +; RV32I-NEXT: .LBB20_94: +; RV32I-NEXT: srl s7, a7, a5 +; RV32I-NEXT: .LBB20_95: +; RV32I-NEXT: mv t6, s8 +; RV32I-NEXT: mv s8, s11 +; RV32I-NEXT: bltu s10, t5, .LBB20_97 +; RV32I-NEXT: # %bb.96: +; RV32I-NEXT: li s9, 0 +; RV32I-NEXT: sll a3, t2, s10 +; RV32I-NEXT: j .LBB20_98 +; RV32I-NEXT: .LBB20_97: +; RV32I-NEXT: sll s9, t2, s9 +; RV32I-NEXT: neg a3, s10 +; RV32I-NEXT: srl a3, t2, a3 +; RV32I-NEXT: or a3, a3, s3 +; RV32I-NEXT: .LBB20_98: +; RV32I-NEXT: addi s11, ra, -64 +; RV32I-NEXT: mv s3, a4 +; RV32I-NEXT: beqz s10, .LBB20_100 +; RV32I-NEXT: # %bb.99: +; RV32I-NEXT: mv s3, a3 +; RV32I-NEXT: .LBB20_100: +; RV32I-NEXT: bltu s11, t5, .LBB20_102 +; RV32I-NEXT: # %bb.101: +; RV32I-NEXT: sra a3, a4, s11 +; RV32I-NEXT: bnez s11, .LBB20_103 +; RV32I-NEXT: j .LBB20_104 +; RV32I-NEXT: .LBB20_102: +; RV32I-NEXT: srl a3, t2, ra +; RV32I-NEXT: mv s10, s4 +; RV32I-NEXT: neg s4, s11 +; RV32I-NEXT: sll s4, a4, s4 +; RV32I-NEXT: or a3, a3, s4 +; RV32I-NEXT: mv s4, s10 +; RV32I-NEXT: beqz s11, .LBB20_104 +; RV32I-NEXT: .LBB20_103: +; RV32I-NEXT: mv t2, a3 +; RV32I-NEXT: .LBB20_104: +; RV32I-NEXT: bltu s11, t5, .LBB20_106 +; RV32I-NEXT: # %bb.105: +; RV32I-NEXT: srai t5, a4, 31 +; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: bltu ra, t0, .LBB20_107 +; RV32I-NEXT: j .LBB20_108 +; RV32I-NEXT: .LBB20_106: +; RV32I-NEXT: sra t5, a4, ra +; RV32I-NEXT: lw s11, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: bgeu ra, t0, .LBB20_108 +; RV32I-NEXT: .LBB20_107: +; RV32I-NEXT: or t2, a6, s9 +; RV32I-NEXT: or t5, s7, s3 +; RV32I-NEXT: .LBB20_108: +; RV32I-NEXT: li a6, 128 +; RV32I-NEXT: bnez ra, .LBB20_117 +; RV32I-NEXT: # %bb.109: +; RV32I-NEXT: bgeu ra, t0, .LBB20_118 +; RV32I-NEXT: .LBB20_110: +; RV32I-NEXT: bgeu a5, a6, .LBB20_112 +; RV32I-NEXT: .LBB20_111: +; RV32I-NEXT: lw a3, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: or t1, a3, s8 +; RV32I-NEXT: lw a3, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a7, a3, t6 +; RV32I-NEXT: lw a3, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s1, a3, s4 +; RV32I-NEXT: lw a3, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or s2, a3, s6 +; RV32I-NEXT: .LBB20_112: +; RV32I-NEXT: lw ra, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: mv t0, s5 +; RV32I-NEXT: beqz a5, .LBB20_114 +; RV32I-NEXT: # %bb.113: +; RV32I-NEXT: mv s0, t1 +; RV32I-NEXT: mv a0, a7 +; RV32I-NEXT: mv t4, s1 +; RV32I-NEXT: mv t3, s2 +; RV32I-NEXT: .LBB20_114: +; RV32I-NEXT: bltu a5, a6, .LBB20_116 +; RV32I-NEXT: # %bb.115: +; RV32I-NEXT: srai a1, a4, 31 +; RV32I-NEXT: mv t0, a1 +; RV32I-NEXT: mv s11, a1 +; RV32I-NEXT: mv ra, a1 +; RV32I-NEXT: .LBB20_116: +; RV32I-NEXT: srli a4, s0, 16 +; RV32I-NEXT: lui t1, 16 +; RV32I-NEXT: 
srli a7, s0, 24 +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: srli t5, a0, 24 +; RV32I-NEXT: srli a6, t4, 16 +; RV32I-NEXT: srli s2, t4, 24 +; RV32I-NEXT: srli t2, t3, 16 +; RV32I-NEXT: srli s3, t3, 24 +; RV32I-NEXT: srli s1, a1, 16 +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: srli t6, t0, 16 +; RV32I-NEXT: srli s6, t0, 24 +; RV32I-NEXT: srli s5, s11, 16 +; RV32I-NEXT: srli s4, s11, 24 +; RV32I-NEXT: srli s7, ra, 16 +; RV32I-NEXT: srli s8, ra, 24 +; RV32I-NEXT: addi t1, t1, -1 +; RV32I-NEXT: and s9, s0, t1 +; RV32I-NEXT: and s10, a0, t1 +; RV32I-NEXT: srli s9, s9, 8 +; RV32I-NEXT: sb s0, 0(a2) +; RV32I-NEXT: sb s9, 1(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb a7, 3(a2) +; RV32I-NEXT: and a4, t4, t1 +; RV32I-NEXT: srli a7, s10, 8 +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: sb t5, 7(a2) +; RV32I-NEXT: and a0, t3, t1 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb t4, 8(a2) +; RV32I-NEXT: sb a4, 9(a2) +; RV32I-NEXT: sb a6, 10(a2) +; RV32I-NEXT: sb s2, 11(a2) +; RV32I-NEXT: and a4, a1, t1 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb t3, 12(a2) +; RV32I-NEXT: sb a0, 13(a2) +; RV32I-NEXT: sb t2, 14(a2) +; RV32I-NEXT: sb s3, 15(a2) +; RV32I-NEXT: and a0, t0, t1 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a1, 16(a2) +; RV32I-NEXT: sb a4, 17(a2) +; RV32I-NEXT: sb s1, 18(a2) +; RV32I-NEXT: sb a3, 19(a2) +; RV32I-NEXT: and a1, s11, t1 +; RV32I-NEXT: and a3, ra, t1 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb t0, 20(a2) +; RV32I-NEXT: sb a0, 21(a2) +; RV32I-NEXT: sb t6, 22(a2) +; RV32I-NEXT: sb s6, 23(a2) +; RV32I-NEXT: sb s11, 24(a2) +; RV32I-NEXT: sb a1, 25(a2) +; RV32I-NEXT: sb s5, 26(a2) +; RV32I-NEXT: sb s4, 27(a2) +; RV32I-NEXT: sb ra, 28(a2) +; RV32I-NEXT: sb a3, 29(a2) +; RV32I-NEXT: sb s7, 30(a2) +; RV32I-NEXT: sb s8, 31(a2) +; RV32I-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 84(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 80(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 64(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 60(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 56(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s9, 52(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s10, 48(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s11, 44(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 96 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB20_117: +; RV32I-NEXT: mv t1, t2 +; RV32I-NEXT: mv a7, t5 +; RV32I-NEXT: bltu ra, t0, .LBB20_110 +; RV32I-NEXT: .LBB20_118: +; RV32I-NEXT: srai s1, a4, 31 +; RV32I-NEXT: mv s2, s1 +; RV32I-NEXT: bltu a5, a6, .LBB20_111 +; RV32I-NEXT: j .LBB20_112 + %src = load i256, ptr %src.ptr, align 1 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 6 + %res = ashr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} From 91ce4aa1022bcc83a6f607f4663798063cb0ab4f Mon Sep 17 00:00:00 2001 From: Luke Quinn Date: Wed, 22 Jan 2025 08:30:40 -0800 Subject: [PATCH 2/3] [RISCV] Add GISelPredicateCode TD bindings and hasAllNBitUsers prototypes, staging the addition of full support Signed-off-by: Luke Quinn --- .../RISCV/GISel/RISCVInstructionSelector.cpp | 18 ++++++++++++++++++ llvm/lib/Target/RISCV/RISCVInstrInfo.td | 8 ++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff
--git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 6a42fdf3c3567..8dabda15a04a3 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -57,6 +57,20 @@ class RISCVInstructionSelector : public InstructionSelector { const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) const; + static constexpr unsigned MaxRecursionDepth = 6; + + bool hasAllNBitUsers(const MachineInstr &MI, unsigned Bits, + const unsigned Depth = 0) const; + bool hasAllBUsers(const MachineInstr &MI) const { + return hasAllNBitUsers(MI, 8); + } + bool hasAllHUsers(const MachineInstr &MI) const { + return hasAllNBitUsers(MI, 16); + } + bool hasAllWUsers(const MachineInstr &MI) const { + return hasAllNBitUsers(MI, 32); + } + bool isRegInGprb(Register Reg) const; bool isRegInFprb(Register Reg) const; @@ -184,6 +198,10 @@ RISCVInstructionSelector::RISCVInstructionSelector( { } +bool RISCVInstructionSelector::hasAllNBitUsers(const MachineInstr &MI, unsigned Bits, const unsigned Depth) const { + return false; +}; + InstructionSelector::ComplexRendererFns RISCVInstructionSelector::selectShiftMask(MachineOperand &Root, unsigned ShiftWidth) const { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index fec10864f95dc..641112ee57472 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1949,7 +1949,9 @@ class binop_allhusers : PatFrag<(ops node:$lhs, node:$rhs), (XLenVT (operator node:$lhs, node:$rhs)), [{ return hasAllHUsers(Node); -}]>; +}]> { + let GISelPredicateCode = [{ return hasAllHUsers(MI); }]; +} // PatFrag to allow ADDW/SUBW/MULW/SLLW to be selected from i64 add/sub/mul/shl // if only the lower 32 bits of their result is used. @@ -1957,7 +1959,9 @@ class binop_allwusers : PatFrag<(ops node:$lhs, node:$rhs), (i64 (operator node:$lhs, node:$rhs)), [{ return hasAllWUsers(Node); -}]>; +}]> { + let GISelPredicateCode = [{ return hasAllWUsers(MI); }]; +} def sexti32_allwusers : PatFrag<(ops node:$src), (sext_inreg node:$src, i32), [{ From 6ea2ccd1648fed2c02ebd380c018b48aa6441615 Mon Sep 17 00:00:00 2001 From: Luke Quinn Date: Tue, 4 Feb 2025 10:23:34 -0800 Subject: [PATCH 3/3] [RISCV] Add hasAllNBitUsers Functional change: allows generation of packw instructions along with other generic instructions with narrow w types. The optimization pass was reduced from ISel for testing coverage. Signed-off-by: Luke Quinn --- .../RISCV/GISel/RISCVInstructionSelector.cpp | 76 +++++++++++- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 6 +- llvm/test/CodeGen/RISCV/GlobalISel/combine.ll | 2 +- .../RISCV/GlobalISel/div-by-constant.ll | 6 +- .../CodeGen/RISCV/GlobalISel/rotl-rotr.ll | 116 +++++++++--------- .../CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll | 16 +-- llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll | 76 ++++++------ .../test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll | 15 +-- llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll | 14 +-- ...lar-shift-by-byte-multiple-legalization.ll | 66 +++++----- 10 files changed, 226 insertions(+), 167 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 8dabda15a04a3..d5d422226281b 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -61,9 +61,6 @@ class RISCVInstructionSelector : public InstructionSelector { bool hasAllNBitUsers(const MachineInstr &MI, unsigned Bits, const unsigned Depth = 0) const; - bool hasAllBUsers(const MachineInstr &MI) const { - return hasAllNBitUsers(MI, 8); - } bool hasAllHUsers(const MachineInstr &MI) const { return hasAllNBitUsers(MI, 16); } @@ -198,9 +195,78 @@ RISCVInstructionSelector::RISCVInstructionSelector( { } -bool RISCVInstructionSelector::hasAllNBitUsers(const MachineInstr &MI, unsigned Bits, const unsigned Depth) const { +// Mimics optimizations in ISel and the RISCVOptWInst pass. +bool RISCVInstructionSelector::hasAllNBitUsers(const MachineInstr &MI, + unsigned Bits, + const unsigned Depth) const { + + assert((MI.getOpcode() == TargetOpcode::G_ADD || + MI.getOpcode() == TargetOpcode::G_SUB || + MI.getOpcode() == TargetOpcode::G_MUL || + MI.getOpcode() == TargetOpcode::G_SHL || + MI.getOpcode() == TargetOpcode::G_LSHR || + MI.getOpcode() == TargetOpcode::G_AND || + MI.getOpcode() == TargetOpcode::G_OR || + MI.getOpcode() == TargetOpcode::G_XOR || + MI.getOpcode() == TargetOpcode::G_SEXT_INREG || Depth != 0) && + "Unexpected opcode"); + + if (Depth >= RISCVInstructionSelector::MaxRecursionDepth) return false; -}; + + auto DestReg = MI.getOperand(0).getReg(); + for (auto &UserOp : MRI->use_nodbg_operands(DestReg)) { + assert(UserOp.getParent() && "UserOp must have a parent"); + const MachineInstr &UserMI = *UserOp.getParent(); + unsigned OpIdx = UserOp.getOperandNo(); + + switch (UserMI.getOpcode()) { + default: + return false; + case RISCV::ADDW: + case RISCV::ADDIW: + case RISCV::SUBW: + if (Bits >= 32) + break; + return false; + case RISCV::SLL: + case RISCV::SRA: + case RISCV::SRL: + // Shift amount operands only use log2(XLen) bits. + if (OpIdx == 2 && Bits >= Log2_32(Subtarget->getXLen())) + break; + return false; + case RISCV::SLLI: + // SLLI only uses the lower (XLen - ShAmt) bits.
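+ // For example, with XLen = 64, an slli by 40 only uses bits [23:0] of its source.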
+ if (Bits >= Subtarget->getXLen() - UserMI.getOperand(2).getImm()) + break; + return false; + case RISCV::ANDI: + if (Bits >= (unsigned)llvm::bit_width( + (uint64_t)UserMI.getOperand(2).getImm())) + break; + goto RecCheck; + case RISCV::AND: + case RISCV::OR: + case RISCV::XOR: + RecCheck: + if (hasAllNBitUsers(UserMI, Bits, Depth + 1)) + break; + return false; + case RISCV::SRLI: { + unsigned ShAmt = UserMI.getOperand(2).getImm(); + // If we are shifting right by less than Bits, and users don't demand any + // bits that were shifted into [Bits-1:0], then we can consider this as an + // N-Bit user. + if (Bits > ShAmt && hasAllNBitUsers(UserMI, Bits - ShAmt, Depth + 1)) + break; + return false; + } + } + } + + return true; +} InstructionSelector::ComplexRendererFns RISCVInstructionSelector::selectShiftMask(MachineOperand &Root, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 641112ee57472..54fee1ac3130e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1950,14 +1950,14 @@ class binop_allhusers (XLenVT (operator node:$lhs, node:$rhs)), [{ return hasAllHUsers(Node); }]> { - let GISelPredicateCode = [{ return hasAllHUsers(MI); }]; + let GISelPredicateCode = [{ return hasAllHUsers(MI); }]; } // PatFrag to allow ADDW/SUBW/MULW/SLLW to be selected from i64 add/sub/mul/shl // if only the lower 32 bits of their result is used. class binop_allwusers - : PatFrag<(ops node:$lhs, node:$rhs), - (i64 (operator node:$lhs, node:$rhs)), [{ + : PatFrag<(ops node:$lhs, node:$rhs), (i64 (operator node:$lhs, node:$rhs)), + [{ return hasAllWUsers(Node); }]> { let GISelPredicateCode = [{ return hasAllWUsers(MI); }]; diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll index 360e84d37ec85..61d1fa5a5b9f4 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll @@ -20,7 +20,7 @@ define i32 @constant_to_rhs(i32 %x) { ; RV64-O0: # %bb.0: ; RV64-O0-NEXT: mv a1, a0 ; RV64-O0-NEXT: li a0, 1 -; RV64-O0-NEXT: add a0, a0, a1 +; RV64-O0-NEXT: addw a0, a0, a1 ; RV64-O0-NEXT: sext.w a0, a0 ; RV64-O0-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll index e3616a79add9f..f62902cdd14d9 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll @@ -66,7 +66,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind { ; RV64IM-NEXT: srli a2, a2, 32 ; RV64IM-NEXT: mul a1, a2, a1 ; RV64IM-NEXT: srli a1, a1, 32 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: srliw a0, a0, 1 ; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: srliw a0, a0, 2 @@ -79,7 +79,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind { ; RV64IMZB-NEXT: zext.w a2, a0 ; RV64IMZB-NEXT: mul a1, a2, a1 ; RV64IMZB-NEXT: srli a1, a1, 32 -; RV64IMZB-NEXT: sub a0, a0, a1 +; RV64IMZB-NEXT: subw a0, a0, a1 ; RV64IMZB-NEXT: srliw a0, a0, 1 ; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: srliw a0, a0, 2 @@ -265,7 +265,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind { ; RV64-NEXT: andi a2, a0, 255 ; RV64-NEXT: mul a1, a2, a1 ; RV64-NEXT: srli a1, a1, 8 -; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: subw a0, a0, a1 ; RV64-NEXT: andi a0, a0, 255 ; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: add a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll 
b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll index 46d1661983c6a..8a786fc9993d2 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll @@ -29,7 +29,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotl_32: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -55,7 +55,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_32: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: negw a2, a1 ; RV64XTHEADBB-NEXT: sllw a1, a0, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -78,7 +78,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotr_32: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -104,7 +104,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_32: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: negw a2, a1 ; RV64XTHEADBB-NEXT: srlw a1, a0, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -167,7 +167,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotl_64: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -276,7 +276,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_64: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: negw a2, a1 ; RV64XTHEADBB-NEXT: sll a1, a0, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -340,7 +340,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotr_64: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -451,7 +451,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_64: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: negw a2, a1 ; RV64XTHEADBB-NEXT: srl a1, a0, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -474,7 +474,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotl_32_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -490,7 +490,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64ZBB-LABEL: rotl_32_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: neg a2, a1 +; RV64ZBB-NEXT: negw a2, a1 ; RV64ZBB-NEXT: sllw a1, a0, a1 ; RV64ZBB-NEXT: srlw a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -506,7 +506,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_32_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: negw a2, a1 ; RV64XTHEADBB-NEXT: sllw a1, a0, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -531,7 +531,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64I-LABEL: rotl_32_mask_and_63_and_31: ; RV64I: # %bb.0: ; RV64I-NEXT: sllw a2, a0, a1 -; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: negw a1, a1 ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret 
@@ -547,7 +547,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotl_32_mask_and_63_and_31: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sllw a2, a0, a1 -; RV64ZBB-NEXT: neg a1, a1 +; RV64ZBB-NEXT: negw a1, a1 ; RV64ZBB-NEXT: srlw a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -563,7 +563,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sllw a2, a0, a1 -; RV64XTHEADBB-NEXT: neg a1, a1 +; RV64XTHEADBB-NEXT: negw a1, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -632,7 +632,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotr_32_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -648,7 +648,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64ZBB-LABEL: rotr_32_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: neg a2, a1 +; RV64ZBB-NEXT: negw a2, a1 ; RV64ZBB-NEXT: srlw a1, a0, a1 ; RV64ZBB-NEXT: sllw a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -664,7 +664,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_32_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: negw a2, a1 ; RV64XTHEADBB-NEXT: srlw a1, a0, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -689,7 +689,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64I-LABEL: rotr_32_mask_and_63_and_31: ; RV64I: # %bb.0: ; RV64I-NEXT: srlw a2, a0, a1 -; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: negw a1, a1 ; RV64I-NEXT: sllw a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -705,7 +705,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotr_32_mask_and_63_and_31: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: srlw a2, a0, a1 -; RV64ZBB-NEXT: neg a1, a1 +; RV64ZBB-NEXT: negw a1, a1 ; RV64ZBB-NEXT: sllw a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -721,7 +721,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srlw a2, a0, a1 -; RV64XTHEADBB-NEXT: neg a1, a1 +; RV64XTHEADBB-NEXT: negw a1, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -829,7 +829,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotl_64_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -884,7 +884,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64ZBB-LABEL: rotl_64_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: neg a2, a1 +; RV64ZBB-NEXT: negw a2, a1 ; RV64ZBB-NEXT: sll a1, a0, a1 ; RV64ZBB-NEXT: srl a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -939,7 +939,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_64_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: negw a2, a1 ; RV64XTHEADBB-NEXT: sll a1, a0, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -1005,7 +1005,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64I-LABEL: rotl_64_mask_and_127_and_63: ; RV64I: # %bb.0: ; RV64I-NEXT: sll a2, a0, a1 -; 
RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: negw a1, a1 ; RV64I-NEXT: srl a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -1062,7 +1062,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64ZBB-LABEL: rotl_64_mask_and_127_and_63: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sll a2, a0, a1 -; RV64ZBB-NEXT: neg a1, a1 +; RV64ZBB-NEXT: negw a1, a1 ; RV64ZBB-NEXT: srl a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -1119,7 +1119,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sll a2, a0, a1 -; RV64XTHEADBB-NEXT: neg a1, a1 +; RV64XTHEADBB-NEXT: negw a1, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -1277,7 +1277,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotr_64_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -1331,7 +1331,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64ZBB-LABEL: rotr_64_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: neg a2, a1 +; RV64ZBB-NEXT: negw a2, a1 ; RV64ZBB-NEXT: srl a1, a0, a1 ; RV64ZBB-NEXT: sll a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -1385,7 +1385,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_64_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: neg a2, a1 +; RV64XTHEADBB-NEXT: negw a2, a1 ; RV64XTHEADBB-NEXT: srl a1, a0, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -1451,7 +1451,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64I-LABEL: rotr_64_mask_and_127_and_63: ; RV64I: # %bb.0: ; RV64I-NEXT: srl a2, a0, a1 -; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: negw a1, a1 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -1508,7 +1508,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64ZBB-LABEL: rotr_64_mask_and_127_and_63: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: srl a2, a0, a1 -; RV64ZBB-NEXT: neg a1, a1 +; RV64ZBB-NEXT: negw a1, a1 ; RV64ZBB-NEXT: sll a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -1565,7 +1565,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srl a2, a0, a1 -; RV64XTHEADBB-NEXT: neg a1, a1 +; RV64XTHEADBB-NEXT: negw a1, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -1701,7 +1701,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 31 ; RV64I-NEXT: sllw a4, a0, a2 -; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: negw a3, a3 ; RV64I-NEXT: srlw a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sllw a1, a1, a2 @@ -1737,7 +1737,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, a2, 31 ; RV64XTHEADBB-NEXT: sllw a4, a0, a2 -; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: negw a3, a3 ; RV64XTHEADBB-NEXT: srlw a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sllw a1, a1, a2 @@ -1822,7 +1822,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: sll a4, a0, a2 -; RV64I-NEXT: 
neg a3, a3 +; RV64I-NEXT: negw a3, a3 ; RV64I-NEXT: srl a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sll a1, a1, a2 @@ -1972,7 +1972,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: sll a4, a0, a2 -; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: negw a3, a3 ; RV64XTHEADBB-NEXT: srl a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sll a1, a1, a2 @@ -2002,7 +2002,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 31 ; RV64I-NEXT: srlw a4, a0, a2 -; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: negw a3, a3 ; RV64I-NEXT: sllw a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sllw a1, a1, a2 @@ -2038,7 +2038,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, a2, 31 ; RV64XTHEADBB-NEXT: srlw a4, a0, a2 -; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: negw a3, a3 ; RV64XTHEADBB-NEXT: sllw a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sllw a1, a1, a2 @@ -2125,7 +2125,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: srl a4, a0, a2 -; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: negw a3, a3 ; RV64I-NEXT: sll a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sll a1, a1, a2 @@ -2279,7 +2279,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: srl a4, a0, a2 -; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: negw a3, a3 ; RV64XTHEADBB-NEXT: sll a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sll a1, a1, a2 @@ -2312,8 +2312,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64I-NEXT: andi a3, a2, 31 ; RV64I-NEXT: sllw a4, a0, a2 ; RV64I-NEXT: sllw a2, a1, a2 -; RV64I-NEXT: neg a5, a3 -; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: negw a5, a3 +; RV64I-NEXT: negw a3, a3 ; RV64I-NEXT: srlw a0, a0, a5 ; RV64I-NEXT: srlw a1, a1, a3 ; RV64I-NEXT: or a0, a4, a0 @@ -2353,8 +2353,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64XTHEADBB-NEXT: andi a3, a2, 31 ; RV64XTHEADBB-NEXT: sllw a4, a0, a2 ; RV64XTHEADBB-NEXT: sllw a2, a1, a2 -; RV64XTHEADBB-NEXT: neg a5, a3 -; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: negw a5, a3 +; RV64XTHEADBB-NEXT: negw a3, a3 ; RV64XTHEADBB-NEXT: srlw a0, a0, a5 ; RV64XTHEADBB-NEXT: srlw a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -2464,7 +2464,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: sll a4, a0, a2 ; RV64I-NEXT: sll a2, a1, a2 -; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: negw a3, a3 ; RV64I-NEXT: srl a0, a0, a3 ; RV64I-NEXT: srl a1, a1, a3 ; RV64I-NEXT: or a0, a4, a0 @@ -2664,7 +2664,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: sll a4, a0, a2 ; RV64XTHEADBB-NEXT: sll a2, a1, a2 -; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: negw a3, a3 ; RV64XTHEADBB-NEXT: srl a0, a0, a3 ; RV64XTHEADBB-NEXT: srl a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -2697,8 +2697,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64I-NEXT: andi 
a3, a2, 31 ; RV64I-NEXT: srlw a4, a0, a2 ; RV64I-NEXT: srlw a2, a1, a2 -; RV64I-NEXT: neg a5, a3 -; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: negw a5, a3 +; RV64I-NEXT: negw a3, a3 ; RV64I-NEXT: sllw a0, a0, a5 ; RV64I-NEXT: sllw a1, a1, a3 ; RV64I-NEXT: or a0, a4, a0 @@ -2738,8 +2738,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64XTHEADBB-NEXT: andi a3, a2, 31 ; RV64XTHEADBB-NEXT: srlw a4, a0, a2 ; RV64XTHEADBB-NEXT: srlw a2, a1, a2 -; RV64XTHEADBB-NEXT: neg a5, a3 -; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: negw a5, a3 +; RV64XTHEADBB-NEXT: negw a3, a3 ; RV64XTHEADBB-NEXT: sllw a0, a0, a5 ; RV64XTHEADBB-NEXT: sllw a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -2850,7 +2850,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: srl a4, a0, a2 ; RV64I-NEXT: srl a2, a1, a2 -; RV64I-NEXT: neg a3, a3 +; RV64I-NEXT: negw a3, a3 ; RV64I-NEXT: sll a0, a0, a3 ; RV64I-NEXT: sll a1, a1, a3 ; RV64I-NEXT: or a0, a4, a0 @@ -3052,7 +3052,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: srl a4, a0, a2 ; RV64XTHEADBB-NEXT: srl a2, a1, a2 -; RV64XTHEADBB-NEXT: neg a3, a3 +; RV64XTHEADBB-NEXT: negw a3, a3 ; RV64XTHEADBB-NEXT: sll a0, a0, a3 ; RV64XTHEADBB-NEXT: sll a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -3116,7 +3116,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV64I-LABEL: rotl_64_zext: ; RV64I: # %bb.0: ; RV64I-NEXT: li a2, 64 -; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: subw a2, a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -3171,7 +3171,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotl_64_zext: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a2, 64 -; RV64ZBB-NEXT: sub a2, a2, a1 +; RV64ZBB-NEXT: subw a2, a2, a1 ; RV64ZBB-NEXT: sll a1, a0, a1 ; RV64ZBB-NEXT: srl a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -3226,7 +3226,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_64_zext: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: li a2, 64 -; RV64XTHEADBB-NEXT: sub a2, a2, a1 +; RV64XTHEADBB-NEXT: subw a2, a2, a1 ; RV64XTHEADBB-NEXT: sll a1, a0, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -3289,7 +3289,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV64I-LABEL: rotr_64_zext: ; RV64I: # %bb.0: ; RV64I-NEXT: li a2, 64 -; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: subw a2, a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -3343,7 +3343,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotr_64_zext: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a2, 64 -; RV64ZBB-NEXT: sub a2, a2, a1 +; RV64ZBB-NEXT: subw a2, a2, a1 ; RV64ZBB-NEXT: srl a1, a0, a1 ; RV64ZBB-NEXT: sll a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -3397,7 +3397,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_64_zext: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: li a2, 64 -; RV64XTHEADBB-NEXT: sub a2, a2, a1 +; RV64XTHEADBB-NEXT: subw a2, a2, a1 ; RV64XTHEADBB-NEXT: srl a1, a0, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll index a29219bfde06b..79d08772e8853 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll +++ 
b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll @@ -107,7 +107,7 @@ declare i32 @llvm.fshl.i32(i32, i32, i32) define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: rol_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -125,7 +125,7 @@ define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind { define void @rol_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind { ; RV64I-LABEL: rol_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a3, a1 +; RV64I-NEXT: negw a3, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a3 ; RV64I-NEXT: or a0, a1, a0 @@ -146,7 +146,7 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind { ; RV64I-LABEL: rol_i32_neg_constant_rhs: ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, -2 -; RV64I-NEXT: neg a2, a0 +; RV64I-NEXT: negw a2, a0 ; RV64I-NEXT: sllw a0, a1, a0 ; RV64I-NEXT: srlw a1, a1, a2 ; RV64I-NEXT: or a0, a0, a1 @@ -166,7 +166,7 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: rol_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -185,7 +185,7 @@ declare i32 @llvm.fshr.i32(i32, i32, i32) define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: ror_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -203,7 +203,7 @@ define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind { define void @ror_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind { ; RV64I-LABEL: ror_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a3, a1 +; RV64I-NEXT: negw a3, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a3 ; RV64I-NEXT: or a0, a1, a0 @@ -224,7 +224,7 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind { ; RV64I-LABEL: ror_i32_neg_constant_rhs: ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, -2 -; RV64I-NEXT: neg a2, a0 +; RV64I-NEXT: negw a2, a0 ; RV64I-NEXT: srlw a0, a1, a0 ; RV64I-NEXT: sllw a1, a1, a2 ; RV64I-NEXT: or a0, a0, a1 @@ -244,7 +244,7 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: ror_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll index 9df319e73a11a..9a6c718703a27 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll @@ -31,13 +31,13 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 @@ -88,13 +88,13 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 
209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 @@ -103,7 +103,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 ; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: subw a0, a1, a0 ; RV64I-NEXT: j .LBB1_3 ; RV64I-NEXT: .LBB1_2: ; RV64I-NEXT: li a0, 32 @@ -153,13 +153,13 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 @@ -168,7 +168,7 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 ; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: sub a1, a1, a0 +; RV64I-NEXT: subw a1, a1, a0 ; RV64I-NEXT: .LBB2_2: # %cond.end ; RV64I-NEXT: subw a0, s0, a1 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -212,13 +212,13 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 @@ -283,13 +283,13 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 @@ -412,13 +412,13 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 @@ -455,13 +455,13 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; 
RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 @@ -497,13 +497,13 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 @@ -553,13 +553,13 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 @@ -672,13 +672,13 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 @@ -709,13 +709,13 @@ define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 @@ -750,13 +750,13 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sraiw a1, a0, 4 ; RV64I-NEXT: addw a0, a1, a0 ; RV64I-NEXT: lui a1, 4112 ; RV64I-NEXT: addiw a2, a2, -241 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll index bf430c618afca..558424b53be95 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll @@ -16,9 +16,7 @@ define signext i32 @pack_i32(i32 signext %a, i32 signext %b) nounwind { ; ; RV64ZBKB-LABEL: pack_i32: ; RV64ZBKB: # %bb.0: -; RV64ZBKB-NEXT: zext.h a0, a0 -; RV64ZBKB-NEXT: slliw a1, a1, 16 -; 
RV64ZBKB-NEXT: or a0, a1, a0 +; RV64ZBKB-NEXT: packw a0, a0, a1 ; RV64ZBKB-NEXT: ret %shl = and i32 %a, 65535 %shl1 = shl i32 %b, 16 @@ -37,9 +35,7 @@ define signext i32 @pack_i32_2(i16 zeroext %a, i16 zeroext %b) nounwind { ; ; RV64ZBKB-LABEL: pack_i32_2: ; RV64ZBKB: # %bb.0: -; RV64ZBKB-NEXT: slli a1, a1, 16 -; RV64ZBKB-NEXT: or a0, a1, a0 -; RV64ZBKB-NEXT: sext.w a0, a0 +; RV64ZBKB-NEXT: packw a0, a0, a1 ; RV64ZBKB-NEXT: ret %zexta = zext i16 %a to i32 %zextb = zext i16 %b to i32 @@ -60,8 +56,7 @@ define signext i32 @pack_i32_3(i16 zeroext %0, i16 zeroext %1, i32 signext %2) { ; ; RV64ZBKB-LABEL: pack_i32_3: ; RV64ZBKB: # %bb.0: -; RV64ZBKB-NEXT: slli a0, a0, 16 -; RV64ZBKB-NEXT: or a0, a0, a1 +; RV64ZBKB-NEXT: packw a0, a1, a0 ; RV64ZBKB-NEXT: addw a0, a0, a2 ; RV64ZBKB-NEXT: ret %4 = zext i16 %0 to i32 @@ -343,9 +338,7 @@ define signext i32 @pack_i32_allWUsers(i16 zeroext %0, i16 zeroext %1, i16 zeroe ; RV64ZBKB: # %bb.0: ; RV64ZBKB-NEXT: add a0, a1, a0 ; RV64ZBKB-NEXT: zext.h a0, a0 -; RV64ZBKB-NEXT: slli a0, a0, 16 -; RV64ZBKB-NEXT: or a0, a0, a2 -; RV64ZBKB-NEXT: sext.w a0, a0 +; RV64ZBKB-NEXT: packw a0, a2, a0 ; RV64ZBKB-NEXT: ret %4 = add i16 %1, %0 %5 = zext i16 %4 to i32 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll index 71a5ecc77a1b0..8b262db56ccd2 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll @@ -330,7 +330,7 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: li a3, 64 ; RV64I-NEXT: bltu a2, a3, .LBB6_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a4, a2, a3 +; RV64I-NEXT: subw a4, a2, a3 ; RV64I-NEXT: srl a4, a1, a4 ; RV64I-NEXT: bnez a2, .LBB6_3 ; RV64I-NEXT: j .LBB6_4 @@ -476,7 +476,7 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: li a3, 64 ; RV64I-NEXT: bltu a2, a3, .LBB7_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a4, a2, a3 +; RV64I-NEXT: subw a4, a2, a3 ; RV64I-NEXT: sra a4, a1, a4 ; RV64I-NEXT: bnez a2, .LBB7_3 ; RV64I-NEXT: j .LBB7_4 @@ -615,7 +615,7 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: bltu a2, a4, .LBB8_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: li a0, 0 -; RV64I-NEXT: sub a4, a2, a4 +; RV64I-NEXT: subw a4, a2, a4 ; RV64I-NEXT: sll a3, a3, a4 ; RV64I-NEXT: bnez a2, .LBB8_3 ; RV64I-NEXT: j .LBB8_4 @@ -685,7 +685,7 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind { ; ; RV64I-LABEL: fshr64_minsize: ; RV64I: # %bb.0: -; RV64I-NEXT: neg a2, a1 +; RV64I-NEXT: negw a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -914,7 +914,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV64I-NEXT: li a4, 64 ; RV64I-NEXT: bltu a5, a4, .LBB10_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a3, a5, a4 +; RV64I-NEXT: subw a3, a5, a4 ; RV64I-NEXT: srl a6, a1, a3 ; RV64I-NEXT: j .LBB10_3 ; RV64I-NEXT: .LBB10_2: @@ -928,7 +928,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV64I-NEXT: # %bb.4: ; RV64I-NEXT: mv a3, a6 ; RV64I-NEXT: .LBB10_5: -; RV64I-NEXT: neg a7, a2 +; RV64I-NEXT: negw a7, a2 ; RV64I-NEXT: bltu a5, a4, .LBB10_7 ; RV64I-NEXT: # %bb.6: ; RV64I-NEXT: li a2, 0 @@ -940,7 +940,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV64I-NEXT: bltu a6, a4, .LBB10_10 ; RV64I-NEXT: # %bb.9: ; RV64I-NEXT: li a5, 0 -; RV64I-NEXT: sub a4, a6, a4 +; RV64I-NEXT: subw a4, a6, a4 ; RV64I-NEXT: sll a0, a0, a4 ; RV64I-NEXT: bnez a6, .LBB10_11 ; RV64I-NEXT: j .LBB10_12 diff --git 
a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll index 4ede693242898..bc002fee4417c 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -758,7 +758,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a3, a6, a7 ; RV64I-NEXT: bltu a1, a4, .LBB6_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a5, a1, a4 +; RV64I-NEXT: subw a5, a1, a4 ; RV64I-NEXT: srl a5, a3, a5 ; RV64I-NEXT: bnez a1, .LBB6_3 ; RV64I-NEXT: j .LBB6_4 @@ -1091,7 +1091,7 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: or a3, a6, a7 ; RV64I-NEXT: bltu a1, a4, .LBB7_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a5, a1, a4 +; RV64I-NEXT: subw a5, a1, a4 ; RV64I-NEXT: srl a5, a3, a5 ; RV64I-NEXT: bnez a1, .LBB7_3 ; RV64I-NEXT: j .LBB7_4 @@ -1425,7 +1425,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: bltu a3, a5, .LBB8_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: li a1, 0 -; RV64I-NEXT: sub a5, a3, a5 +; RV64I-NEXT: subw a5, a3, a5 ; RV64I-NEXT: sll a4, a4, a5 ; RV64I-NEXT: bnez a3, .LBB8_3 ; RV64I-NEXT: j .LBB8_4 @@ -1754,7 +1754,7 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: bltu a3, a5, .LBB9_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: li a1, 0 -; RV64I-NEXT: sub a5, a3, a5 +; RV64I-NEXT: subw a5, a3, a5 ; RV64I-NEXT: sll a4, a4, a5 ; RV64I-NEXT: bnez a3, .LBB9_3 ; RV64I-NEXT: j .LBB9_4 @@ -2083,7 +2083,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a3, a6, a7 ; RV64I-NEXT: bltu a1, a4, .LBB10_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a5, a1, a4 +; RV64I-NEXT: subw a5, a1, a4 ; RV64I-NEXT: sra a5, a3, a5 ; RV64I-NEXT: bnez a1, .LBB10_3 ; RV64I-NEXT: j .LBB10_4 @@ -2416,7 +2416,7 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: or a3, a6, a7 ; RV64I-NEXT: bltu a1, a4, .LBB11_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a5, a1, a4 +; RV64I-NEXT: subw a5, a1, a4 ; RV64I-NEXT: sra a5, a3, a5 ; RV64I-NEXT: bnez a1, .LBB11_3 ; RV64I-NEXT: j .LBB11_4 @@ -2796,7 +2796,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or t0, t5, t3 ; RV64I-NEXT: or a5, s0, t6 ; RV64I-NEXT: slli a5, a5, 3 -; RV64I-NEXT: sub t1, a5, a7 +; RV64I-NEXT: subw t1, a5, a7 ; RV64I-NEXT: negw t5, a5 ; RV64I-NEXT: sll t3, t0, t5 ; RV64I-NEXT: bltu a5, a7, .LBB12_2 @@ -2851,7 +2851,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: bltu t6, a7, .LBB12_12 ; RV64I-NEXT: .LBB12_14: ; RV64I-NEXT: li t5, 0 -; RV64I-NEXT: sub t3, t6, a7 +; RV64I-NEXT: subw t3, t6, a7 ; RV64I-NEXT: sll s1, a6, t3 ; RV64I-NEXT: .LBB12_15: ; RV64I-NEXT: sub s0, a5, t1 @@ -2862,7 +2862,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: .LBB12_17: ; RV64I-NEXT: bltu s0, a7, .LBB12_19 ; RV64I-NEXT: # %bb.18: -; RV64I-NEXT: sub t6, s0, a7 +; RV64I-NEXT: subw t6, s0, a7 ; RV64I-NEXT: srl t6, t0, t6 ; RV64I-NEXT: bnez s0, .LBB12_20 ; RV64I-NEXT: j .LBB12_21 @@ -3720,7 +3720,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: or t0, t5, t3 ; RV64I-NEXT: or a5, s0, 
t6 ; RV64I-NEXT: slli a5, a5, 5 -; RV64I-NEXT: sub t1, a5, a7 +; RV64I-NEXT: subw t1, a5, a7 ; RV64I-NEXT: negw t5, a5 ; RV64I-NEXT: sll t3, t0, t5 ; RV64I-NEXT: bltu a5, a7, .LBB13_2 @@ -3775,7 +3775,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: bltu t6, a7, .LBB13_12 ; RV64I-NEXT: .LBB13_14: ; RV64I-NEXT: li t5, 0 -; RV64I-NEXT: sub t3, t6, a7 +; RV64I-NEXT: subw t3, t6, a7 ; RV64I-NEXT: sll s1, a6, t3 ; RV64I-NEXT: .LBB13_15: ; RV64I-NEXT: sub s0, a5, t1 @@ -3786,7 +3786,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: .LBB13_17: ; RV64I-NEXT: bltu s0, a7, .LBB13_19 ; RV64I-NEXT: # %bb.18: -; RV64I-NEXT: sub t6, s0, a7 +; RV64I-NEXT: subw t6, s0, a7 ; RV64I-NEXT: srl t6, t0, t6 ; RV64I-NEXT: bnez s0, .LBB13_20 ; RV64I-NEXT: j .LBB13_21 @@ -4644,7 +4644,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: or t0, t5, t3 ; RV64I-NEXT: or a5, s0, t6 ; RV64I-NEXT: slli a5, a5, 6 -; RV64I-NEXT: sub t1, a5, a7 +; RV64I-NEXT: subw t1, a5, a7 ; RV64I-NEXT: negw t5, a5 ; RV64I-NEXT: sll t3, t0, t5 ; RV64I-NEXT: bltu a5, a7, .LBB14_2 @@ -4699,7 +4699,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: bltu t6, a7, .LBB14_12 ; RV64I-NEXT: .LBB14_14: ; RV64I-NEXT: li t5, 0 -; RV64I-NEXT: sub t3, t6, a7 +; RV64I-NEXT: subw t3, t6, a7 ; RV64I-NEXT: sll s1, a6, t3 ; RV64I-NEXT: .LBB14_15: ; RV64I-NEXT: sub s0, a5, t1 @@ -4710,7 +4710,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: .LBB14_17: ; RV64I-NEXT: bltu s0, a7, .LBB14_19 ; RV64I-NEXT: # %bb.18: -; RV64I-NEXT: sub t6, s0, a7 +; RV64I-NEXT: subw t6, s0, a7 ; RV64I-NEXT: srl t6, t0, t6 ; RV64I-NEXT: bnez s0, .LBB14_20 ; RV64I-NEXT: j .LBB14_21 @@ -5542,7 +5542,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a5, s0, a6 ; RV64I-NEXT: or a6, a1, s5 ; RV64I-NEXT: slli a6, a6, 3 -; RV64I-NEXT: sub t2, a6, t0 +; RV64I-NEXT: subw t2, a6, t0 ; RV64I-NEXT: negw t3, a6 ; RV64I-NEXT: srl s0, t1, t3 ; RV64I-NEXT: bltu a6, t0, .LBB15_2 @@ -5585,7 +5585,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s4, s9, 16 ; RV64I-NEXT: bltu a4, t0, .LBB15_7 ; RV64I-NEXT: # %bb.6: -; RV64I-NEXT: sub s0, a4, t0 +; RV64I-NEXT: subw s0, a4, t0 ; RV64I-NEXT: srl s0, a5, s0 ; RV64I-NEXT: j .LBB15_8 ; RV64I-NEXT: .LBB15_7: @@ -5637,7 +5637,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: bltu s0, t0, .LBB15_20 ; RV64I-NEXT: # %bb.19: ; RV64I-NEXT: li t2, 0 -; RV64I-NEXT: sub t0, s0, t0 +; RV64I-NEXT: subw t0, s0, t0 ; RV64I-NEXT: sll t0, t1, t0 ; RV64I-NEXT: bnez s0, .LBB15_21 ; RV64I-NEXT: j .LBB15_22 @@ -6456,7 +6456,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: or a5, s0, a6 ; RV64I-NEXT: or a6, a1, s5 ; RV64I-NEXT: slli a6, a6, 5 -; RV64I-NEXT: sub t2, a6, t0 +; RV64I-NEXT: subw t2, a6, t0 ; RV64I-NEXT: negw t3, a6 ; RV64I-NEXT: srl s0, t1, t3 ; RV64I-NEXT: bltu a6, t0, .LBB16_2 @@ -6499,7 +6499,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: slli s4, s9, 16 ; RV64I-NEXT: bltu a4, t0, .LBB16_7 ; RV64I-NEXT: # %bb.6: -; RV64I-NEXT: sub s0, a4, t0 +; RV64I-NEXT: subw s0, a4, t0 ; RV64I-NEXT: srl s0, a5, s0 ; RV64I-NEXT: j .LBB16_8 ; RV64I-NEXT: .LBB16_7: @@ -6551,7 +6551,7 @@ 
define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: bltu s0, t0, .LBB16_20 ; RV64I-NEXT: # %bb.19: ; RV64I-NEXT: li t2, 0 -; RV64I-NEXT: sub t0, s0, t0 +; RV64I-NEXT: subw t0, s0, t0 ; RV64I-NEXT: sll t0, t1, t0 ; RV64I-NEXT: bnez s0, .LBB16_21 ; RV64I-NEXT: j .LBB16_22 @@ -7370,7 +7370,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: or a5, s0, a6 ; RV64I-NEXT: or a6, a1, s5 ; RV64I-NEXT: slli a6, a6, 6 -; RV64I-NEXT: sub t2, a6, t0 +; RV64I-NEXT: subw t2, a6, t0 ; RV64I-NEXT: negw t3, a6 ; RV64I-NEXT: srl s0, t1, t3 ; RV64I-NEXT: bltu a6, t0, .LBB17_2 @@ -7413,7 +7413,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: slli s4, s9, 16 ; RV64I-NEXT: bltu a4, t0, .LBB17_7 ; RV64I-NEXT: # %bb.6: -; RV64I-NEXT: sub s0, a4, t0 +; RV64I-NEXT: subw s0, a4, t0 ; RV64I-NEXT: srl s0, a5, s0 ; RV64I-NEXT: j .LBB17_8 ; RV64I-NEXT: .LBB17_7: @@ -7465,7 +7465,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: bltu s0, t0, .LBB17_20 ; RV64I-NEXT: # %bb.19: ; RV64I-NEXT: li t2, 0 -; RV64I-NEXT: sub t0, s0, t0 +; RV64I-NEXT: subw t0, s0, t0 ; RV64I-NEXT: sll t0, t1, t0 ; RV64I-NEXT: bnez s0, .LBB17_21 ; RV64I-NEXT: j .LBB17_22 @@ -8310,7 +8310,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a5, t5, t4 ; RV64I-NEXT: or a6, s0, t6 ; RV64I-NEXT: slli a6, a6, 3 -; RV64I-NEXT: sub t1, a6, t0 +; RV64I-NEXT: subw t1, a6, t0 ; RV64I-NEXT: negw t5, a6 ; RV64I-NEXT: sll t4, a5, t5 ; RV64I-NEXT: bltu a6, t0, .LBB18_2 @@ -8365,7 +8365,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: bltu t6, t0, .LBB18_12 ; RV64I-NEXT: .LBB18_14: ; RV64I-NEXT: li t5, 0 -; RV64I-NEXT: sub t4, t6, t0 +; RV64I-NEXT: subw t4, t6, t0 ; RV64I-NEXT: sll s1, a7, t4 ; RV64I-NEXT: .LBB18_15: ; RV64I-NEXT: sub s0, a6, t1 @@ -8376,7 +8376,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: .LBB18_17: ; RV64I-NEXT: bltu s0, t0, .LBB18_19 ; RV64I-NEXT: # %bb.18: -; RV64I-NEXT: sub t6, s0, t0 +; RV64I-NEXT: subw t6, s0, t0 ; RV64I-NEXT: sra t6, a5, t6 ; RV64I-NEXT: bnez s0, .LBB18_20 ; RV64I-NEXT: j .LBB18_21 @@ -9241,7 +9241,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: or a5, t5, t4 ; RV64I-NEXT: or a6, s0, t6 ; RV64I-NEXT: slli a6, a6, 5 -; RV64I-NEXT: sub t1, a6, t0 +; RV64I-NEXT: subw t1, a6, t0 ; RV64I-NEXT: negw t5, a6 ; RV64I-NEXT: sll t4, a5, t5 ; RV64I-NEXT: bltu a6, t0, .LBB19_2 @@ -9296,7 +9296,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: bltu t6, t0, .LBB19_12 ; RV64I-NEXT: .LBB19_14: ; RV64I-NEXT: li t5, 0 -; RV64I-NEXT: sub t4, t6, t0 +; RV64I-NEXT: subw t4, t6, t0 ; RV64I-NEXT: sll s1, a7, t4 ; RV64I-NEXT: .LBB19_15: ; RV64I-NEXT: sub s0, a6, t1 @@ -9307,7 +9307,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: .LBB19_17: ; RV64I-NEXT: bltu s0, t0, .LBB19_19 ; RV64I-NEXT: # %bb.18: -; RV64I-NEXT: sub t6, s0, t0 +; RV64I-NEXT: subw t6, s0, t0 ; RV64I-NEXT: sra t6, a5, t6 ; RV64I-NEXT: bnez s0, .LBB19_20 ; RV64I-NEXT: j .LBB19_21 @@ -10172,7 +10172,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: or a5, t5, t4 ; RV64I-NEXT: or a6, s0, t6 ; RV64I-NEXT: slli a6, a6, 6 -; RV64I-NEXT: sub t1, a6, t0 
+; RV64I-NEXT: subw t1, a6, t0 ; RV64I-NEXT: negw t5, a6 ; RV64I-NEXT: sll t4, a5, t5 ; RV64I-NEXT: bltu a6, t0, .LBB20_2 @@ -10227,7 +10227,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: bltu t6, t0, .LBB20_12 ; RV64I-NEXT: .LBB20_14: ; RV64I-NEXT: li t5, 0 -; RV64I-NEXT: sub t4, t6, t0 +; RV64I-NEXT: subw t4, t6, t0 ; RV64I-NEXT: sll s1, a7, t4 ; RV64I-NEXT: .LBB20_15: ; RV64I-NEXT: sub s0, a6, t1 @@ -10238,7 +10238,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: .LBB20_17: ; RV64I-NEXT: bltu s0, t0, .LBB20_19 ; RV64I-NEXT: # %bb.18: -; RV64I-NEXT: sub t6, s0, t0 +; RV64I-NEXT: subw t6, s0, t0 ; RV64I-NEXT: sra t6, a5, t6 ; RV64I-NEXT: bnez s0, .LBB20_20 ; RV64I-NEXT: j .LBB20_21
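For reviewers, a minimal sketch of the kind of case the new predicate unlocks (a hypothetical test, not part of this series; the function name and RUN line are invented, mirroring the tests above):

; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s
define i32 @sub_then_lshr(i32 %a, i32 %b) {
  ; On RV64 the i32 sub widens to an i64 G_SUB whose only user, the shift
  ; below, demands none of the bits above bit 31. hasAllWUsers therefore
  ; succeeds and binop_allwusers can select subw; by analogy with the
  ; udiv_constant_add change in div-by-constant.ll above, one would expect:
  ;   subw a0, a0, a1
  ;   srliw a0, a0, 1
  %d = sub i32 %a, %b
  %s = lshr i32 %d, 1
  ret i32 %s
}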