From f16c67737bcfd6beb852422e835af6df2f35ad34 Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Mon, 24 Mar 2025 22:54:41 +0000 Subject: [PATCH] [SDAG][tests] add some test cases covering an add-based rotate --- llvm/test/CodeGen/AMDGPU/rotate-add.ll | 328 ++++++++++++++++++++++ llvm/test/CodeGen/ARM/rotate-add.ll | 203 ++++++++++++++ llvm/test/CodeGen/NVPTX/rotate-add.ll | 259 ++++++++++++++++++ llvm/test/CodeGen/X86/rotate-add.ll | 358 +++++++++++++++++++++++++ 4 files changed, 1148 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/rotate-add.ll create mode 100644 llvm/test/CodeGen/ARM/rotate-add.ll create mode 100644 llvm/test/CodeGen/NVPTX/rotate-add.ll create mode 100644 llvm/test/CodeGen/X86/rotate-add.ll diff --git a/llvm/test/CodeGen/AMDGPU/rotate-add.ll b/llvm/test/CodeGen/AMDGPU/rotate-add.ll new file mode 100644 index 0000000000000..faf89f41bdf86 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rotate-add.ll @@ -0,0 +1,328 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s + +target triple = "nvptx64-nvidia-cuda" + +define i32 @test_simple_rotl(i32 %x) { +; SI-LABEL: test_simple_rotl: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 25 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_simple_rotl: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_alignbit_b32 v0, v0, v0, 25 +; VI-NEXT: s_setpc_b64 s[30:31] + %shl = shl i32 %x, 7 + %shr = lshr i32 %x, 25 + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_simple_rotr(i32 %x) { +; SI-LABEL: test_simple_rotr: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_simple_rotr: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_alignbit_b32 v0, v0, v0, 7 +; VI-NEXT: s_setpc_b64 s[30:31] + %shr = lshr i32 %x, 7 + %shl = shl i32 %x, 25 + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_rotl_var(i32 %x, i32 %y) { +; SI-LABEL: test_rotl_var: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, v1, v0 +; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_rotl_var: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, v1, v0 +; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1 +; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %shl = shl i32 %x, %y + %sub = sub i32 32, %y + %shr = lshr i32 %x, %sub + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_rotr_var(i32 %x, i32 %y) { +; SI-LABEL: test_rotr_var: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, v1, v0 +; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_rotr_var: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: 
v_lshrrev_b32_e32 v2, v1, v0 +; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %shr = lshr i32 %x, %y + %sub = sub i32 32, %y + %shl = shl i32 %x, %sub + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_invalid_rotl_var_and(i32 %x, i32 %y) { +; SI-LABEL: test_invalid_rotl_var_and: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, v1, v0 +; SI-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_invalid_rotl_var_and: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, v1, v0 +; VI-NEXT: v_sub_u32_e32 v1, vcc, 0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] + %shr = shl i32 %x, %y + %sub = sub nsw i32 0, %y + %and = and i32 %sub, 31 + %shl = lshr i32 %x, %and + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_invalid_rotr_var_and(i32 %x, i32 %y) { +; SI-LABEL: test_invalid_rotr_var_and: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, v1, v0 +; SI-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_invalid_rotr_var_and: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, v1, v0 +; VI-NEXT: v_sub_u32_e32 v1, vcc, 0, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %shr = lshr i32 %x, %y + %sub = sub nsw i32 0, %y + %and = and i32 %sub, 31 + %shl = shl i32 %x, %and + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) { +; SI-LABEL: test_fshl_special_case: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; SI-NEXT: v_xor_b32_e32 v2, 31, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fshl_special_case: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; VI-NEXT: v_xor_b32_e32 v2, 31, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] + %shl = shl i32 %x0, %y + %srli = lshr i32 %x1, 1 + %x = xor i32 %y, 31 + %srlo = lshr i32 %srli, %x + %o = add i32 %shl, %srlo + ret i32 %o +} + +define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) { +; SI-LABEL: test_fshr_special_case: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-NEXT: v_xor_b32_e32 v2, 31, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fshr_special_case: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: v_xor_b32_e32 v2, 31, v2 +; VI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 
v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %shl = lshr i32 %x1, %y + %srli = shl i32 %x0, 1 + %x = xor i32 %y, 31 + %srlo = shl i32 %srli, %x + %o = add i32 %shl, %srlo + ret i32 %o +} + +define i64 @test_rotl_udiv_special_case(i64 %i) { +; SI-LABEL: test_rotl_udiv_special_case: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0xaaaaaaaa +; SI-NEXT: s_mov_b32 s5, 0xaaaaaaab +; SI-NEXT: v_mul_hi_u32 v2, v0, s4 +; SI-NEXT: v_mul_lo_u32 v3, v0, s4 +; SI-NEXT: v_mul_hi_u32 v4, v1, s5 +; SI-NEXT: v_mul_lo_u32 v5, v1, s5 +; SI-NEXT: v_mul_hi_u32 v0, v0, s5 +; SI-NEXT: v_mul_hi_u32 v6, v1, s4 +; SI-NEXT: v_mul_lo_u32 v1, v1, s4 +; SI-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; SI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0 +; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 27, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xf0000000, v0 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_alignbit_b32 v0, v3, v2, 5 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_rotl_udiv_special_case: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab +; VI-NEXT: v_mul_hi_u32 v2, v0, s4 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_mov_b32 s6, 0xaaaaaaaa +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[2:3] +; VI-NEXT: v_mov_b32_e32 v2, v4 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] +; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v3 +; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, s6, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[2:3], 5, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 27, v0 +; VI-NEXT: v_alignbit_b32 v0, v1, v0, 5 +; VI-NEXT: v_and_b32_e32 v1, 0xf0000000, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v3 +; VI-NEXT: s_setpc_b64 s[30:31] + %lhs_div = udiv i64 %i, 3 + %rhs_div = udiv i64 %i, 48 + %lhs_shift = shl i64 %lhs_div, 60 + %out = add i64 %lhs_shift, %rhs_div + ret i64 %out +} + +define i32 @test_rotl_mul_special_case(i32 %i) { +; SI-LABEL: test_rotl_mul_special_case: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mul_lo_u32 v0, v0, 9 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 25 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_rotl_mul_special_case: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_lo_u32 v0, v0, 9 +; VI-NEXT: v_alignbit_b32 v0, v0, v0, 25 +; VI-NEXT: s_setpc_b64 s[30:31] + %lhs_mul = mul i32 %i, 9 + %rhs_mul = mul i32 %i, 1152 + %lhs_shift = lshr i32 %lhs_mul, 25 + %out = add i32 %lhs_shift, %rhs_mul + ret i32 %out +} + +define i64 @test_rotl_mul_with_mask_special_case(i64 %i) { +; SI-LABEL: test_rotl_mul_with_mask_special_case: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mul_lo_u32 v1, v1, 9 +; SI-NEXT: v_mul_hi_u32 v2, v0, 9 +; SI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 25 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_rotl_mul_with_mask_special_case: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_lo_u32 v1, v1, 9 +; VI-NEXT: v_mul_hi_u32 v2, v0, 9 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_alignbit_b32 v0, 
v0, v1, 25 +; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_setpc_b64 s[30:31] + %lhs_mul = mul i64 %i, 1152 + %rhs_mul = mul i64 %i, 9 + %lhs_and = and i64 %lhs_mul, 160 + %rhs_shift = lshr i64 %rhs_mul, 57 + %out = add i64 %lhs_and, %rhs_shift + ret i64 %out +} + +define i32 @test_fshl_with_mask_special_case(i32 %x) { +; SI-LABEL: test_fshl_with_mask_special_case: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, 1, v0 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 27 +; SI-NEXT: v_and_b32_e32 v0, 0xffffffe1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fshl_with_mask_special_case: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_or_b32_e32 v1, 1, v0 +; VI-NEXT: v_alignbit_b32 v0, v1, v0, 27 +; VI-NEXT: v_and_b32_e32 v0, 0xffffffe1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %or1 = or i32 %x, 1 + %sh1 = shl i32 %or1, 5 + %sh2 = lshr i32 %x, 27 + %1 = and i32 %sh2, 1 + %r = add i32 %sh1, %1 + ret i32 %r +} diff --git a/llvm/test/CodeGen/ARM/rotate-add.ll b/llvm/test/CodeGen/ARM/rotate-add.ll new file mode 100644 index 0000000000000..9325e8b062dda --- /dev/null +++ b/llvm/test/CodeGen/ARM/rotate-add.ll @@ -0,0 +1,203 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s + + +target triple = "nvptx64-nvidia-cuda" + +define i32 @test_simple_rotl(i32 %x) { +; CHECK-LABEL: test_simple_rotl: +; CHECK: @ %bb.0: +; CHECK-NEXT: ror r0, r0, #25 +; CHECK-NEXT: bx lr + %shl = shl i32 %x, 7 + %shr = lshr i32 %x, 25 + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_simple_rotr(i32 %x) { +; CHECK-LABEL: test_simple_rotr: +; CHECK: @ %bb.0: +; CHECK-NEXT: ror r0, r0, #7 +; CHECK-NEXT: bx lr + %shr = lshr i32 %x, 7 + %shl = shl i32 %x, 25 + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_rotl_var(i32 %x, i32 %y) { +; CHECK-LABEL: test_rotl_var: +; CHECK: @ %bb.0: +; CHECK-NEXT: lsl r2, r0, r1 +; CHECK-NEXT: rsb r1, r1, #32 +; CHECK-NEXT: add r0, r2, r0, lsr r1 +; CHECK-NEXT: bx lr + %shl = shl i32 %x, %y + %sub = sub i32 32, %y + %shr = lshr i32 %x, %sub + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_rotr_var(i32 %x, i32 %y) { +; CHECK-LABEL: test_rotr_var: +; CHECK: @ %bb.0: +; CHECK-NEXT: lsr r2, r0, r1 +; CHECK-NEXT: rsb r1, r1, #32 +; CHECK-NEXT: add r0, r2, r0, lsl r1 +; CHECK-NEXT: bx lr + %shr = lshr i32 %x, %y + %sub = sub i32 32, %y + %shl = shl i32 %x, %sub + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_invalid_rotl_var_and(i32 %x, i32 %y) { +; CHECK-LABEL: test_invalid_rotl_var_and: +; CHECK: @ %bb.0: +; CHECK-NEXT: rsb r2, r1, #0 +; CHECK-NEXT: and r2, r2, #31 +; CHECK-NEXT: lsr r2, r0, r2 +; CHECK-NEXT: add r0, r2, r0, lsl r1 +; CHECK-NEXT: bx lr + %shr = shl i32 %x, %y + %sub = sub nsw i32 0, %y + %and = and i32 %sub, 31 + %shl = lshr i32 %x, %and + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_invalid_rotr_var_and(i32 %x, i32 %y) { +; CHECK-LABEL: test_invalid_rotr_var_and: +; CHECK: @ %bb.0: +; CHECK-NEXT: lsr r2, r0, r1 +; CHECK-NEXT: rsb r1, r1, #0 +; CHECK-NEXT: and r1, r1, #31 +; CHECK-NEXT: add r0, r2, r0, lsl r1 +; CHECK-NEXT: bx lr + %shr = lshr i32 %x, %y + %sub = sub nsw i32 0, %y + %and = and i32 %sub, 31 + %shl = shl i32 %x, %and + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) { +; CHECK-LABEL: test_fshl_special_case: +; CHECK: 
@ %bb.0: +; CHECK-NEXT: lsl r0, r0, r2 +; CHECK-NEXT: eor r2, r2, #31 +; CHECK-NEXT: lsr r1, r1, #1 +; CHECK-NEXT: add r0, r0, r1, lsr r2 +; CHECK-NEXT: bx lr + %shl = shl i32 %x0, %y + %srli = lshr i32 %x1, 1 + %x = xor i32 %y, 31 + %srlo = lshr i32 %srli, %x + %o = add i32 %shl, %srlo + ret i32 %o +} + +define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) { +; CHECK-LABEL: test_fshr_special_case: +; CHECK: @ %bb.0: +; CHECK-NEXT: lsr r1, r1, r2 +; CHECK-NEXT: eor r2, r2, #31 +; CHECK-NEXT: lsl r0, r0, #1 +; CHECK-NEXT: add r0, r1, r0, lsl r2 +; CHECK-NEXT: bx lr + %shl = lshr i32 %x1, %y + %srli = shl i32 %x0, 1 + %x = xor i32 %y, 31 + %srlo = shl i32 %srli, %x + %o = add i32 %shl, %srlo + ret i32 %o +} + +define i64 @test_rotl_udiv_special_case(i64 %i) { +; CHECK-LABEL: test_rotl_udiv_special_case: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: adds r2, r0, r1 +; CHECK-NEXT: ldr r12, .LCPI8_0 +; CHECK-NEXT: adc lr, r2, #0 +; CHECK-NEXT: umull r3, r2, lr, r12 +; CHECK-NEXT: bic r3, r2, #1 +; CHECK-NEXT: add r2, r3, r2, lsr #1 +; CHECK-NEXT: ldr r3, .LCPI8_1 +; CHECK-NEXT: sub r2, lr, r2 +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: sbc r1, r1, #0 +; CHECK-NEXT: umull r2, lr, r0, r12 +; CHECK-NEXT: mla r0, r0, r3, lr +; CHECK-NEXT: mla r0, r1, r12, r0 +; CHECK-NEXT: lsl r1, r2, #28 +; CHECK-NEXT: orr r1, r1, r0, lsr #4 +; CHECK-NEXT: lsl r0, r0, #28 +; CHECK-NEXT: orr r0, r0, r2, lsr #4 +; CHECK-NEXT: pop {r11, pc} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI8_0: +; CHECK-NEXT: .long 2863311531 @ 0xaaaaaaab +; CHECK-NEXT: .LCPI8_1: +; CHECK-NEXT: .long 2863311530 @ 0xaaaaaaaa + %lhs_div = udiv i64 %i, 3 + %rhs_div = udiv i64 %i, 48 + %lhs_shift = shl i64 %lhs_div, 60 + %out = add i64 %lhs_shift, %rhs_div + ret i64 %out +} + +define i32 @test_rotl_mul_special_case(i32 %i) { +; CHECK-LABEL: test_rotl_mul_special_case: +; CHECK: @ %bb.0: +; CHECK-NEXT: add r0, r0, r0, lsl #3 +; CHECK-NEXT: ror r0, r0, #25 +; CHECK-NEXT: bx lr + %lhs_mul = mul i32 %i, 9 + %rhs_mul = mul i32 %i, 1152 + %lhs_shift = lshr i32 %lhs_mul, 25 + %out = add i32 %lhs_shift, %rhs_mul + ret i32 %out +} + +define i64 @test_rotl_mul_with_mask_special_case(i64 %i) { +; CHECK-LABEL: test_rotl_mul_with_mask_special_case: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r2, #9 +; CHECK-NEXT: add r1, r1, r1, lsl #3 +; CHECK-NEXT: umull r2, r3, r0, r2 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: lsl r0, r0, #7 +; CHECK-NEXT: add r1, r3, r1 +; CHECK-NEXT: orr r0, r0, r1, lsr #25 +; CHECK-NEXT: mov r1, #0 +; CHECK-NEXT: bx lr + %lhs_mul = mul i64 %i, 1152 + %rhs_mul = mul i64 %i, 9 + %lhs_and = and i64 %lhs_mul, 160 + %rhs_shift = lshr i64 %rhs_mul, 57 + %out = add i64 %lhs_and, %rhs_shift + ret i64 %out +} + +define i32 @test_fshl_with_mask_special_case(i32 %x) { +; CHECK-LABEL: test_fshl_with_mask_special_case: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #32 +; CHECK-NEXT: orr r0, r1, r0, ror #27 +; CHECK-NEXT: bic r0, r0, #30 +; CHECK-NEXT: bx lr + %or1 = or i32 %x, 1 + %sh1 = shl i32 %or1, 5 + %sh2 = lshr i32 %x, 27 + %1 = and i32 %sh2, 1 + %r = add i32 %sh1, %1 + ret i32 %r +} diff --git a/llvm/test/CodeGen/NVPTX/rotate-add.ll b/llvm/test/CodeGen/NVPTX/rotate-add.ll new file mode 100644 index 0000000000000..c79a95958eca2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/rotate-add.ll @@ -0,0 +1,259 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_50 | FileCheck %s +; RUN: %if ptxas %{ 
llc < %s -mcpu=sm_50 | %ptxas-verify %} + +target triple = "nvptx64-nvidia-cuda" + +define i32 @test_simple_rotl(i32 %x) { +; CHECK-LABEL: test_simple_rotl( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_simple_rotl_param_0]; +; CHECK-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %shl = shl i32 %x, 7 + %shr = lshr i32 %x, 25 + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_simple_rotr(i32 %x) { +; CHECK-LABEL: test_simple_rotr( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_simple_rotr_param_0]; +; CHECK-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 25; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %shr = lshr i32 %x, 7 + %shl = shl i32 %x, 25 + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_rotl_var(i32 %x, i32 %y) { +; CHECK-LABEL: test_rotl_var( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_var_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_rotl_var_param_1]; +; CHECK-NEXT: shl.b32 %r3, %r1, %r2; +; CHECK-NEXT: sub.s32 %r4, 32, %r2; +; CHECK-NEXT: shr.u32 %r5, %r1, %r4; +; CHECK-NEXT: add.s32 %r6, %r3, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %shl = shl i32 %x, %y + %sub = sub i32 32, %y + %shr = lshr i32 %x, %sub + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_rotr_var(i32 %x, i32 %y) { +; CHECK-LABEL: test_rotr_var( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_rotr_var_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_rotr_var_param_1]; +; CHECK-NEXT: shr.u32 %r3, %r1, %r2; +; CHECK-NEXT: sub.s32 %r4, 32, %r2; +; CHECK-NEXT: shl.b32 %r5, %r1, %r4; +; CHECK-NEXT: add.s32 %r6, %r3, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %shr = lshr i32 %x, %y + %sub = sub i32 32, %y + %shl = shl i32 %x, %sub + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_invalid_rotl_var_and(i32 %x, i32 %y) { +; CHECK-LABEL: test_invalid_rotl_var_and( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_invalid_rotl_var_and_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_invalid_rotl_var_and_param_1]; +; CHECK-NEXT: shl.b32 %r3, %r1, %r2; +; CHECK-NEXT: neg.s32 %r4, %r2; +; CHECK-NEXT: and.b32 %r5, %r4, 31; +; CHECK-NEXT: shr.u32 %r6, %r1, %r5; +; CHECK-NEXT: add.s32 %r7, %r6, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NEXT: ret; + %shr = shl i32 %x, %y + %sub = sub nsw i32 0, %y + %and = and i32 %sub, 31 + %shl = lshr i32 %x, %and + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_invalid_rotr_var_and(i32 %x, i32 %y) { +; CHECK-LABEL: test_invalid_rotr_var_and( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_invalid_rotr_var_and_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_invalid_rotr_var_and_param_1]; +; CHECK-NEXT: shr.u32 %r3, %r1, %r2; +; CHECK-NEXT: neg.s32 %r4, %r2; +; CHECK-NEXT: and.b32 %r5, %r4, 31; +; CHECK-NEXT: shl.b32 %r6, %r1, %r5; +; CHECK-NEXT: add.s32 %r7, %r3, %r6; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NEXT: ret; + %shr = lshr i32 %x, %y + %sub = sub nsw i32 0, %y + 
%and = and i32 %sub, 31 + %shl = shl i32 %x, %and + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) { +; CHECK-LABEL: test_fshl_special_case( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_fshl_special_case_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_fshl_special_case_param_2]; +; CHECK-NEXT: shl.b32 %r3, %r1, %r2; +; CHECK-NEXT: ld.param.u32 %r4, [test_fshl_special_case_param_1]; +; CHECK-NEXT: shr.u32 %r5, %r4, 1; +; CHECK-NEXT: xor.b32 %r6, %r2, 31; +; CHECK-NEXT: shr.u32 %r7, %r5, %r6; +; CHECK-NEXT: add.s32 %r8, %r3, %r7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; + %shl = shl i32 %x0, %y + %srli = lshr i32 %x1, 1 + %x = xor i32 %y, 31 + %srlo = lshr i32 %srli, %x + %o = add i32 %shl, %srlo + ret i32 %o +} + +define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) { +; CHECK-LABEL: test_fshr_special_case( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_fshr_special_case_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_fshr_special_case_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_fshr_special_case_param_2]; +; CHECK-NEXT: shr.u32 %r4, %r2, %r3; +; CHECK-NEXT: shl.b32 %r5, %r1, 1; +; CHECK-NEXT: xor.b32 %r6, %r3, 31; +; CHECK-NEXT: shl.b32 %r7, %r5, %r6; +; CHECK-NEXT: add.s32 %r8, %r4, %r7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; + %shl = lshr i32 %x1, %y + %srli = shl i32 %x0, 1 + %x = xor i32 %y, 31 + %srlo = shl i32 %srli, %x + %o = add i32 %shl, %srlo + ret i32 %o +} + +define i64 @test_rotl_udiv_special_case(i64 %i) { +; CHECK-LABEL: test_rotl_udiv_special_case( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [test_rotl_udiv_special_case_param_0]; +; CHECK-NEXT: mul.hi.u64 %rd2, %rd1, -6148914691236517205; +; CHECK-NEXT: shr.u64 %rd3, %rd2, 1; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd3; +; CHECK-NEXT: shf.l.wrap.b32 %r3, %r2, %r1, 28; +; CHECK-NEXT: shf.l.wrap.b32 %r4, %r1, %r2, 28; +; CHECK-NEXT: mov.b64 %rd4, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %lhs_div = udiv i64 %i, 3 + %rhs_div = udiv i64 %i, 48 + %lhs_shift = shl i64 %lhs_div, 60 + %out = add i64 %lhs_shift, %rhs_div + ret i64 %out +} + +define i32 @test_rotl_mul_special_case(i32 %i) { +; CHECK-LABEL: test_rotl_mul_special_case( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_mul_special_case_param_0]; +; CHECK-NEXT: mul.lo.s32 %r2, %r1, 9; +; CHECK-NEXT: shf.l.wrap.b32 %r3, %r2, %r2, 7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %lhs_mul = mul i32 %i, 9 + %rhs_mul = mul i32 %i, 1152 + %lhs_shift = lshr i32 %lhs_mul, 25 + %out = add i32 %lhs_shift, %rhs_mul + ret i32 %out +} + +define i64 @test_rotl_mul_with_mask_special_case(i64 %i) { +; CHECK-LABEL: test_rotl_mul_with_mask_special_case( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [test_rotl_mul_with_mask_special_case_param_0]; +; CHECK-NEXT: mul.lo.s64 %rd2, %rd1, 9; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: shf.l.wrap.b32 %r5, %r4, %r1, 7; +; CHECK-NEXT: shf.l.wrap.b32 %r6, %r1, 
%r2, 7; +; CHECK-NEXT: mov.b64 %rd3, {%r5, %r6}; +; CHECK-NEXT: and.b64 %rd4, %rd3, 255; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %lhs_mul = mul i64 %i, 1152 + %rhs_mul = mul i64 %i, 9 + %lhs_and = and i64 %lhs_mul, 160 + %rhs_shift = lshr i64 %rhs_mul, 57 + %out = add i64 %lhs_and, %rhs_shift + ret i64 %out +} + +define i32 @test_fshl_with_mask_special_case(i32 %x) { +; CHECK-LABEL: test_fshl_with_mask_special_case( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_fshl_with_mask_special_case_param_0]; +; CHECK-NEXT: or.b32 %r2, %r1, 1; +; CHECK-NEXT: shf.l.wrap.b32 %r3, %r1, %r2, 5; +; CHECK-NEXT: and.b32 %r4, %r3, -31; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %or1 = or i32 %x, 1 + %sh1 = shl i32 %or1, 5 + %sh2 = lshr i32 %x, 27 + %1 = and i32 %sh2, 1 + %r = add i32 %sh1, %1 + ret i32 %r +} diff --git a/llvm/test/CodeGen/X86/rotate-add.ll b/llvm/test/CodeGen/X86/rotate-add.ll new file mode 100644 index 0000000000000..6e19fc20abf04 --- /dev/null +++ b/llvm/test/CodeGen/X86/rotate-add.ll @@ -0,0 +1,358 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64 + +define i32 @test_simple_rotl(i32 %x) { +; X86-LABEL: test_simple_rotl: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: roll $7, %eax +; X86-NEXT: retl +; +; X64-LABEL: test_simple_rotl: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: roll $7, %eax +; X64-NEXT: retq + %shl = shl i32 %x, 7 + %shr = lshr i32 %x, 25 + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_simple_rotr(i32 %x) { +; X86-LABEL: test_simple_rotr: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: roll $25, %eax +; X86-NEXT: retl +; +; X64-LABEL: test_simple_rotr: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: roll $25, %eax +; X64-NEXT: retq + %shr = lshr i32 %x, 7 + %shl = shl i32 %x, 25 + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_rotl_var(i32 %x, i32 %y) { +; X86-LABEL: test_rotl_var: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: negb %cl +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: test_rotl_var: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll %cl, %eax +; X64-NEXT: negb %cl +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shrl %cl, %edi +; X64-NEXT: addl %edi, %eax +; X64-NEXT: retq + %shl = shl i32 %x, %y + %sub = sub i32 32, %y + %shr = lshr i32 %x, %sub + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_rotr_var(i32 %x, i32 %y) { +; X86-LABEL: test_rotr_var: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: negb %cl +; X86-NEXT: shll %cl, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: test_rotr_var: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl %cl, %eax +; X64-NEXT: negb %cl +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shll %cl, %edi +; X64-NEXT: addl %edi, %eax +; X64-NEXT: retq + 
%shr = lshr i32 %x, %y + %sub = sub i32 32, %y + %shl = shl i32 %x, %sub + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_invalid_rotl_var_and(i32 %x, i32 %y) { +; X86-LABEL: test_invalid_rotl_var_and: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: negb %cl +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: test_invalid_rotl_var_and: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll %cl, %eax +; X64-NEXT: negb %cl +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shrl %cl, %edi +; X64-NEXT: addl %edi, %eax +; X64-NEXT: retq + %shr = shl i32 %x, %y + %sub = sub nsw i32 0, %y + %and = and i32 %sub, 31 + %shl = lshr i32 %x, %and + %add = add i32 %shl, %shr + ret i32 %add +} + +define i32 @test_invalid_rotr_var_and(i32 %x, i32 %y) { +; X86-LABEL: test_invalid_rotr_var_and: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: negb %cl +; X86-NEXT: shll %cl, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: test_invalid_rotr_var_and: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl %cl, %eax +; X64-NEXT: negb %cl +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shll %cl, %edi +; X64-NEXT: addl %edi, %eax +; X64-NEXT: retq + %shr = lshr i32 %x, %y + %sub = sub nsw i32 0, %y + %and = and i32 %sub, 31 + %shl = shl i32 %x, %and + %add = add i32 %shr, %shl + ret i32 %add +} + +define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) { +; X86-LABEL: test_fshl_special_case: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: shrl %eax +; X86-NEXT: notb %cl +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: test_fshl_special_case: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: shll %cl, %edi +; X64-NEXT: shrl %esi +; X64-NEXT: notb %cl +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shrl %cl, %esi +; X64-NEXT: leal (%rsi,%rdi), %eax +; X64-NEXT: retq + %shl = shl i32 %x0, %y + %srli = lshr i32 %x1, 1 + %x = xor i32 %y, 31 + %srlo = lshr i32 %srli, %x + %o = add i32 %shl, %srlo + ret i32 %o +} + +define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) { +; X86-LABEL: test_fshr_special_case: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: addl %eax, %eax +; X86-NEXT: notb %cl +; X86-NEXT: shll %cl, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: test_fshr_special_case: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: shrl %cl, %esi +; X64-NEXT: leal (%rdi,%rdi), %eax +; X64-NEXT: notb %cl +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shll %cl, %eax +; X64-NEXT: addl %esi, %eax +; X64-NEXT: retq + %shl = lshr i32 %x1, %y + %srli = shl i32 %x0, 1 + %x = xor i32 %y, 31 + %srlo = shl i32 %srli, %x + %o = add i32 %shl, %srlo + ret i32 %o +} + 
+define i64 @test_rotl_udiv_special_case(i64 %i) { +; X86-LABEL: test_rotl_udiv_special_case: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: pushl %edi +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: .cfi_offset %esi, -16 +; X86-NEXT: .cfi_offset %edi, -12 +; X86-NEXT: .cfi_offset %ebx, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: shrl %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %esi +; X86-NEXT: subl %esi, %ecx +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: shldl $28, %eax, %ecx +; X86-NEXT: shrdl $4, %eax, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: popl %ebx +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: test_rotl_udiv_special_case: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: rolq $60, %rax +; X64-NEXT: retq + %lhs_div = udiv i64 %i, 3 + %rhs_div = udiv i64 %i, 48 + %lhs_shift = shl i64 %lhs_div, 60 + %out = add i64 %lhs_shift, %rhs_div + ret i64 %out +} + +define i32 @test_rotl_mul_special_case(i32 %i) { +; X86-LABEL: test_rotl_mul_special_case: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,8), %eax +; X86-NEXT: roll $7, %eax +; X86-NEXT: retl +; +; X64-LABEL: test_rotl_mul_special_case: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: roll $7, %eax +; X64-NEXT: retq + %lhs_mul = mul i32 %i, 9 + %rhs_mul = mul i32 %i, 1152 + %lhs_shift = lshr i32 %lhs_mul, 25 + %out = add i32 %lhs_shift, %rhs_mul + ret i32 %out +} + +define i64 @test_rotl_mul_with_mask_special_case(i64 %i) { +; X86-LABEL: test_rotl_mul_with_mask_special_case: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,8), %ecx +; X86-NEXT: movl $9, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: shrdl $25, %eax, %edx +; X86-NEXT: movzbl %dl, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; X64-LABEL: test_rotl_mul_with_mask_special_case: +; X64: # %bb.0: +; X64-NEXT: leaq (%rdi,%rdi,8), %rax +; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; X64-NEXT: shll $7, %edi +; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: shrq $57, %rax +; X64-NEXT: orq %rcx, %rax +; X64-NEXT: retq + %lhs_mul = mul i64 %i, 1152 + %rhs_mul = mul i64 %i, 9 + %lhs_and = and i64 %lhs_mul, 160 + %rhs_shift = lshr i64 %rhs_mul, 57 + %out = add i64 %lhs_and, %rhs_shift + ret i64 %out +} + +define i32 @test_fshl_with_mask_special_case(i32 %x) { +; X86-LABEL: test_fshl_with_mask_special_case: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: orl $1, %eax +; 
X86-NEXT: shldl $5, %ecx, %eax +; X86-NEXT: andl $-31, %eax +; X86-NEXT: retl +; +; X64-LABEL: test_fshl_with_mask_special_case: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: orl $1, %eax +; X64-NEXT: shldl $5, %edi, %eax +; X64-NEXT: andl $-31, %eax +; X64-NEXT: retq + %or1 = or i32 %x, 1 + %sh1 = shl i32 %or1, 5 + %sh2 = lshr i32 %x, 27 + %1 = and i32 %sh2, 1 + %r = add i32 %sh1, %1 + ret i32 %r +}
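
A minimal sketch of the pattern these tests exercise, in plain LLVM IR (illustrative only; the two function names below are made up and do not appear in the patch): when the constant shift amounts of an shl/lshr pair on the same value sum to the bit width, the two shifted halves occupy disjoint bit ranges, so adding them can never carry and is equivalent to an or, i.e. a rotate. SelectionDAG can then lower the add-based form to ISD::ROTL/ROTR or a funnel shift on targets that have one, which is what the rotl/ror/shf.l.wrap/alignbit outputs above reflect for the constant-amount cases.

define i32 @rotl_by_7_sketch(i32 %x) {
  %hi = shl i32 %x, 7        ; result bits 31..7, low 7 bits are zero
  %lo = lshr i32 %x, 25      ; result bits 6..0
  %r = add i32 %hi, %lo      ; operands have disjoint bits, so add == or == rotl(x, 7)
  ret i32 %r
}

; Canonical equivalent as a funnel shift with both inputs equal:
define i32 @rotl_by_7_fshl(i32 %x) {
  %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 7)
  ret i32 %r
}

declare i32 @llvm.fshl.i32(i32, i32, i32)

The test_invalid_rotl_var_and and test_invalid_rotr_var_and cases are presumably named that way because the masked-negate amount is only interchangeable with a rotate under or, not add: for a shift amount of zero the add form computes x + x rather than x, so the fold must not fire there. The udiv and mul "special case" tests exercise the same disjoint-halves idea where the two addends come from strength-reduced divisions or multiplications rather than explicit shifts on the same value.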