From ebb5abea905a549f0b2cc14b3d796e54520e14a4 Mon Sep 17 00:00:00 2001 From: Marius Kamp Date: Tue, 12 Aug 2025 18:05:50 +0200 Subject: [PATCH 1/4] [SelectionDAG] Add Tests for Large UDIV/UREM by Constant; NFC --- llvm/test/CodeGen/X86/divide-by-constant.ll | 164 +++++++++++++ llvm/test/CodeGen/X86/divmod128.ll | 244 ++++++++++++++++++++ 2 files changed, 408 insertions(+) diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll index ac78136b9d8ea..14bcc22880697 100644 --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -1161,6 +1161,170 @@ entry: ret i64 %rem } +; PR137514 +define i64 @udiv_i64_magic_large_postshift(i64 %x) nounwind { +; X86-LABEL: udiv_i64_magic_large_postshift: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000 +; X86-NEXT: pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $28, %esp +; X86-NEXT: retl +; +; X64-LABEL: udiv_i64_magic_large_postshift: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $63, %rax +; X64-NEXT: retq + %ret = udiv i64 %x, 13835058055282163712 ; = 3 * 2^62 + ret i64 %ret +} + +; PR137514 +define i64 @urem_i64_magic_large_postshift(i64 %x) nounwind { +; X86-LABEL: urem_i64_magic_large_postshift: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000 +; X86-NEXT: pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __umoddi3 +; X86-NEXT: addl $28, %esp +; X86-NEXT: retl +; +; X64-LABEL: urem_i64_magic_large_postshift: +; X64: # %bb.0: +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq %rdx +; X64-NEXT: movabsq $4611686018427387904, %rax # imm = 0x4000000000000000 +; X64-NEXT: andq %rdx, %rax +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: retq + %ret = urem i64 %x, 13835058055282163712 ; = 3 * 2^62 + ret i64 %ret +} + +; PR137514 +define i64 @udiv_i64_magic_large_preshift(i64 %x) nounwind { +; X86-LABEL: udiv_i64_magic_large_preshift: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl $14 +; X86-NEXT: pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $28, %esp +; X86-NEXT: retl +; +; X64-LABEL: udiv_i64_magic_large_preshift: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq $33, %rax +; X64-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: retq + %ret = udiv i64 %x, 60129542144 ; = 14 * 2^32 + ret i64 %ret +} + +; PR137514 +define i64 @urem_i64_magic_large_preshift(i64 %x) nounwind { +; X86-LABEL: urem_i64_magic_large_preshift: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl $14 +; X86-NEXT: pushl $0 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __umoddi3 +; X86-NEXT: addl $28, %esp +; X86-NEXT: retl +; +; X64-LABEL: urem_i64_magic_large_preshift: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq $33, %rax +; X64-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; X64-NEXT: mulq %rcx +; X64-NEXT: movabsq $60129542144, %rax # imm = 0xE00000000 
+; X64-NEXT: imulq %rdx, %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq + %ret = urem i64 %x, 60129542144 ; = 14 * 2^32 + ret i64 %ret +} + +; PR137514 +define i64 @udiv_i64_magic_is_add(i64 %x) nounwind { +; X86-LABEL: udiv_i64_magic_is_add: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl $196608 # imm = 0x30000 +; X86-NEXT: pushl $-1 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __udivdi3 +; X86-NEXT: addl $28, %esp +; X86-NEXT: retl +; +; X64-LABEL: udiv_i64_magic_is_add: +; X64: # %bb.0: +; X64-NEXT: movabsq $6148789591883185367, %rcx # imm = 0x5554E38E5ED0FCD7 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: shrq %rdi +; X64-NEXT: leaq (%rdi,%rdx), %rax +; X64-NEXT: shrq $49, %rax +; X64-NEXT: retq + %ret = udiv i64 %x, 844429225099263 ; = 3 * 2^48 + 2^32 - 1 + ret i64 %ret +} + +; PR137514 +define i64 @urem_i64_magic_is_add(i64 %x) nounwind { +; X86-LABEL: urem_i64_magic_is_add: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl $196608 # imm = 0x30000 +; X86-NEXT: pushl $-1 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __umoddi3 +; X86-NEXT: addl $28, %esp +; X86-NEXT: retl +; +; X64-LABEL: urem_i64_magic_is_add: +; X64: # %bb.0: +; X64-NEXT: movabsq $6148789591883185367, %rcx # imm = 0x5554E38E5ED0FCD7 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: subq %rdx, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: addq %rdx, %rax +; X64-NEXT: shrq $49, %rax +; X64-NEXT: movabsq $844429225099263, %rcx # imm = 0x30000FFFFFFFF +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: subq %rcx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq + %ret = urem i64 %x, 844429225099263 ; = 3 * 2^48 + 2^32 - 1 + ret i64 %ret +} + ; Make sure we don't inline expand for optsize. 
define i64 @urem_i64_3_optsize(i64 %x) nounwind optsize { ; X86-LABEL: urem_i64_3_optsize: diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll index 3796dd796eaf9..9d54452404fb0 100644 --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -1013,3 +1013,247 @@ entry: %rem = urem i128 %x, 3 ret i128 %rem } + +; PR137514 +define i128 @udiv_magic_preshift_and_postshift(i128 %x) nounwind { +; X86-64-LABEL: udiv_magic_preshift_and_postshift: +; X86-64: # %bb.0: +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $22, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_magic_preshift_and_postshift: +; WIN64: # %bb.0: +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $22, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq + %ret = udiv i128 %x, 22 + ret i128 %ret +} + +; PR137514 +define i128 @urem_magic_preshift_and_postshift(i128 %x) nounwind { +; X86-64-LABEL: urem_magic_preshift_and_postshift: +; X86-64: # %bb.0: +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $22, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_magic_preshift_and_postshift: +; WIN64: # %bb.0: +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $22, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq + %ret = urem i128 %x, 22 + ret i128 %ret +} + +; PR137514 +define i128 @udiv_magic_large_preshift(i128 %x) nounwind { +; X86-64-LABEL: udiv_magic_large_preshift: +; X86-64: # %bb.0: +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000 +; X86-64-NEXT: xorl %edx, %edx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_magic_large_preshift: +; WIN64: # %bb.0: +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000 +; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq + %ret = udiv i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100 + ret i128 %ret +} + +; PR137514 +define i128 @urem_magic_large_preshift(i128 %x) nounwind { +; X86-64-LABEL: urem_magic_large_preshift: +; X86-64: # %bb.0: +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000 +; X86-64-NEXT: xorl %edx, %edx +; X86-64-NEXT: callq 
__umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_magic_large_preshift: +; WIN64: # %bb.0: +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000 +; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq + %ret = urem i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100 + ret i128 %ret +} + +; PR137514 +define i128 @udiv_magic_large_postshift(i128 %x) nounwind { +; X86-64-LABEL: udiv_magic_large_postshift: +; X86-64: # %bb.0: +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $1, %edx +; X86-64-NEXT: movl $1, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_magic_large_postshift: +; WIN64: # %bb.0: +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq + %ret = udiv i128 %x, 18446744073709551617 ; = 2^64 + 1 + ret i128 %ret +} + +; PR137514 +define i128 @urem_magic_large_postshift(i128 %x) nounwind { +; X86-64-LABEL: urem_magic_large_postshift: +; X86-64: # %bb.0: +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $1, %edx +; X86-64-NEXT: movl $1, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_magic_large_postshift: +; WIN64: # %bb.0: +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq + %ret = urem i128 %x, 18446744073709551617 ; = 2^64 + 1 + ret i128 %ret +} + +; PR137514 +define i128 @udiv_magic_is_add(i128 %x) nounwind { +; X86-64-LABEL: udiv_magic_is_add: +; X86-64: # %bb.0: +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X86-64-NEXT: movl $1, %edx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_magic_is_add: +; WIN64: # %bb.0: +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: 
addq $72, %rsp +; WIN64-NEXT: retq + %ret = udiv i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1 + ret i128 %ret +} + +; PR137514 +define i128 @urem_magic_is_add(i128 %x) nounwind { +; X86-64-LABEL: urem_magic_is_add: +; X86-64: # %bb.0: +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X86-64-NEXT: movl $1, %edx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_magic_is_add: +; WIN64: # %bb.0: +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq + %ret = urem i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1 + ret i128 %ret +} From 5c02eca1543e8905bd1ee154559ee2404c2f525c Mon Sep 17 00:00:00 2001 From: Marius Kamp Date: Tue, 19 Aug 2025 05:39:58 +0200 Subject: [PATCH 2/4] [SelectionDAG] Adjust Existing Tests; NFC Add new test prefixes to some tests. Currently, these prefixes are unused but a subsequent commit will change the test result such that they become necessary. Furthermore, rename tests that will be folded after a subsequent commit. --- llvm/test/CodeGen/PowerPC/urem-lkk.ll | 12 +++++++----- llvm/test/CodeGen/RISCV/urem-lkk.ll | 11 +++++------ llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 11 +++++------ 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/test/CodeGen/PowerPC/urem-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-lkk.ll index 43a1e5a2faf6d..03fd0c0c7e8e2 100644 --- a/llvm/test/CodeGen/PowerPC/urem-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-lkk.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc64 < %s | FileCheck --check-prefix=CHECK %s -; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc64 < %s | FileCheck --check-prefixes=CHECK,PPC32 %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc < %s | FileCheck --check-prefixes=CHECK,PPC64 %s define i32 @fold_urem_positive_odd(i32 %x) { ; CHECK-LABEL: fold_urem_positive_odd: @@ -85,9 +85,8 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) { ret i32 %1 } -; Don't fold i64 urem -define i64 @dont_fold_urem_i64(i64 %x) { -; CHECK-LABEL: dont_fold_urem_i64: +define i64 @fold_urem_i64(i64 %x) { +; CHECK-LABEL: fold_urem_i64: ; CHECK: # %bb.0: ; CHECK-NEXT: mflr 0 ; CHECK-NEXT: stwu 1, -16(1) @@ -104,3 +103,6 @@ define i64 @dont_fold_urem_i64(i64 %x) { %1 = urem i64 %x, 98 ret i64 %1 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; PPC32: {{.*}} +; PPC64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll index ee496123ba7b4..017b2d36bdd58 100644 --- a/llvm/test/CodeGen/RISCV/urem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -218,9 +218,8 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) nounwind { ret i32 %1 } -; Don't fold i64 urem -define i64 @dont_fold_urem_i64(i64 %x) nounwind { -; RV32I-LABEL: dont_fold_urem_i64: +define i64 @fold_urem_i64(i64 %x) nounwind { +; RV32I-LABEL: fold_urem_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill @@ -231,7 +230,7 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_urem_i64: +; RV32IM-LABEL: fold_urem_i64: ; RV32IM: # %bb.0: ; RV32IM-NEXT: addi sp, sp, -16 ; RV32IM-NEXT: sw ra, 12(sp) # 4-byte Folded Spill @@ -242,12 +241,12 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind { ; RV32IM-NEXT: addi sp, sp, 16 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_urem_i64: +; RV64I-LABEL: fold_urem_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, 98 ; RV64I-NEXT: tail __umoddi3 ; -; RV64IM-LABEL: dont_fold_urem_i64: +; RV64IM-LABEL: fold_urem_i64: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lui a1, %hi(.LCPI6_0) ; RV64IM-NEXT: ld a1, %lo(.LCPI6_0)(a1) diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index 5a3dfd118307d..ec97e7a0ae558 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -778,9 +778,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ret <4 x i16> %1 } -; Don't fold i64 urem. -define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { -; RV32I-LABEL: dont_fold_urem_i64: +define <4 x i64> @fold_urem_i64(<4 x i64> %x) nounwind { +; RV32I-LABEL: fold_urem_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -48 ; RV32I-NEXT: sw ra, 44(sp) # 4-byte Folded Spill @@ -850,7 +849,7 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: addi sp, sp, 48 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_urem_i64: +; RV32IM-LABEL: fold_urem_i64: ; RV32IM: # %bb.0: ; RV32IM-NEXT: addi sp, sp, -48 ; RV32IM-NEXT: sw ra, 44(sp) # 4-byte Folded Spill @@ -920,7 +919,7 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: addi sp, sp, 48 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_urem_i64: +; RV64I-LABEL: fold_urem_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -48 ; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill @@ -956,7 +955,7 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_urem_i64: +; RV64IM-LABEL: fold_urem_i64: ; RV64IM: # %bb.0: ; RV64IM-NEXT: ld a2, 8(a1) ; RV64IM-NEXT: ld a3, 16(a1) From ea788d4ad786df3cbddd837be243861819147b49 Mon Sep 17 00:00:00 2001 From: Marius Kamp Date: Tue, 15 Jul 2025 16:21:25 +0200 Subject: [PATCH 3/4] [SelectionDAG] Move UREM Decomposition to Own Function; NFC --- llvm/include/llvm/CodeGen/TargetLowering.h | 4 + .../CodeGen/SelectionDAG/TargetLowering.cpp | 73 +++++++++++-------- 2 files changed, 47 insertions(+), 30 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 4480ced637456..ae6330d8f5163 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5840,6 +5840,10 
@@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
   SDValue buildSREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue CompTargetNode,
                           ISD::CondCode Cond, DAGCombinerInfo &DCI,
                           const SDLoc &DL) const;
+
+  bool expandUDIVREMByConstantViaUREMDecomposition(
+      SDNode *N, APInt Divisor, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
+      SelectionDAG &DAG, SDValue LL, SDValue LH) const;
 };
 
 /// Given an LLVM IR type and return type attributes, compute the return value
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 402a012e8e555..eead71270d10f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8011,25 +8011,12 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
 // dividend and multiply by the multiplicative inverse of the shifted divisor.
 // If we want the remainder, we shift the value left by the number of trailing
 // zeros and add the bits that were shifted out of the dividend.
-bool TargetLowering::expandDIVREMByConstant(SDNode *N,
-                                            SmallVectorImpl<SDValue> &Result,
-                                            EVT HiLoVT, SelectionDAG &DAG,
-                                            SDValue LL, SDValue LH) const {
+bool TargetLowering::expandUDIVREMByConstantViaUREMDecomposition(
+    SDNode *N, APInt Divisor, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
+    SelectionDAG &DAG, SDValue LL, SDValue LH) const {
   unsigned Opcode = N->getOpcode();
   EVT VT = N->getValueType(0);
 
-  // TODO: Support signed division/remainder.
-  if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
-    return false;
-  assert(
-      (Opcode == ISD::UREM || Opcode == ISD::UDIV || Opcode == ISD::UDIVREM) &&
-      "Unexpected opcode");
-
-  auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (!CN)
-    return false;
-
-  APInt Divisor = CN->getAPIntValue();
   unsigned BitWidth = Divisor.getBitWidth();
   unsigned HBitWidth = BitWidth / 2;
   assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -8040,20 +8027,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
   if (Divisor.uge(HalfMaxPlus1))
     return false;
 
-  // We depend on the UREM by constant optimization in DAGCombiner that requires
-  // high multiply.
-  if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
-      !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
-    return false;
-
-  // Don't expand if optimizing for size.
-  if (DAG.shouldOptForSize())
-    return false;
-
-  // Early out for 0 or 1 divisors.
-  if (Divisor.ule(1))
-    return false;
-
   // If the divisor is even, shift it until it becomes odd.
   unsigned TrailingZeros = 0;
   if (!Divisor[0]) {
@@ -8164,6 +8137,46 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
   return true;
 }
 
+bool TargetLowering::expandDIVREMByConstant(SDNode *N,
+                                            SmallVectorImpl<SDValue> &Result,
+                                            EVT HiLoVT, SelectionDAG &DAG,
+                                            SDValue LL, SDValue LH) const {
+  unsigned Opcode = N->getOpcode();
+
+  // TODO: Support signed division/remainder.
+  if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
+    return false;
+  assert(
+      (Opcode == ISD::UREM || Opcode == ISD::UDIV || Opcode == ISD::UDIVREM) &&
+      "Unexpected opcode");
+
+  auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!CN)
+    return false;
+
+  APInt Divisor = CN->getAPIntValue();
+
+  // We depend on the UREM by constant optimization in DAGCombiner that requires
+  // high multiply.
+  if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
+      !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
+    return false;
+
+  // Don't expand if optimizing for size.
+  if (DAG.shouldOptForSize())
+    return false;
+
+  // Early out for 0 or 1 divisors.
+  if (Divisor.ule(1))
+    return false;
+
+  if (expandUDIVREMByConstantViaUREMDecomposition(N, Divisor, Result, HiLoVT,
+                                                  DAG, LL, LH))
+    return true;
+
+  return false;
+}
+
 // Check that (every element of) Z is undef or not an exact multiple of BW.
 static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) {
   return ISD::matchUnaryPredicate(

From fc79e27e4843af1723ea78de6e8ac4cb292829f2 Mon Sep 17 00:00:00 2001
From: Marius Kamp
Date: Tue, 12 Aug 2025 16:49:49 +0200
Subject: [PATCH 4/4] [SelectionDAG] Use Magic Algorithm for Splitting UDIV/UREM by Constant

For integer types twice as large as a legal type, we have previously
generated a library call if another splitting technique was not
applicable. With this change, we use an adaptation of the Magic
algorithm. This algorithm is also used for UDIV/UREM by constants on
legal types. The implementation introduced here is a simple port of the
already existing implementation to types twice the size of a legal
type.

The core idea of this algorithm is to replace (udiv x c) for a constant
c with the bits at or above the s-th bit of the product of x and
(2^s + o)/c, for suitable s and o. More details are available in
Henry S. Warren, Jr.: "Hacker's Delight", chapter 10.
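As a minimal, self-contained illustration of that idea (not code from
this patch): for a 64-bit udiv by 3 one can choose s = 65 and the magic
constant m = ceil(2^65 / 3) = 0xAAAAAAAAAAAAAAAB, so that x / 3 equals
the bits of x * m at position 65 and above. The sketch below assumes a
compiler with the unsigned __int128 extension (GCC/Clang); the helper
name udiv3 exists only for the example.

  #include <cassert>
  #include <cstdint>

  uint64_t udiv3(uint64_t X) {
    const uint64_t Magic = 0xAAAAAAAAAAAAAAABull;          // ceil(2^65 / 3)
    unsigned __int128 Prod = (unsigned __int128)X * Magic; // 128-bit product
    return (uint64_t)(Prod >> 65);                         // keep bits >= 65
  }

  int main() {
    const uint64_t Tests[] = {0, 1, 2, 3, 1000000007, UINT64_MAX};
    for (uint64_t X : Tests)
      assert(udiv3(X) == X / 3); // magic multiply matches plain division
    return 0;
  }

The expansion added here consumes the UnsignedDivisionByConstantInfo
result the same way the legal-type path does: an optional pre-shift for
even divisors, the IsAdd fix-up when the magic constant does not fit
into the type, and a final post-shift.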
Efficient handling of UDIV/UREM by constants on types twice as large as
a legal type is mostly relevant for 32-bit platforms, but some projects
also benefit on 64-bit platforms. For example, the `fmt` library for
C++ uses 128-bit unsigned divisions by 100 and 10000, which were not
covered by the previously existing optimizations.

Closes #137514.
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   5 +
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 111 +++
 llvm/test/CodeGen/AArch64/rem-by-const.ll     | 280 ++++++--
 llvm/test/CodeGen/ARM/funnel-shift.ll         | 240 ++++---
 llvm/test/CodeGen/Mips/funnel-shift.ll        | 373 +++++-----
 llvm/test/CodeGen/PowerPC/funnel-shift.ll     | 312 ++++----
 llvm/test/CodeGen/PowerPC/urem-lkk.ll         |  96 ++-
 llvm/test/CodeGen/RISCV/div-by-constant.ll    |  49 +-
 .../CodeGen/RISCV/split-udiv-by-constant.ll   | 172 ++++-
 .../CodeGen/RISCV/split-urem-by-constant.ll   | 220 +++++-
 llvm/test/CodeGen/RISCV/urem-lkk.ll           |  51 +-
 llvm/test/CodeGen/RISCV/urem-vector-lkk.ll    | 193 +++--
 llvm/test/CodeGen/X86/divide-by-constant.ll   | 410 +++++++++--
 llvm/test/CodeGen/X86/divmod128.ll            | 669 ++++++++++++++----
 llvm/test/CodeGen/X86/funnel-shift.ll         | 112 ++-
 llvm/test/CodeGen/X86/i128-udiv.ll            |  46 +-
 16 files changed, 2396 insertions(+), 943 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index ae6330d8f5163..76c833a7adcb5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5844,6 +5844,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
   bool expandUDIVREMByConstantViaUREMDecomposition(
       SDNode *N, APInt Divisor, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
       SelectionDAG &DAG, SDValue LL, SDValue LH) const;
+
+  bool expandUDIVREMByConstantViaUMulHiMagic(SDNode *N, const APInt &Divisor,
+                                             SmallVectorImpl<SDValue> &Result,
+                                             EVT HiLoVT, SelectionDAG &DAG,
+                                             SDValue LL, SDValue LH) const;
 };
 
 /// Given an LLVM IR type and return type attributes, compute the return value
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index eead71270d10f..d17fe2f263452 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8137,6 +8137,113 @@ bool TargetLowering::expandUDIVREMByConstantViaUREMDecomposition(
   return true;
 }
 
+bool TargetLowering::expandUDIVREMByConstantViaUMulHiMagic(
+    SDNode *N, const APInt &Divisor, SmallVectorImpl<SDValue> &Result,
+    EVT HiLoVT, SelectionDAG &DAG, SDValue LL, SDValue LH) const {
+
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N0->getValueType(0);
+  SDLoc DL{N};
+
+  assert(!Divisor.isOne() && "Magic algorithm does not work for division by 1");
+
+  // This helper creates a MUL_LOHI of the pair (LL, LH) by a constant.
+  auto MakeMUL_LOHIByConst = [&](unsigned Opc, SDValue LL, SDValue LH,
+                                 const APInt &Const,
+                                 SmallVectorImpl<SDValue> &Result) {
+    SDValue LHS = DAG.getNode(ISD::BUILD_PAIR, DL, VT, LL, LH);
+    SDValue RHS = DAG.getConstant(Const, DL, VT);
+    auto [RL, RH] = DAG.SplitScalar(RHS, DL, HiLoVT, HiLoVT);
+    return expandMUL_LOHI(Opc, VT, DL, LHS, RHS, Result, HiLoVT, DAG,
+                          TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
+                          LL, LH, RL, RH);
+  };
+
+  // This helper creates an ADD/SUB of the pairs (LL, LH) and (RL, RH).
+  auto MakeAddSubLong = [&](unsigned Opc, SDValue LL, SDValue LH, SDValue RL,
+                            SDValue RH) {
+    SDValue AddSubNode =
+        DAG.getNode(Opc == ISD::ADD ? ISD::UADDO : ISD::USUBO, DL,
+                    DAG.getVTList(HiLoVT, MVT::i1), LL, RL);
+    SDValue OutL, OutH, Overflow;
+    expandUADDSUBO(AddSubNode.getNode(), OutL, Overflow, DAG);
+    SDValue WithOverflow = DAG.getNode(
+        Opc, DL, HiLoVT, LH, DAG.getZExtOrTrunc(Overflow, DL, HiLoVT));
+    OutH = DAG.getNode(Opc, DL, HiLoVT, WithOverflow, RH);
+    return std::make_pair(OutL, OutH);
+  };
+
+  // This helper creates a SRL of the pair (LL, LH) by Shift.
+  auto MakeSRLLong = [&](SDValue LL, SDValue LH, unsigned Shift) {
+    unsigned HBitWidth = HiLoVT.getScalarSizeInBits();
+    if (Shift < HBitWidth) {
+      SDValue ShAmt = DAG.getShiftAmountConstant(Shift, HiLoVT, DL);
+      SDValue ResL = DAG.getNode(ISD::FSHR, DL, HiLoVT, LH, LL, ShAmt);
+      SDValue ResH = DAG.getNode(ISD::SRL, DL, HiLoVT, LH, ShAmt);
+      return std::make_pair(ResL, ResH);
+    }
+    SDValue Zero = DAG.getConstant(0, DL, HiLoVT);
+    if (Shift == HBitWidth)
+      return std::make_pair(LH, Zero);
+    assert(Shift - HBitWidth < HBitWidth &&
+           "We shouldn't generate an undefined shift");
+    SDValue ShAmt = DAG.getShiftAmountConstant(Shift - HBitWidth, HiLoVT, DL);
+    return std::make_pair(DAG.getNode(ISD::SRL, DL, HiLoVT, LH, ShAmt), Zero);
+  };
+
+  // Knowledge of leading zeros may help to reduce the multiplier.
+ unsigned KnownLeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros(); + + UnsignedDivisionByConstantInfo Magics = UnsignedDivisionByConstantInfo::get( + Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero())); + + assert(!LL == !LH && "Expected both input halves or no input halves!"); + if (!LL) + std::tie(LL, LH) = DAG.SplitScalar(N0, DL, HiLoVT, HiLoVT); + SDValue QL = LL; + SDValue QH = LH; + if (Magics.PreShift != 0) + std::tie(QL, QH) = MakeSRLLong(QL, QH, Magics.PreShift); + + SmallVector UMulResult; + if (!MakeMUL_LOHIByConst(ISD::UMUL_LOHI, QL, QH, Magics.Magic, UMulResult)) + return false; + + QL = UMulResult[2]; + QH = UMulResult[3]; + + if (Magics.IsAdd) { + auto [NPQL, NPQH] = MakeAddSubLong(ISD::SUB, LL, LH, QL, QH); + std::tie(NPQL, NPQH) = MakeSRLLong(NPQL, NPQH, 1); + std::tie(QL, QH) = MakeAddSubLong(ISD::ADD, NPQL, NPQH, QL, QH); + } + + if (Magics.PostShift != 0) + std::tie(QL, QH) = MakeSRLLong(QL, QH, Magics.PostShift); + + unsigned Opcode = N->getOpcode(); + if (Opcode != ISD::UREM) { + Result.push_back(QL); + Result.push_back(QH); + } + + if (Opcode != ISD::UDIV) { + SmallVector MulResult; + if (!MakeMUL_LOHIByConst(ISD::MUL, QL, QH, Divisor, MulResult)) + return false; + + assert(MulResult.size() == 2); + + auto [RemL, RemH] = + MakeAddSubLong(ISD::SUB, LL, LH, MulResult[0], MulResult[1]); + + Result.push_back(RemL); + Result.push_back(RemH); + } + + return true; +} + bool TargetLowering::expandDIVREMByConstant(SDNode *N, SmallVectorImpl &Result, EVT HiLoVT, SelectionDAG &DAG, @@ -8174,6 +8281,10 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N, DAG, LL, LH)) return true; + if (expandUDIVREMByConstantViaUMulHiMagic(N, Divisor, Result, HiLoVT, DAG, LL, + LH)) + return true; + return false; } diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index c57383ad9b1e7..0554b2e66a0be 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -513,13 +513,50 @@ entry: define i128 @ui128_7(i128 %a, i128 %b) { ; CHECK-SD-LABEL: ui128_7: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: .cfi_offset w30, -16 -; CHECK-SD-NEXT: mov w2, #7 // =0x7 -; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: mov x8, #9362 // =0x2492 +; CHECK-SD-NEXT: mov x11, #18725 // =0x4925 +; CHECK-SD-NEXT: movk x8, #37449, lsl #16 +; CHECK-SD-NEXT: movk x11, #9362, lsl #16 +; CHECK-SD-NEXT: movk x8, #18724, lsl #32 +; CHECK-SD-NEXT: movk x11, #37449, lsl #32 +; CHECK-SD-NEXT: movk x8, #9362, lsl #48 +; CHECK-SD-NEXT: movk x11, #18724, lsl #48 +; CHECK-SD-NEXT: mul x10, x0, x8 +; CHECK-SD-NEXT: umulh x12, x0, x11 +; CHECK-SD-NEXT: umulh x9, x0, x8 +; CHECK-SD-NEXT: umulh x14, x1, x11 +; CHECK-SD-NEXT: adds x10, x12, x10 +; CHECK-SD-NEXT: mul x11, x1, x11 +; CHECK-SD-NEXT: cinc x9, x9, hs +; CHECK-SD-NEXT: umulh x13, x1, x8 +; CHECK-SD-NEXT: mul x8, x1, x8 +; CHECK-SD-NEXT: cmn x10, x11 +; CHECK-SD-NEXT: adcs x9, x9, x14 +; CHECK-SD-NEXT: cinc x10, x13, hs +; CHECK-SD-NEXT: adds x11, x9, x8 +; CHECK-SD-NEXT: cinc x12, x10, hs +; CHECK-SD-NEXT: subs x13, x0, x11 +; CHECK-SD-NEXT: cset w14, lo +; CHECK-SD-NEXT: sub x14, x1, x14 +; CHECK-SD-NEXT: sub x12, x14, x12 +; CHECK-SD-NEXT: extr x13, x12, x13, #1 +; CHECK-SD-NEXT: lsr x12, x12, #1 +; CHECK-SD-NEXT: adds x11, x13, x11 +; CHECK-SD-NEXT: cinc x12, x12, hs +; CHECK-SD-NEXT: cmn x9, x8 +; CHECK-SD-NEXT: adc x8, x12, x10 +; CHECK-SD-NEXT: mov w10, #7 // =0x7 +; CHECK-SD-NEXT: extr x9, x8, x11, #2 +; CHECK-SD-NEXT: lsr x8, x8, #2 +; CHECK-SD-NEXT: umulh x10, x9, x10 +; CHECK-SD-NEXT: lsl x11, x9, #3 +; CHECK-SD-NEXT: sub x9, x11, x9 +; CHECK-SD-NEXT: subs x0, x0, x9 +; CHECK-SD-NEXT: cset w9, lo +; CHECK-SD-NEXT: sub x10, x10, x8 +; CHECK-SD-NEXT: sub x9, x1, x9 +; CHECK-SD-NEXT: add x8, x10, x8, lsl #3 +; CHECK-SD-NEXT: sub x1, x9, x8 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: ui128_7: @@ -596,13 +633,38 @@ entry: define i128 @ui128_100(i128 %a, i128 %b) { ; CHECK-SD-LABEL: ui128_100: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: .cfi_offset w30, -16 -; CHECK-SD-NEXT: mov w2, #100 // =0x64 -; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: mov x8, #62914 // =0xf5c2 +; CHECK-SD-NEXT: mov x11, #23593 // =0x5c29 +; CHECK-SD-NEXT: movk x8, #23592, lsl #16 +; CHECK-SD-NEXT: movk x11, #49807, lsl #16 +; CHECK-SD-NEXT: movk x8, #49807, lsl #32 +; CHECK-SD-NEXT: movk x11, #10485, lsl #32 +; CHECK-SD-NEXT: movk x8, #10485, lsl #48 +; CHECK-SD-NEXT: movk x11, #36700, lsl #48 +; CHECK-SD-NEXT: mul x10, x0, x8 +; CHECK-SD-NEXT: umulh x12, x0, x11 +; CHECK-SD-NEXT: umulh x9, x0, x8 +; CHECK-SD-NEXT: umulh x14, x1, x11 +; CHECK-SD-NEXT: adds x10, x12, x10 +; CHECK-SD-NEXT: mul x11, x1, x11 +; CHECK-SD-NEXT: cinc x9, x9, hs +; CHECK-SD-NEXT: umulh x13, x1, x8 +; CHECK-SD-NEXT: mul x8, x1, x8 +; CHECK-SD-NEXT: cmn x10, x11 +; CHECK-SD-NEXT: adcs x9, x9, x14 +; CHECK-SD-NEXT: cinc x10, x13, hs +; CHECK-SD-NEXT: adds x8, x9, x8 +; CHECK-SD-NEXT: cinc x9, x10, hs +; CHECK-SD-NEXT: mov w10, #100 // =0x64 +; CHECK-SD-NEXT: extr x8, x9, x8, #4 +; CHECK-SD-NEXT: lsr x9, x9, #4 +; CHECK-SD-NEXT: umulh x11, x8, x10 +; CHECK-SD-NEXT: mul x8, x8, x10 +; CHECK-SD-NEXT: madd x9, x9, x10, x11 +; CHECK-SD-NEXT: subs x0, x0, x8 +; CHECK-SD-NEXT: cset w8, lo +; CHECK-SD-NEXT: sub x8, x1, x8 +; CHECK-SD-NEXT: sub x1, x8, x9 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: ui128_100: @@ -3204,34 +3266,85 @@ entry: define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) { ; CHECK-SD-LABEL: uv2i128_7: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill -; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 -; CHECK-SD-NEXT: .cfi_offset w19, -8 -; CHECK-SD-NEXT: .cfi_offset w20, -16 -; CHECK-SD-NEXT: .cfi_offset w21, -24 -; CHECK-SD-NEXT: .cfi_offset w22, -32 -; CHECK-SD-NEXT: .cfi_offset w30, -48 -; CHECK-SD-NEXT: mov x19, x3 -; CHECK-SD-NEXT: mov x20, x2 -; CHECK-SD-NEXT: mov w2, #7 // =0x7 -; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: mov x21, x0 -; CHECK-SD-NEXT: mov x22, x1 -; CHECK-SD-NEXT: mov x0, x20 -; CHECK-SD-NEXT: mov x1, x19 -; CHECK-SD-NEXT: mov w2, #7 // =0x7 -; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: mov x2, x0 -; CHECK-SD-NEXT: mov x3, x1 -; CHECK-SD-NEXT: mov x0, x21 -; CHECK-SD-NEXT: mov x1, x22 -; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-SD-NEXT: mov x8, #9362 // =0x2492 +; CHECK-SD-NEXT: mov x11, #18725 // =0x4925 +; CHECK-SD-NEXT: movk x8, #37449, lsl #16 +; CHECK-SD-NEXT: movk x11, #9362, lsl #16 +; CHECK-SD-NEXT: movk x8, #18724, lsl #32 +; CHECK-SD-NEXT: movk x11, #37449, lsl #32 +; CHECK-SD-NEXT: movk x8, #9362, lsl #48 +; CHECK-SD-NEXT: movk x11, #18724, lsl #48 +; CHECK-SD-NEXT: mul x10, x0, x8 +; CHECK-SD-NEXT: umulh x12, x0, x11 +; CHECK-SD-NEXT: umulh x9, x0, x8 +; CHECK-SD-NEXT: mul x15, x1, x11 +; CHECK-SD-NEXT: adds x10, x12, x10 +; CHECK-SD-NEXT: umulh x14, x1, x11 +; CHECK-SD-NEXT: cinc x9, x9, hs +; CHECK-SD-NEXT: umulh x13, x1, x8 +; CHECK-SD-NEXT: cmn x10, x15 +; CHECK-SD-NEXT: mul x16, x1, x8 +; CHECK-SD-NEXT: adcs x9, x9, x14 +; CHECK-SD-NEXT: mul x12, x2, x8 +; 
CHECK-SD-NEXT: cinc x13, x13, hs +; CHECK-SD-NEXT: umulh x10, x2, x11 +; CHECK-SD-NEXT: adds x14, x9, x16 +; CHECK-SD-NEXT: cinc x15, x13, hs +; CHECK-SD-NEXT: subs x18, x0, x14 +; CHECK-SD-NEXT: umulh x17, x2, x8 +; CHECK-SD-NEXT: cset w5, lo +; CHECK-SD-NEXT: sub x5, x1, x5 +; CHECK-SD-NEXT: umulh x6, x3, x11 +; CHECK-SD-NEXT: sub x15, x5, x15 +; CHECK-SD-NEXT: extr x18, x15, x18, #1 +; CHECK-SD-NEXT: mul x11, x3, x11 +; CHECK-SD-NEXT: lsr x15, x15, #1 +; CHECK-SD-NEXT: umulh x4, x3, x8 +; CHECK-SD-NEXT: adds x14, x18, x14 +; CHECK-SD-NEXT: cinc x15, x15, hs +; CHECK-SD-NEXT: cmn x9, x16 +; CHECK-SD-NEXT: mul x8, x3, x8 +; CHECK-SD-NEXT: adc x9, x15, x13 +; CHECK-SD-NEXT: adds x10, x10, x12 +; CHECK-SD-NEXT: cinc x12, x17, hs +; CHECK-SD-NEXT: cmn x10, x11 +; CHECK-SD-NEXT: adcs x10, x12, x6 +; CHECK-SD-NEXT: cinc x11, x4, hs +; CHECK-SD-NEXT: adds x12, x10, x8 +; CHECK-SD-NEXT: cinc x13, x11, hs +; CHECK-SD-NEXT: subs x15, x2, x12 +; CHECK-SD-NEXT: cset w16, lo +; CHECK-SD-NEXT: sub x16, x3, x16 +; CHECK-SD-NEXT: sub x13, x16, x13 +; CHECK-SD-NEXT: extr x15, x13, x15, #1 +; CHECK-SD-NEXT: lsr x13, x13, #1 +; CHECK-SD-NEXT: adds x12, x15, x12 +; CHECK-SD-NEXT: cinc x13, x13, hs +; CHECK-SD-NEXT: cmn x10, x8 +; CHECK-SD-NEXT: extr x8, x9, x14, #2 +; CHECK-SD-NEXT: adc x10, x13, x11 +; CHECK-SD-NEXT: mov w11, #7 // =0x7 +; CHECK-SD-NEXT: lsr x9, x9, #2 +; CHECK-SD-NEXT: extr x12, x10, x12, #2 +; CHECK-SD-NEXT: umulh x13, x8, x11 +; CHECK-SD-NEXT: lsl x14, x8, #3 +; CHECK-SD-NEXT: lsr x10, x10, #2 +; CHECK-SD-NEXT: umulh x11, x12, x11 +; CHECK-SD-NEXT: lsl x15, x12, #3 +; CHECK-SD-NEXT: sub x8, x14, x8 +; CHECK-SD-NEXT: subs x0, x0, x8 +; CHECK-SD-NEXT: sub x8, x15, x12 +; CHECK-SD-NEXT: cset w12, lo +; CHECK-SD-NEXT: sub x13, x13, x9 +; CHECK-SD-NEXT: subs x2, x2, x8 +; CHECK-SD-NEXT: add x8, x13, x9, lsl #3 +; CHECK-SD-NEXT: sub x11, x11, x10 +; CHECK-SD-NEXT: add x9, x11, x10, lsl #3 +; CHECK-SD-NEXT: cset w10, lo +; CHECK-SD-NEXT: sub x11, x1, x12 +; CHECK-SD-NEXT: sub x10, x3, x10 +; CHECK-SD-NEXT: sub x1, x11, x8 +; CHECK-SD-NEXT: sub x3, x10, x9 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: uv2i128_7: @@ -3361,34 +3474,61 @@ entry: define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-SD-LABEL: uv2i128_100: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill -; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 -; CHECK-SD-NEXT: .cfi_offset w19, -8 -; CHECK-SD-NEXT: .cfi_offset w20, -16 -; CHECK-SD-NEXT: .cfi_offset w21, -24 -; CHECK-SD-NEXT: .cfi_offset w22, -32 -; CHECK-SD-NEXT: .cfi_offset w30, -48 -; CHECK-SD-NEXT: mov x19, x3 -; CHECK-SD-NEXT: mov x20, x2 -; CHECK-SD-NEXT: mov w2, #100 // =0x64 -; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: mov x21, x0 -; CHECK-SD-NEXT: mov x22, x1 -; CHECK-SD-NEXT: mov x0, x20 -; CHECK-SD-NEXT: mov x1, x19 -; CHECK-SD-NEXT: mov w2, #100 // =0x64 -; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: mov x2, x0 -; CHECK-SD-NEXT: mov x3, x1 -; CHECK-SD-NEXT: mov x0, x21 -; CHECK-SD-NEXT: mov x1, x22 -; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-SD-NEXT: mov x8, #62914 // =0xf5c2 +; CHECK-SD-NEXT: mov x11, #23593 // =0x5c29 +; CHECK-SD-NEXT: movk x8, #23592, lsl #16 +; CHECK-SD-NEXT: movk x11, #49807, lsl #16 +; CHECK-SD-NEXT: movk x8, #49807, lsl #32 +; CHECK-SD-NEXT: movk x11, #10485, lsl #32 +; CHECK-SD-NEXT: movk x8, #10485, lsl #48 +; CHECK-SD-NEXT: movk x11, #36700, lsl #48 +; CHECK-SD-NEXT: mul x10, x0, x8 +; CHECK-SD-NEXT: umulh x12, x0, x11 +; CHECK-SD-NEXT: umulh x9, x0, x8 +; CHECK-SD-NEXT: mul x15, x1, x11 +; CHECK-SD-NEXT: adds x10, x12, x10 +; CHECK-SD-NEXT: mov w12, #100 // =0x64 +; CHECK-SD-NEXT: umulh x14, x1, x11 +; CHECK-SD-NEXT: cinc x9, x9, hs +; CHECK-SD-NEXT: umulh x13, x1, x8 +; CHECK-SD-NEXT: cmn x10, x15 +; CHECK-SD-NEXT: mul x16, x1, x8 +; CHECK-SD-NEXT: adcs x9, x9, x14 +; CHECK-SD-NEXT: mul x15, x2, x8 +; CHECK-SD-NEXT: cinc x10, x13, hs +; CHECK-SD-NEXT: umulh x14, x2, x8 +; CHECK-SD-NEXT: adds x9, x9, x16 +; CHECK-SD-NEXT: cinc x10, x10, hs +; CHECK-SD-NEXT: umulh x16, x2, x11 +; CHECK-SD-NEXT: extr x9, x10, x9, #4 +; CHECK-SD-NEXT: lsr x10, x10, #4 +; CHECK-SD-NEXT: umulh x18, x3, x11 +; CHECK-SD-NEXT: mul x13, x9, x12 +; CHECK-SD-NEXT: mul x11, x3, x11 +; CHECK-SD-NEXT: umulh x17, x3, x8 +; CHECK-SD-NEXT: subs x0, x0, x13 +; CHECK-SD-NEXT: mul x8, x3, x8 +; CHECK-SD-NEXT: cset w13, lo +; CHECK-SD-NEXT: adds x15, x16, x15 +; CHECK-SD-NEXT: cinc x14, x14, hs +; CHECK-SD-NEXT: cmn x15, x11 +; CHECK-SD-NEXT: adcs x11, x14, x18 +; CHECK-SD-NEXT: umulh x9, x9, x12 +; CHECK-SD-NEXT: cinc x14, x17, hs +; CHECK-SD-NEXT: adds x8, x11, x8 +; CHECK-SD-NEXT: madd x9, x10, x12, x9 +; CHECK-SD-NEXT: cinc x11, x14, hs +; CHECK-SD-NEXT: extr x8, x11, x8, #4 +; CHECK-SD-NEXT: lsr x11, x11, #4 +; CHECK-SD-NEXT: umulh x14, x8, x12 +; CHECK-SD-NEXT: mul x8, x8, x12 +; CHECK-SD-NEXT: madd x10, x11, x12, x14 +; CHECK-SD-NEXT: sub x11, x1, x13 +; CHECK-SD-NEXT: sub x1, x11, x9 +; CHECK-SD-NEXT: subs x2, x2, x8 +; CHECK-SD-NEXT: cset w8, lo +; CHECK-SD-NEXT: sub x8, x3, x8 +; CHECK-SD-NEXT: sub x3, x8, x10 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: uv2i128_100: diff --git a/llvm/test/CodeGen/ARM/funnel-shift.ll b/llvm/test/CodeGen/ARM/funnel-shift.ll index 191155ae30f3e..77bed94918f2a 100644 --- a/llvm/test/CodeGen/ARM/funnel-shift.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift.ll @@ -47,67 +47,77 @@ declare i37 @llvm.fshl.i37(i37, i37, i37) define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) { ; SCALAR-LABEL: fshl_i37: ; SCALAR: @ %bb.0: -; SCALAR-NEXT: .save 
{r4, r5, r6, r7, r8, lr} -; SCALAR-NEXT: push {r4, r5, r6, r7, r8, lr} -; SCALAR-NEXT: mov r8, r0 -; SCALAR-NEXT: ldr r0, [sp, #28] -; SCALAR-NEXT: mov r4, r1 -; SCALAR-NEXT: mov r5, r3 -; SCALAR-NEXT: and r1, r0, #31 -; SCALAR-NEXT: ldr r0, [sp, #24] -; SCALAR-NEXT: mov r6, r2 -; SCALAR-NEXT: mov r2, #37 -; SCALAR-NEXT: mov r3, #0 -; SCALAR-NEXT: bl __aeabi_uldivmod -; SCALAR-NEXT: lsl r0, r5, #27 -; SCALAR-NEXT: tst r2, #32 -; SCALAR-NEXT: orr r0, r0, r6, lsr #5 -; SCALAR-NEXT: mov r1, r8 -; SCALAR-NEXT: and r3, r2, #31 -; SCALAR-NEXT: mov r7, #31 +; SCALAR-NEXT: .save {r4, r5, r6, r7, r11, lr} +; SCALAR-NEXT: push {r4, r5, r6, r7, r11, lr} +; SCALAR-NEXT: ldr lr, [sp, #24] +; SCALAR-NEXT: movw r12, #46053 +; SCALAR-NEXT: movt r12, #12398 +; SCALAR-NEXT: movw r6, #15941 +; SCALAR-NEXT: ldr r7, [sp, #28] +; SCALAR-NEXT: movt r6, #1771 +; SCALAR-NEXT: umull r4, r5, lr, r12 +; SCALAR-NEXT: lsl r3, r3, #27 +; SCALAR-NEXT: mov r4, #0 +; SCALAR-NEXT: and r7, r7, #31 +; SCALAR-NEXT: umlal r5, r4, lr, r6 +; SCALAR-NEXT: orr r3, r3, r2, lsr #5 +; SCALAR-NEXT: umlal r5, r4, r7, r12 +; SCALAR-NEXT: mla r7, r7, r6, r4 +; SCALAR-NEXT: mov r6, #37 +; SCALAR-NEXT: mls r7, r7, r6, lr +; SCALAR-NEXT: mov r6, r0 +; SCALAR-NEXT: tst r7, #32 +; SCALAR-NEXT: and r5, r7, #31 +; SCALAR-NEXT: movne r6, r3 +; SCALAR-NEXT: lslne r3, r2, #27 +; SCALAR-NEXT: lsr r2, r3, #1 +; SCALAR-NEXT: mov r3, #31 +; SCALAR-NEXT: bic r3, r3, r7 ; SCALAR-NEXT: movne r1, r0 -; SCALAR-NEXT: lslne r0, r6, #27 -; SCALAR-NEXT: bic r2, r7, r2 -; SCALAR-NEXT: lsl r5, r1, r3 -; SCALAR-NEXT: lsr r0, r0, #1 -; SCALAR-NEXT: movne r4, r8 -; SCALAR-NEXT: lsr r1, r1, #1 -; SCALAR-NEXT: lsl r3, r4, r3 -; SCALAR-NEXT: orr r0, r5, r0, lsr r2 -; SCALAR-NEXT: orr r1, r3, r1, lsr r2 -; SCALAR-NEXT: pop {r4, r5, r6, r7, r8, pc} +; SCALAR-NEXT: lsl r4, r6, r5 +; SCALAR-NEXT: lsl r0, r1, r5 +; SCALAR-NEXT: lsr r1, r6, #1 +; SCALAR-NEXT: orr r2, r4, r2, lsr r3 +; SCALAR-NEXT: orr r1, r0, r1, lsr r3 +; SCALAR-NEXT: mov r0, r2 +; SCALAR-NEXT: pop {r4, r5, r6, r7, r11, pc} ; ; NEON-LABEL: fshl_i37: ; NEON: @ %bb.0: -; NEON-NEXT: .save {r4, r5, r6, r7, r8, lr} -; NEON-NEXT: push {r4, r5, r6, r7, r8, lr} -; NEON-NEXT: mov r4, r1 -; NEON-NEXT: ldr r1, [sp, #28] -; NEON-NEXT: mov r8, r0 -; NEON-NEXT: ldr r0, [sp, #24] -; NEON-NEXT: and r1, r1, #31 -; NEON-NEXT: mov r5, r3 -; NEON-NEXT: mov r6, r2 -; NEON-NEXT: mov r2, #37 -; NEON-NEXT: mov r3, #0 -; NEON-NEXT: bl __aeabi_uldivmod -; NEON-NEXT: lsl r0, r5, #27 -; NEON-NEXT: tst r2, #32 -; NEON-NEXT: orr r0, r0, r6, lsr #5 -; NEON-NEXT: mov r1, r8 -; NEON-NEXT: and r3, r2, #31 -; NEON-NEXT: mov r7, #31 +; NEON-NEXT: .save {r4, r5, r6, r7, r11, lr} +; NEON-NEXT: push {r4, r5, r6, r7, r11, lr} +; NEON-NEXT: ldr r12, [sp, #24] +; NEON-NEXT: movw lr, #46053 +; NEON-NEXT: movt lr, #12398 +; NEON-NEXT: ldr r6, [sp, #28] +; NEON-NEXT: mov r7, #0 +; NEON-NEXT: lsl r3, r3, #27 +; NEON-NEXT: umull r4, r5, r12, lr +; NEON-NEXT: and r6, r6, #31 +; NEON-NEXT: movw r4, #15941 +; NEON-NEXT: movt r4, #1771 +; NEON-NEXT: umlal r5, r7, r12, r4 +; NEON-NEXT: orr r3, r3, r2, lsr #5 +; NEON-NEXT: umlal r5, r7, r6, lr +; NEON-NEXT: mla r7, r6, r4, r7 +; NEON-NEXT: mov r6, #37 +; NEON-NEXT: mls r7, r7, r6, r12 +; NEON-NEXT: mov r6, r0 +; NEON-NEXT: tst r7, #32 +; NEON-NEXT: and r5, r7, #31 +; NEON-NEXT: movne r6, r3 +; NEON-NEXT: lslne r3, r2, #27 +; NEON-NEXT: lsr r2, r3, #1 +; NEON-NEXT: mov r3, #31 +; NEON-NEXT: bic r3, r3, r7 ; NEON-NEXT: movne r1, r0 -; NEON-NEXT: lslne r0, r6, #27 -; NEON-NEXT: bic r2, r7, r2 -; NEON-NEXT: lsl 
r5, r1, r3 -; NEON-NEXT: lsr r0, r0, #1 -; NEON-NEXT: movne r4, r8 -; NEON-NEXT: lsr r1, r1, #1 -; NEON-NEXT: lsl r3, r4, r3 -; NEON-NEXT: orr r0, r5, r0, lsr r2 -; NEON-NEXT: orr r1, r3, r1, lsr r2 -; NEON-NEXT: pop {r4, r5, r6, r7, r8, pc} +; NEON-NEXT: lsl r4, r6, r5 +; NEON-NEXT: lsl r0, r1, r5 +; NEON-NEXT: lsr r1, r6, #1 +; NEON-NEXT: orr r2, r4, r2, lsr r3 +; NEON-NEXT: orr r1, r0, r1, lsr r3 +; NEON-NEXT: mov r0, r2 +; NEON-NEXT: pop {r4, r5, r6, r7, r11, pc} %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z) ret i37 %f } @@ -237,66 +247,76 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) { ; SCALAR: @ %bb.0: ; SCALAR-NEXT: .save {r4, r5, r6, r7, r11, lr} ; SCALAR-NEXT: push {r4, r5, r6, r7, r11, lr} -; SCALAR-NEXT: mov r5, r0 -; SCALAR-NEXT: ldr r0, [sp, #28] -; SCALAR-NEXT: mov r4, r1 -; SCALAR-NEXT: mov r6, r3 -; SCALAR-NEXT: and r1, r0, #31 -; SCALAR-NEXT: ldr r0, [sp, #24] -; SCALAR-NEXT: mov r7, r2 -; SCALAR-NEXT: mov r2, #37 -; SCALAR-NEXT: mov r3, #0 -; SCALAR-NEXT: bl __aeabi_uldivmod -; SCALAR-NEXT: add r0, r2, #27 -; SCALAR-NEXT: lsl r2, r6, #27 -; SCALAR-NEXT: orr r2, r2, r7, lsr #5 -; SCALAR-NEXT: mov r1, #31 -; SCALAR-NEXT: tst r0, #32 -; SCALAR-NEXT: mov r3, r5 -; SCALAR-NEXT: moveq r3, r2 -; SCALAR-NEXT: lsleq r2, r7, #27 -; SCALAR-NEXT: bic r1, r1, r0 -; SCALAR-NEXT: and r7, r0, #31 -; SCALAR-NEXT: lsl r6, r3, #1 -; SCALAR-NEXT: moveq r4, r5 -; SCALAR-NEXT: lsl r6, r6, r1 -; SCALAR-NEXT: orr r0, r6, r2, lsr r7 -; SCALAR-NEXT: lsl r2, r4, #1 -; SCALAR-NEXT: lsl r1, r2, r1 -; SCALAR-NEXT: orr r1, r1, r3, lsr r7 +; SCALAR-NEXT: ldr lr, [sp, #24] +; SCALAR-NEXT: movw r12, #46053 +; SCALAR-NEXT: movt r12, #12398 +; SCALAR-NEXT: movw r6, #15941 +; SCALAR-NEXT: ldr r7, [sp, #28] +; SCALAR-NEXT: movt r6, #1771 +; SCALAR-NEXT: umull r4, r5, lr, r12 +; SCALAR-NEXT: lsl r3, r3, #27 +; SCALAR-NEXT: mov r4, #0 +; SCALAR-NEXT: and r7, r7, #31 +; SCALAR-NEXT: umlal r5, r4, lr, r6 +; SCALAR-NEXT: orr r3, r3, r2, lsr #5 +; SCALAR-NEXT: umlal r5, r4, r7, r12 +; SCALAR-NEXT: mov r5, #31 +; SCALAR-NEXT: mla r7, r7, r6, r4 +; SCALAR-NEXT: mov r6, #37 +; SCALAR-NEXT: mls r7, r7, r6, lr +; SCALAR-NEXT: mov r6, r0 +; SCALAR-NEXT: add r7, r7, #27 +; SCALAR-NEXT: tst r7, #32 +; SCALAR-NEXT: bic r5, r5, r7 +; SCALAR-NEXT: moveq r6, r3 +; SCALAR-NEXT: lsleq r3, r2, #27 +; SCALAR-NEXT: lsl r2, r6, #1 +; SCALAR-NEXT: and r7, r7, #31 +; SCALAR-NEXT: lsl r2, r2, r5 +; SCALAR-NEXT: moveq r1, r0 +; SCALAR-NEXT: lsl r0, r1, #1 +; SCALAR-NEXT: orr r2, r2, r3, lsr r7 +; SCALAR-NEXT: lsl r0, r0, r5 +; SCALAR-NEXT: orr r1, r0, r6, lsr r7 +; SCALAR-NEXT: mov r0, r2 ; SCALAR-NEXT: pop {r4, r5, r6, r7, r11, pc} ; ; NEON-LABEL: fshr_i37: ; NEON: @ %bb.0: ; NEON-NEXT: .save {r4, r5, r6, r7, r11, lr} ; NEON-NEXT: push {r4, r5, r6, r7, r11, lr} -; NEON-NEXT: mov r4, r1 -; NEON-NEXT: ldr r1, [sp, #28] -; NEON-NEXT: mov r5, r0 -; NEON-NEXT: ldr r0, [sp, #24] -; NEON-NEXT: and r1, r1, #31 -; NEON-NEXT: mov r6, r3 -; NEON-NEXT: mov r7, r2 -; NEON-NEXT: mov r2, #37 -; NEON-NEXT: mov r3, #0 -; NEON-NEXT: bl __aeabi_uldivmod -; NEON-NEXT: add r0, r2, #27 -; NEON-NEXT: lsl r2, r6, #27 -; NEON-NEXT: orr r2, r2, r7, lsr #5 -; NEON-NEXT: mov r1, #31 -; NEON-NEXT: tst r0, #32 -; NEON-NEXT: mov r3, r5 -; NEON-NEXT: moveq r3, r2 -; NEON-NEXT: lsleq r2, r7, #27 -; NEON-NEXT: bic r1, r1, r0 -; NEON-NEXT: and r7, r0, #31 -; NEON-NEXT: lsl r6, r3, #1 -; NEON-NEXT: moveq r4, r5 -; NEON-NEXT: lsl r6, r6, r1 -; NEON-NEXT: orr r0, r6, r2, lsr r7 -; NEON-NEXT: lsl r2, r4, #1 -; NEON-NEXT: lsl r1, r2, r1 -; NEON-NEXT: orr r1, r1, 
r3, lsr r7 +; NEON-NEXT: ldr r12, [sp, #24] +; NEON-NEXT: movw lr, #46053 +; NEON-NEXT: movt lr, #12398 +; NEON-NEXT: ldr r6, [sp, #28] +; NEON-NEXT: mov r7, #0 +; NEON-NEXT: lsl r3, r3, #27 +; NEON-NEXT: umull r4, r5, r12, lr +; NEON-NEXT: and r6, r6, #31 +; NEON-NEXT: movw r4, #15941 +; NEON-NEXT: movt r4, #1771 +; NEON-NEXT: umlal r5, r7, r12, r4 +; NEON-NEXT: orr r3, r3, r2, lsr #5 +; NEON-NEXT: umlal r5, r7, r6, lr +; NEON-NEXT: mov r5, #31 +; NEON-NEXT: mla r7, r6, r4, r7 +; NEON-NEXT: mov r6, #37 +; NEON-NEXT: mls r7, r7, r6, r12 +; NEON-NEXT: mov r6, r0 +; NEON-NEXT: add r7, r7, #27 +; NEON-NEXT: tst r7, #32 +; NEON-NEXT: bic r5, r5, r7 +; NEON-NEXT: moveq r6, r3 +; NEON-NEXT: lsleq r3, r2, #27 +; NEON-NEXT: lsl r2, r6, #1 +; NEON-NEXT: and r7, r7, #31 +; NEON-NEXT: lsl r2, r2, r5 +; NEON-NEXT: moveq r1, r0 +; NEON-NEXT: lsl r0, r1, #1 +; NEON-NEXT: orr r2, r2, r3, lsr r7 +; NEON-NEXT: lsl r0, r0, r5 +; NEON-NEXT: orr r1, r0, r6, lsr r7 +; NEON-NEXT: mov r0, r2 ; NEON-NEXT: pop {r4, r5, r6, r7, r11, pc} %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z) ret i37 %f diff --git a/llvm/test/CodeGen/Mips/funnel-shift.ll b/llvm/test/CodeGen/Mips/funnel-shift.ll index 99e0d47441a02..51e212c8c5ae0 100644 --- a/llvm/test/CodeGen/Mips/funnel-shift.ll +++ b/llvm/test/CodeGen/Mips/funnel-shift.ll @@ -48,105 +48,106 @@ declare i37 @llvm.fshl.i37(i37, i37, i37) define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) { ; CHECK-BE-LABEL: fshl_i37: ; CHECK-BE: # %bb.0: -; CHECK-BE-NEXT: addiu $sp, $sp, -40 -; CHECK-BE-NEXT: .cfi_def_cfa_offset 40 -; CHECK-BE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill -; CHECK-BE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill -; CHECK-BE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill -; CHECK-BE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill -; CHECK-BE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill -; CHECK-BE-NEXT: .cfi_offset 31, -4 -; CHECK-BE-NEXT: .cfi_offset 19, -8 -; CHECK-BE-NEXT: .cfi_offset 18, -12 -; CHECK-BE-NEXT: .cfi_offset 17, -16 -; CHECK-BE-NEXT: .cfi_offset 16, -20 -; CHECK-BE-NEXT: move $16, $7 -; CHECK-BE-NEXT: move $17, $6 -; CHECK-BE-NEXT: move $18, $5 -; CHECK-BE-NEXT: move $19, $4 -; CHECK-BE-NEXT: lw $1, 56($sp) -; CHECK-BE-NEXT: andi $4, $1, 31 -; CHECK-BE-NEXT: lw $5, 60($sp) -; CHECK-BE-NEXT: addiu $6, $zero, 0 -; CHECK-BE-NEXT: jal __umoddi3 -; CHECK-BE-NEXT: addiu $7, $zero, 37 -; CHECK-BE-NEXT: srl $1, $3, 5 -; CHECK-BE-NEXT: andi $1, $1, 1 -; CHECK-BE-NEXT: movn $19, $18, $1 -; CHECK-BE-NEXT: sllv $2, $19, $3 -; CHECK-BE-NEXT: not $4, $3 -; CHECK-BE-NEXT: srl $5, $16, 5 -; CHECK-BE-NEXT: sll $6, $17, 27 -; CHECK-BE-NEXT: or $5, $6, $5 -; CHECK-BE-NEXT: movn $18, $5, $1 -; CHECK-BE-NEXT: srl $6, $18, 1 -; CHECK-BE-NEXT: srlv $6, $6, $4 -; CHECK-BE-NEXT: or $2, $2, $6 -; CHECK-BE-NEXT: sllv $3, $18, $3 -; CHECK-BE-NEXT: sll $6, $16, 27 -; CHECK-BE-NEXT: movn $5, $6, $1 -; CHECK-BE-NEXT: srl $1, $5, 1 -; CHECK-BE-NEXT: srlv $1, $1, $4 -; CHECK-BE-NEXT: or $3, $3, $1 -; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload -; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload -; CHECK-BE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload -; CHECK-BE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload -; CHECK-BE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; CHECK-BE-NEXT: lui $1, 1771 +; CHECK-BE-NEXT: ori $1, $1, 15941 +; CHECK-BE-NEXT: lw $2, 20($sp) +; CHECK-BE-NEXT: multu $2, $1 +; CHECK-BE-NEXT: mfhi $3 +; CHECK-BE-NEXT: mflo $8 +; CHECK-BE-NEXT: lui $9, 12398 +; CHECK-BE-NEXT: ori $9, $9, 46053 +; CHECK-BE-NEXT: multu $2, $9 +; CHECK-BE-NEXT: mfhi $10 +; 
CHECK-BE-NEXT: lw $11, 16($sp) +; CHECK-BE-NEXT: andi $11, $11, 31 +; CHECK-BE-NEXT: multu $11, $9 +; CHECK-BE-NEXT: mflo $9 +; CHECK-BE-NEXT: mfhi $12 +; CHECK-BE-NEXT: addu $8, $10, $8 +; CHECK-BE-NEXT: sltu $10, $8, $10 +; CHECK-BE-NEXT: addu $9, $8, $9 +; CHECK-BE-NEXT: sltu $8, $9, $8 +; CHECK-BE-NEXT: addu $3, $3, $10 +; CHECK-BE-NEXT: srl $9, $7, 5 +; CHECK-BE-NEXT: sll $6, $6, 27 +; CHECK-BE-NEXT: or $6, $6, $9 +; CHECK-BE-NEXT: addu $3, $3, $12 +; CHECK-BE-NEXT: sll $7, $7, 27 +; CHECK-BE-NEXT: addu $3, $3, $8 +; CHECK-BE-NEXT: mul $1, $11, $1 +; CHECK-BE-NEXT: addu $1, $3, $1 +; CHECK-BE-NEXT: sll $3, $1, 2 +; CHECK-BE-NEXT: addu $3, $3, $1 +; CHECK-BE-NEXT: sll $1, $1, 5 +; CHECK-BE-NEXT: addu $1, $1, $3 +; CHECK-BE-NEXT: subu $1, $2, $1 +; CHECK-BE-NEXT: andi $2, $1, 32 +; CHECK-BE-NEXT: srl $3, $2, 5 +; CHECK-BE-NEXT: movn $4, $5, $3 +; CHECK-BE-NEXT: sllv $2, $4, $1 +; CHECK-BE-NEXT: not $4, $1 +; CHECK-BE-NEXT: movn $5, $6, $3 +; CHECK-BE-NEXT: srl $8, $5, 1 +; CHECK-BE-NEXT: srlv $8, $8, $4 +; CHECK-BE-NEXT: or $2, $2, $8 +; CHECK-BE-NEXT: sllv $1, $5, $1 +; CHECK-BE-NEXT: movn $6, $7, $3 +; CHECK-BE-NEXT: srl $3, $6, 1 +; CHECK-BE-NEXT: srlv $3, $3, $4 ; CHECK-BE-NEXT: jr $ra -; CHECK-BE-NEXT: addiu $sp, $sp, 40 +; CHECK-BE-NEXT: or $3, $1, $3 ; ; CHECK-LE-LABEL: fshl_i37: ; CHECK-LE: # %bb.0: -; CHECK-LE-NEXT: addiu $sp, $sp, -40 -; CHECK-LE-NEXT: .cfi_def_cfa_offset 40 -; CHECK-LE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill -; CHECK-LE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill -; CHECK-LE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill -; CHECK-LE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill -; CHECK-LE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill -; CHECK-LE-NEXT: .cfi_offset 31, -4 -; CHECK-LE-NEXT: .cfi_offset 19, -8 -; CHECK-LE-NEXT: .cfi_offset 18, -12 -; CHECK-LE-NEXT: .cfi_offset 17, -16 -; CHECK-LE-NEXT: .cfi_offset 16, -20 -; CHECK-LE-NEXT: move $16, $7 -; CHECK-LE-NEXT: move $17, $6 -; CHECK-LE-NEXT: move $18, $5 -; CHECK-LE-NEXT: move $19, $4 -; CHECK-LE-NEXT: lw $1, 60($sp) -; CHECK-LE-NEXT: andi $5, $1, 31 -; CHECK-LE-NEXT: lw $4, 56($sp) -; CHECK-LE-NEXT: addiu $6, $zero, 37 -; CHECK-LE-NEXT: jal __umoddi3 -; CHECK-LE-NEXT: addiu $7, $zero, 0 -; CHECK-LE-NEXT: srl $1, $2, 5 -; CHECK-LE-NEXT: andi $3, $1, 1 -; CHECK-LE-NEXT: srl $1, $17, 5 -; CHECK-LE-NEXT: sll $4, $16, 27 -; CHECK-LE-NEXT: or $1, $4, $1 -; CHECK-LE-NEXT: move $4, $19 -; CHECK-LE-NEXT: movn $4, $1, $3 -; CHECK-LE-NEXT: sllv $5, $4, $2 -; CHECK-LE-NEXT: not $6, $2 -; CHECK-LE-NEXT: sll $7, $17, 27 -; CHECK-LE-NEXT: movn $1, $7, $3 -; CHECK-LE-NEXT: srl $1, $1, 1 -; CHECK-LE-NEXT: srlv $1, $1, $6 -; CHECK-LE-NEXT: or $1, $5, $1 -; CHECK-LE-NEXT: movn $18, $19, $3 -; CHECK-LE-NEXT: sllv $2, $18, $2 -; CHECK-LE-NEXT: srl $3, $4, 1 -; CHECK-LE-NEXT: srlv $3, $3, $6 -; CHECK-LE-NEXT: or $3, $2, $3 -; CHECK-LE-NEXT: move $2, $1 -; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload -; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload -; CHECK-LE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload -; CHECK-LE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload -; CHECK-LE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; CHECK-LE-NEXT: lui $1, 1771 +; CHECK-LE-NEXT: ori $1, $1, 15941 +; CHECK-LE-NEXT: lw $2, 16($sp) +; CHECK-LE-NEXT: multu $2, $1 +; CHECK-LE-NEXT: mfhi $3 +; CHECK-LE-NEXT: mflo $8 +; CHECK-LE-NEXT: lui $9, 12398 +; CHECK-LE-NEXT: ori $9, $9, 46053 +; CHECK-LE-NEXT: multu $2, $9 +; CHECK-LE-NEXT: mfhi $10 +; CHECK-LE-NEXT: lw $11, 20($sp) +; CHECK-LE-NEXT: andi $11, $11, 31 +; CHECK-LE-NEXT: multu 
$11, $9 +; CHECK-LE-NEXT: mflo $9 +; CHECK-LE-NEXT: mfhi $12 +; CHECK-LE-NEXT: addu $8, $10, $8 +; CHECK-LE-NEXT: sltu $10, $8, $10 +; CHECK-LE-NEXT: addu $9, $8, $9 +; CHECK-LE-NEXT: sltu $8, $9, $8 +; CHECK-LE-NEXT: addu $3, $3, $10 +; CHECK-LE-NEXT: srl $9, $6, 5 +; CHECK-LE-NEXT: sll $7, $7, 27 +; CHECK-LE-NEXT: or $7, $7, $9 +; CHECK-LE-NEXT: sll $6, $6, 27 +; CHECK-LE-NEXT: addu $3, $3, $12 +; CHECK-LE-NEXT: addu $3, $3, $8 +; CHECK-LE-NEXT: mul $1, $11, $1 +; CHECK-LE-NEXT: addu $1, $3, $1 +; CHECK-LE-NEXT: sll $3, $1, 2 +; CHECK-LE-NEXT: addu $3, $3, $1 +; CHECK-LE-NEXT: sll $1, $1, 5 +; CHECK-LE-NEXT: addu $1, $1, $3 +; CHECK-LE-NEXT: subu $1, $2, $1 +; CHECK-LE-NEXT: andi $2, $1, 32 +; CHECK-LE-NEXT: srl $3, $2, 5 +; CHECK-LE-NEXT: move $8, $4 +; CHECK-LE-NEXT: movn $8, $7, $3 +; CHECK-LE-NEXT: sllv $2, $8, $1 +; CHECK-LE-NEXT: not $9, $1 +; CHECK-LE-NEXT: movn $7, $6, $3 +; CHECK-LE-NEXT: srl $6, $7, 1 +; CHECK-LE-NEXT: srlv $6, $6, $9 +; CHECK-LE-NEXT: or $2, $2, $6 +; CHECK-LE-NEXT: movn $5, $4, $3 +; CHECK-LE-NEXT: sllv $1, $5, $1 +; CHECK-LE-NEXT: srl $3, $8, 1 +; CHECK-LE-NEXT: srlv $3, $3, $9 ; CHECK-LE-NEXT: jr $ra -; CHECK-LE-NEXT: addiu $sp, $sp, 40 +; CHECK-LE-NEXT: or $3, $1, $3 %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z) ret i37 %f } @@ -288,104 +289,106 @@ declare i37 @llvm.fshr.i37(i37, i37, i37) define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) { ; CHECK-BE-LABEL: fshr_i37: ; CHECK-BE: # %bb.0: -; CHECK-BE-NEXT: addiu $sp, $sp, -40 -; CHECK-BE-NEXT: .cfi_def_cfa_offset 40 -; CHECK-BE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill -; CHECK-BE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill -; CHECK-BE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill -; CHECK-BE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill -; CHECK-BE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill -; CHECK-BE-NEXT: .cfi_offset 31, -4 -; CHECK-BE-NEXT: .cfi_offset 19, -8 -; CHECK-BE-NEXT: .cfi_offset 18, -12 -; CHECK-BE-NEXT: .cfi_offset 17, -16 -; CHECK-BE-NEXT: .cfi_offset 16, -20 -; CHECK-BE-NEXT: move $16, $7 -; CHECK-BE-NEXT: move $17, $6 -; CHECK-BE-NEXT: move $18, $5 -; CHECK-BE-NEXT: move $19, $4 -; CHECK-BE-NEXT: lw $1, 56($sp) -; CHECK-BE-NEXT: andi $4, $1, 31 -; CHECK-BE-NEXT: lw $5, 60($sp) -; CHECK-BE-NEXT: addiu $6, $zero, 0 -; CHECK-BE-NEXT: jal __umoddi3 -; CHECK-BE-NEXT: addiu $7, $zero, 37 -; CHECK-BE-NEXT: addiu $1, $3, 27 +; CHECK-BE-NEXT: lui $1, 1771 +; CHECK-BE-NEXT: ori $1, $1, 15941 +; CHECK-BE-NEXT: lw $2, 20($sp) +; CHECK-BE-NEXT: multu $2, $1 +; CHECK-BE-NEXT: mfhi $3 +; CHECK-BE-NEXT: mflo $8 +; CHECK-BE-NEXT: lui $9, 12398 +; CHECK-BE-NEXT: ori $9, $9, 46053 +; CHECK-BE-NEXT: multu $2, $9 +; CHECK-BE-NEXT: mfhi $10 +; CHECK-BE-NEXT: lw $11, 16($sp) +; CHECK-BE-NEXT: andi $11, $11, 31 +; CHECK-BE-NEXT: multu $11, $9 +; CHECK-BE-NEXT: mflo $9 +; CHECK-BE-NEXT: mfhi $12 +; CHECK-BE-NEXT: addu $8, $10, $8 +; CHECK-BE-NEXT: sltu $10, $8, $10 +; CHECK-BE-NEXT: addu $9, $8, $9 +; CHECK-BE-NEXT: sltu $8, $9, $8 +; CHECK-BE-NEXT: addu $3, $3, $10 +; CHECK-BE-NEXT: srl $9, $7, 5 +; CHECK-BE-NEXT: sll $6, $6, 27 +; CHECK-BE-NEXT: or $6, $6, $9 +; CHECK-BE-NEXT: sll $7, $7, 27 +; CHECK-BE-NEXT: addu $3, $3, $12 +; CHECK-BE-NEXT: addu $3, $3, $8 +; CHECK-BE-NEXT: mul $1, $11, $1 +; CHECK-BE-NEXT: addu $1, $3, $1 +; CHECK-BE-NEXT: sll $3, $1, 2 +; CHECK-BE-NEXT: addu $3, $3, $1 +; CHECK-BE-NEXT: sll $1, $1, 5 +; CHECK-BE-NEXT: addu $1, $1, $3 +; CHECK-BE-NEXT: subu $1, $2, $1 +; CHECK-BE-NEXT: addiu $1, $1, 27 ; CHECK-BE-NEXT: andi $3, $1, 32 -; CHECK-BE-NEXT: srl $2, $16, 5 -; CHECK-BE-NEXT: sll 
$4, $17, 27 -; CHECK-BE-NEXT: or $4, $4, $2 -; CHECK-BE-NEXT: movz $19, $18, $3 -; CHECK-BE-NEXT: movz $18, $4, $3 -; CHECK-BE-NEXT: srlv $2, $18, $1 -; CHECK-BE-NEXT: not $5, $1 -; CHECK-BE-NEXT: sll $6, $19, 1 -; CHECK-BE-NEXT: sllv $6, $6, $5 -; CHECK-BE-NEXT: sll $7, $16, 27 -; CHECK-BE-NEXT: or $2, $6, $2 -; CHECK-BE-NEXT: movz $4, $7, $3 -; CHECK-BE-NEXT: srlv $1, $4, $1 -; CHECK-BE-NEXT: sll $3, $18, 1 -; CHECK-BE-NEXT: sllv $3, $3, $5 -; CHECK-BE-NEXT: or $3, $3, $1 -; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload -; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload -; CHECK-BE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload -; CHECK-BE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload -; CHECK-BE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; CHECK-BE-NEXT: movz $4, $5, $3 +; CHECK-BE-NEXT: movz $5, $6, $3 +; CHECK-BE-NEXT: srlv $2, $5, $1 +; CHECK-BE-NEXT: not $8, $1 +; CHECK-BE-NEXT: sll $4, $4, 1 +; CHECK-BE-NEXT: sllv $4, $4, $8 +; CHECK-BE-NEXT: or $2, $4, $2 +; CHECK-BE-NEXT: movz $6, $7, $3 +; CHECK-BE-NEXT: srlv $1, $6, $1 +; CHECK-BE-NEXT: sll $3, $5, 1 +; CHECK-BE-NEXT: sllv $3, $3, $8 ; CHECK-BE-NEXT: jr $ra -; CHECK-BE-NEXT: addiu $sp, $sp, 40 +; CHECK-BE-NEXT: or $3, $3, $1 ; ; CHECK-LE-LABEL: fshr_i37: ; CHECK-LE: # %bb.0: -; CHECK-LE-NEXT: addiu $sp, $sp, -40 -; CHECK-LE-NEXT: .cfi_def_cfa_offset 40 -; CHECK-LE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill -; CHECK-LE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill -; CHECK-LE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill -; CHECK-LE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill -; CHECK-LE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill -; CHECK-LE-NEXT: .cfi_offset 31, -4 -; CHECK-LE-NEXT: .cfi_offset 19, -8 -; CHECK-LE-NEXT: .cfi_offset 18, -12 -; CHECK-LE-NEXT: .cfi_offset 17, -16 -; CHECK-LE-NEXT: .cfi_offset 16, -20 -; CHECK-LE-NEXT: move $16, $7 -; CHECK-LE-NEXT: move $17, $6 -; CHECK-LE-NEXT: move $18, $5 -; CHECK-LE-NEXT: move $19, $4 -; CHECK-LE-NEXT: lw $1, 60($sp) -; CHECK-LE-NEXT: andi $5, $1, 31 -; CHECK-LE-NEXT: lw $4, 56($sp) -; CHECK-LE-NEXT: addiu $6, $zero, 37 -; CHECK-LE-NEXT: jal __umoddi3 -; CHECK-LE-NEXT: addiu $7, $zero, 0 -; CHECK-LE-NEXT: addiu $1, $2, 27 +; CHECK-LE-NEXT: lui $1, 1771 +; CHECK-LE-NEXT: ori $1, $1, 15941 +; CHECK-LE-NEXT: lw $2, 16($sp) +; CHECK-LE-NEXT: multu $2, $1 +; CHECK-LE-NEXT: mfhi $3 +; CHECK-LE-NEXT: mflo $8 +; CHECK-LE-NEXT: lui $9, 12398 +; CHECK-LE-NEXT: ori $9, $9, 46053 +; CHECK-LE-NEXT: multu $2, $9 +; CHECK-LE-NEXT: mfhi $10 +; CHECK-LE-NEXT: lw $11, 20($sp) +; CHECK-LE-NEXT: andi $11, $11, 31 +; CHECK-LE-NEXT: multu $11, $9 +; CHECK-LE-NEXT: mflo $9 +; CHECK-LE-NEXT: mfhi $12 +; CHECK-LE-NEXT: addu $8, $10, $8 +; CHECK-LE-NEXT: sltu $10, $8, $10 +; CHECK-LE-NEXT: addu $9, $8, $9 +; CHECK-LE-NEXT: sltu $8, $9, $8 +; CHECK-LE-NEXT: addu $3, $3, $10 +; CHECK-LE-NEXT: srl $9, $6, 5 +; CHECK-LE-NEXT: sll $7, $7, 27 +; CHECK-LE-NEXT: or $7, $7, $9 +; CHECK-LE-NEXT: sll $6, $6, 27 +; CHECK-LE-NEXT: addu $3, $3, $12 +; CHECK-LE-NEXT: addu $3, $3, $8 +; CHECK-LE-NEXT: mul $1, $11, $1 +; CHECK-LE-NEXT: addu $1, $3, $1 +; CHECK-LE-NEXT: sll $3, $1, 2 +; CHECK-LE-NEXT: addu $3, $3, $1 +; CHECK-LE-NEXT: sll $1, $1, 5 +; CHECK-LE-NEXT: addu $1, $1, $3 +; CHECK-LE-NEXT: subu $1, $2, $1 +; CHECK-LE-NEXT: addiu $1, $1, 27 ; CHECK-LE-NEXT: andi $3, $1, 32 -; CHECK-LE-NEXT: srl $2, $17, 5 -; CHECK-LE-NEXT: sll $4, $16, 27 -; CHECK-LE-NEXT: or $2, $4, $2 -; CHECK-LE-NEXT: sll $4, $17, 27 -; CHECK-LE-NEXT: move $5, $19 -; CHECK-LE-NEXT: movz $5, $2, $3 -; CHECK-LE-NEXT: movz $2, $4, $3 
-; CHECK-LE-NEXT: srlv $2, $2, $1 -; CHECK-LE-NEXT: not $4, $1 -; CHECK-LE-NEXT: sll $6, $5, 1 -; CHECK-LE-NEXT: sllv $6, $6, $4 -; CHECK-LE-NEXT: or $2, $6, $2 -; CHECK-LE-NEXT: srlv $1, $5, $1 -; CHECK-LE-NEXT: movz $18, $19, $3 -; CHECK-LE-NEXT: sll $3, $18, 1 -; CHECK-LE-NEXT: sllv $3, $3, $4 -; CHECK-LE-NEXT: or $3, $3, $1 -; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload -; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload -; CHECK-LE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload -; CHECK-LE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload -; CHECK-LE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; CHECK-LE-NEXT: move $8, $4 +; CHECK-LE-NEXT: movz $8, $7, $3 +; CHECK-LE-NEXT: movz $7, $6, $3 +; CHECK-LE-NEXT: srlv $2, $7, $1 +; CHECK-LE-NEXT: not $6, $1 +; CHECK-LE-NEXT: sll $7, $8, 1 +; CHECK-LE-NEXT: sllv $7, $7, $6 +; CHECK-LE-NEXT: or $2, $7, $2 +; CHECK-LE-NEXT: srlv $1, $8, $1 +; CHECK-LE-NEXT: movz $5, $4, $3 +; CHECK-LE-NEXT: sll $3, $5, 1 +; CHECK-LE-NEXT: sllv $3, $3, $6 ; CHECK-LE-NEXT: jr $ra -; CHECK-LE-NEXT: addiu $sp, $sp, 40 +; CHECK-LE-NEXT: or $3, $3, $1 %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z) ret i37 %f } diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift.ll b/llvm/test/CodeGen/PowerPC/funnel-shift.ll index be95233656f47..952fede1d9b8d 100644 --- a/llvm/test/CodeGen/PowerPC/funnel-shift.ll +++ b/llvm/test/CodeGen/PowerPC/funnel-shift.ll @@ -270,116 +270,94 @@ declare i37 @llvm.fshl.i37(i37, i37, i37) define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) { ; CHECK32_32-LABEL: fshl_i37: ; CHECK32_32: # %bb.0: -; CHECK32_32-NEXT: mflr 0 -; CHECK32_32-NEXT: stwu 1, -32(1) -; CHECK32_32-NEXT: stw 0, 36(1) -; CHECK32_32-NEXT: .cfi_def_cfa_offset 32 -; CHECK32_32-NEXT: .cfi_offset lr, 4 -; CHECK32_32-NEXT: .cfi_offset r27, -20 -; CHECK32_32-NEXT: .cfi_offset r28, -16 -; CHECK32_32-NEXT: .cfi_offset r29, -12 -; CHECK32_32-NEXT: .cfi_offset r30, -8 -; CHECK32_32-NEXT: stw 27, 12(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: mr 27, 5 -; CHECK32_32-NEXT: stw 28, 16(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: mr 28, 3 -; CHECK32_32-NEXT: stw 29, 20(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: mr 29, 4 -; CHECK32_32-NEXT: stw 30, 24(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: mr 30, 6 -; CHECK32_32-NEXT: clrlwi 3, 7, 27 -; CHECK32_32-NEXT: mr 4, 8 -; CHECK32_32-NEXT: li 5, 0 -; CHECK32_32-NEXT: li 6, 37 -; CHECK32_32-NEXT: bl __umoddi3 -; CHECK32_32-NEXT: rotlwi 5, 30, 27 -; CHECK32_32-NEXT: rlwimi 5, 27, 27, 0, 4 -; CHECK32_32-NEXT: andi. 3, 4, 32 -; CHECK32_32-NEXT: mr 6, 5 +; CHECK32_32-NEXT: lis 9, 1771 +; CHECK32_32-NEXT: lis 11, 12398 +; CHECK32_32-NEXT: ori 9, 9, 15941 +; CHECK32_32-NEXT: clrlwi 7, 7, 27 +; CHECK32_32-NEXT: ori 11, 11, 46053 +; CHECK32_32-NEXT: mulhwu 10, 8, 9 +; CHECK32_32-NEXT: mulhwu 12, 7, 11 +; CHECK32_32-NEXT: mullw 0, 8, 9 +; CHECK32_32-NEXT: mullw 9, 7, 9 +; CHECK32_32-NEXT: mullw 7, 7, 11 +; CHECK32_32-NEXT: mulhwu 11, 8, 11 +; CHECK32_32-NEXT: addc 11, 11, 0 +; CHECK32_32-NEXT: addze 10, 10 +; CHECK32_32-NEXT: addc 7, 11, 7 +; CHECK32_32-NEXT: adde 7, 10, 12 +; CHECK32_32-NEXT: add 7, 7, 9 +; CHECK32_32-NEXT: mulli 7, 7, 37 +; CHECK32_32-NEXT: sub 8, 8, 7 +; CHECK32_32-NEXT: andi. 
7, 8, 32 +; CHECK32_32-NEXT: rotlwi 7, 6, 27 +; CHECK32_32-NEXT: rlwimi 7, 5, 27, 0, 4 +; CHECK32_32-NEXT: mr 5, 7 ; CHECK32_32-NEXT: bne 0, .LBB3_2 ; CHECK32_32-NEXT: # %bb.1: -; CHECK32_32-NEXT: mr 6, 29 +; CHECK32_32-NEXT: mr 5, 4 ; CHECK32_32-NEXT: .LBB3_2: -; CHECK32_32-NEXT: clrlwi 4, 4, 27 -; CHECK32_32-NEXT: subfic 7, 4, 32 -; CHECK32_32-NEXT: srw 3, 6, 7 +; CHECK32_32-NEXT: clrlwi 8, 8, 27 +; CHECK32_32-NEXT: subfic 9, 8, 32 +; CHECK32_32-NEXT: srw 10, 5, 9 ; CHECK32_32-NEXT: bne 0, .LBB3_4 ; CHECK32_32-NEXT: # %bb.3: -; CHECK32_32-NEXT: mr 29, 28 +; CHECK32_32-NEXT: mr 4, 3 ; CHECK32_32-NEXT: .LBB3_4: -; CHECK32_32-NEXT: slw 8, 29, 4 -; CHECK32_32-NEXT: or 3, 8, 3 +; CHECK32_32-NEXT: slw 3, 4, 8 +; CHECK32_32-NEXT: or 3, 3, 10 ; CHECK32_32-NEXT: beq 0, .LBB3_6 ; CHECK32_32-NEXT: # %bb.5: -; CHECK32_32-NEXT: slwi 5, 30, 27 +; CHECK32_32-NEXT: slwi 7, 6, 27 ; CHECK32_32-NEXT: .LBB3_6: -; CHECK32_32-NEXT: srw 5, 5, 7 -; CHECK32_32-NEXT: slw 4, 6, 4 -; CHECK32_32-NEXT: or 4, 4, 5 -; CHECK32_32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 28, 16(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 27, 12(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 0, 36(1) -; CHECK32_32-NEXT: addi 1, 1, 32 -; CHECK32_32-NEXT: mtlr 0 +; CHECK32_32-NEXT: srw 4, 7, 9 +; CHECK32_32-NEXT: slw 5, 5, 8 +; CHECK32_32-NEXT: or 4, 5, 4 ; CHECK32_32-NEXT: blr ; ; CHECK32_64-LABEL: fshl_i37: ; CHECK32_64: # %bb.0: -; CHECK32_64-NEXT: mflr 0 -; CHECK32_64-NEXT: stwu 1, -32(1) -; CHECK32_64-NEXT: stw 0, 36(1) -; CHECK32_64-NEXT: .cfi_def_cfa_offset 32 -; CHECK32_64-NEXT: .cfi_offset lr, 4 -; CHECK32_64-NEXT: .cfi_offset r27, -20 -; CHECK32_64-NEXT: .cfi_offset r28, -16 -; CHECK32_64-NEXT: .cfi_offset r29, -12 -; CHECK32_64-NEXT: .cfi_offset r30, -8 -; CHECK32_64-NEXT: stw 27, 12(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: mr 27, 5 -; CHECK32_64-NEXT: li 5, 0 -; CHECK32_64-NEXT: stw 28, 16(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: mr 28, 3 -; CHECK32_64-NEXT: clrlwi 3, 7, 27 -; CHECK32_64-NEXT: stw 29, 20(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: mr 29, 4 -; CHECK32_64-NEXT: mr 4, 8 -; CHECK32_64-NEXT: stw 30, 24(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: mr 30, 6 -; CHECK32_64-NEXT: li 6, 37 -; CHECK32_64-NEXT: bl __umoddi3 -; CHECK32_64-NEXT: rotlwi 5, 30, 27 -; CHECK32_64-NEXT: andi. 3, 4, 32 -; CHECK32_64-NEXT: rlwimi 5, 27, 27, 0, 4 -; CHECK32_64-NEXT: mr 6, 5 +; CHECK32_64-NEXT: lis 9, 1771 +; CHECK32_64-NEXT: lis 12, 12398 +; CHECK32_64-NEXT: ori 9, 9, 15941 +; CHECK32_64-NEXT: clrlwi 7, 7, 27 +; CHECK32_64-NEXT: ori 12, 12, 46053 +; CHECK32_64-NEXT: mulhwu 10, 8, 9 +; CHECK32_64-NEXT: mullw 11, 8, 9 +; CHECK32_64-NEXT: mulhwu 0, 7, 12 +; CHECK32_64-NEXT: mullw 9, 7, 9 +; CHECK32_64-NEXT: mullw 7, 7, 12 +; CHECK32_64-NEXT: mulhwu 12, 8, 12 +; CHECK32_64-NEXT: addc 11, 12, 11 +; CHECK32_64-NEXT: addze 10, 10 +; CHECK32_64-NEXT: addc 7, 11, 7 +; CHECK32_64-NEXT: adde 7, 10, 0 +; CHECK32_64-NEXT: add 7, 7, 9 +; CHECK32_64-NEXT: mulli 7, 7, 37 +; CHECK32_64-NEXT: sub 8, 8, 7 +; CHECK32_64-NEXT: andi. 
7, 8, 32 +; CHECK32_64-NEXT: rotlwi 7, 6, 27 +; CHECK32_64-NEXT: rlwimi 7, 5, 27, 0, 4 +; CHECK32_64-NEXT: mr 5, 7 ; CHECK32_64-NEXT: bne 0, .LBB3_2 ; CHECK32_64-NEXT: # %bb.1: -; CHECK32_64-NEXT: mr 6, 29 +; CHECK32_64-NEXT: mr 5, 4 ; CHECK32_64-NEXT: .LBB3_2: -; CHECK32_64-NEXT: clrlwi 4, 4, 27 -; CHECK32_64-NEXT: subfic 7, 4, 32 -; CHECK32_64-NEXT: srw 3, 6, 7 +; CHECK32_64-NEXT: clrlwi 8, 8, 27 +; CHECK32_64-NEXT: subfic 9, 8, 32 +; CHECK32_64-NEXT: srw 10, 5, 9 ; CHECK32_64-NEXT: bne 0, .LBB3_4 ; CHECK32_64-NEXT: # %bb.3: -; CHECK32_64-NEXT: mr 29, 28 +; CHECK32_64-NEXT: mr 4, 3 ; CHECK32_64-NEXT: .LBB3_4: -; CHECK32_64-NEXT: slw 8, 29, 4 -; CHECK32_64-NEXT: or 3, 8, 3 +; CHECK32_64-NEXT: slw 3, 4, 8 +; CHECK32_64-NEXT: or 3, 3, 10 ; CHECK32_64-NEXT: beq 0, .LBB3_6 ; CHECK32_64-NEXT: # %bb.5: -; CHECK32_64-NEXT: slwi 5, 30, 27 +; CHECK32_64-NEXT: slwi 7, 6, 27 ; CHECK32_64-NEXT: .LBB3_6: -; CHECK32_64-NEXT: srw 5, 5, 7 -; CHECK32_64-NEXT: slw 4, 6, 4 -; CHECK32_64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: or 4, 4, 5 -; CHECK32_64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 28, 16(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 27, 12(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 0, 36(1) -; CHECK32_64-NEXT: addi 1, 1, 32 -; CHECK32_64-NEXT: mtlr 0 +; CHECK32_64-NEXT: srw 4, 7, 9 +; CHECK32_64-NEXT: slw 5, 5, 8 +; CHECK32_64-NEXT: or 4, 5, 4 ; CHECK32_64-NEXT: blr ; ; CHECK64-LABEL: fshl_i37: @@ -536,118 +514,96 @@ declare i37 @llvm.fshr.i37(i37, i37, i37) define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) { ; CHECK32_32-LABEL: fshr_i37: ; CHECK32_32: # %bb.0: -; CHECK32_32-NEXT: mflr 0 -; CHECK32_32-NEXT: stwu 1, -32(1) -; CHECK32_32-NEXT: stw 0, 36(1) -; CHECK32_32-NEXT: .cfi_def_cfa_offset 32 -; CHECK32_32-NEXT: .cfi_offset lr, 4 -; CHECK32_32-NEXT: .cfi_offset r27, -20 -; CHECK32_32-NEXT: .cfi_offset r28, -16 -; CHECK32_32-NEXT: .cfi_offset r29, -12 -; CHECK32_32-NEXT: .cfi_offset r30, -8 -; CHECK32_32-NEXT: stw 27, 12(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: mr 27, 5 -; CHECK32_32-NEXT: stw 28, 16(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: mr 28, 3 -; CHECK32_32-NEXT: stw 29, 20(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: mr 29, 4 -; CHECK32_32-NEXT: stw 30, 24(1) # 4-byte Folded Spill -; CHECK32_32-NEXT: mr 30, 6 -; CHECK32_32-NEXT: clrlwi 3, 7, 27 -; CHECK32_32-NEXT: mr 4, 8 -; CHECK32_32-NEXT: li 5, 0 -; CHECK32_32-NEXT: li 6, 37 -; CHECK32_32-NEXT: bl __umoddi3 -; CHECK32_32-NEXT: rotlwi 5, 30, 27 -; CHECK32_32-NEXT: addi 3, 4, 27 -; CHECK32_32-NEXT: andi. 4, 3, 32 -; CHECK32_32-NEXT: rlwimi 5, 27, 27, 0, 4 -; CHECK32_32-NEXT: mr 4, 5 +; CHECK32_32-NEXT: lis 9, 1771 +; CHECK32_32-NEXT: lis 11, 12398 +; CHECK32_32-NEXT: ori 9, 9, 15941 +; CHECK32_32-NEXT: clrlwi 7, 7, 27 +; CHECK32_32-NEXT: ori 11, 11, 46053 +; CHECK32_32-NEXT: mulhwu 10, 8, 9 +; CHECK32_32-NEXT: mulhwu 12, 7, 11 +; CHECK32_32-NEXT: mullw 0, 8, 9 +; CHECK32_32-NEXT: mullw 9, 7, 9 +; CHECK32_32-NEXT: mullw 7, 7, 11 +; CHECK32_32-NEXT: mulhwu 11, 8, 11 +; CHECK32_32-NEXT: addc 11, 11, 0 +; CHECK32_32-NEXT: addze 10, 10 +; CHECK32_32-NEXT: addc 7, 11, 7 +; CHECK32_32-NEXT: adde 7, 10, 12 +; CHECK32_32-NEXT: add 7, 7, 9 +; CHECK32_32-NEXT: mulli 7, 7, 37 +; CHECK32_32-NEXT: sub 7, 8, 7 +; CHECK32_32-NEXT: addi 8, 7, 27 +; CHECK32_32-NEXT: andi. 
7, 8, 32 +; CHECK32_32-NEXT: rotlwi 7, 6, 27 +; CHECK32_32-NEXT: rlwimi 7, 5, 27, 0, 4 +; CHECK32_32-NEXT: mr 5, 7 ; CHECK32_32-NEXT: beq 0, .LBB11_2 ; CHECK32_32-NEXT: # %bb.1: -; CHECK32_32-NEXT: mr 4, 29 +; CHECK32_32-NEXT: mr 5, 4 ; CHECK32_32-NEXT: .LBB11_2: -; CHECK32_32-NEXT: clrlwi 6, 3, 27 -; CHECK32_32-NEXT: srw 3, 4, 6 +; CHECK32_32-NEXT: clrlwi 8, 8, 27 +; CHECK32_32-NEXT: srw 10, 5, 8 ; CHECK32_32-NEXT: beq 0, .LBB11_4 ; CHECK32_32-NEXT: # %bb.3: -; CHECK32_32-NEXT: mr 29, 28 +; CHECK32_32-NEXT: mr 4, 3 ; CHECK32_32-NEXT: .LBB11_4: -; CHECK32_32-NEXT: subfic 7, 6, 32 -; CHECK32_32-NEXT: slw 8, 29, 7 -; CHECK32_32-NEXT: or 3, 8, 3 +; CHECK32_32-NEXT: subfic 9, 8, 32 +; CHECK32_32-NEXT: slw 3, 4, 9 +; CHECK32_32-NEXT: or 3, 3, 10 ; CHECK32_32-NEXT: bne 0, .LBB11_6 ; CHECK32_32-NEXT: # %bb.5: -; CHECK32_32-NEXT: slwi 5, 30, 27 +; CHECK32_32-NEXT: slwi 7, 6, 27 ; CHECK32_32-NEXT: .LBB11_6: -; CHECK32_32-NEXT: srw 5, 5, 6 -; CHECK32_32-NEXT: slw 4, 4, 7 -; CHECK32_32-NEXT: or 4, 4, 5 -; CHECK32_32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 28, 16(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 27, 12(1) # 4-byte Folded Reload -; CHECK32_32-NEXT: lwz 0, 36(1) -; CHECK32_32-NEXT: addi 1, 1, 32 -; CHECK32_32-NEXT: mtlr 0 +; CHECK32_32-NEXT: srw 4, 7, 8 +; CHECK32_32-NEXT: slw 5, 5, 9 +; CHECK32_32-NEXT: or 4, 5, 4 ; CHECK32_32-NEXT: blr ; ; CHECK32_64-LABEL: fshr_i37: ; CHECK32_64: # %bb.0: -; CHECK32_64-NEXT: mflr 0 -; CHECK32_64-NEXT: stwu 1, -32(1) -; CHECK32_64-NEXT: stw 0, 36(1) -; CHECK32_64-NEXT: .cfi_def_cfa_offset 32 -; CHECK32_64-NEXT: .cfi_offset lr, 4 -; CHECK32_64-NEXT: .cfi_offset r27, -20 -; CHECK32_64-NEXT: .cfi_offset r28, -16 -; CHECK32_64-NEXT: .cfi_offset r29, -12 -; CHECK32_64-NEXT: .cfi_offset r30, -8 -; CHECK32_64-NEXT: stw 27, 12(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: mr 27, 5 -; CHECK32_64-NEXT: li 5, 0 -; CHECK32_64-NEXT: stw 28, 16(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: mr 28, 3 -; CHECK32_64-NEXT: clrlwi 3, 7, 27 -; CHECK32_64-NEXT: stw 29, 20(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: mr 29, 4 -; CHECK32_64-NEXT: mr 4, 8 -; CHECK32_64-NEXT: stw 30, 24(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: mr 30, 6 -; CHECK32_64-NEXT: li 6, 37 -; CHECK32_64-NEXT: bl __umoddi3 -; CHECK32_64-NEXT: rotlwi 5, 30, 27 -; CHECK32_64-NEXT: addi 3, 4, 27 -; CHECK32_64-NEXT: andi. 4, 3, 32 -; CHECK32_64-NEXT: rlwimi 5, 27, 27, 0, 4 -; CHECK32_64-NEXT: mr 4, 5 +; CHECK32_64-NEXT: lis 9, 1771 +; CHECK32_64-NEXT: lis 12, 12398 +; CHECK32_64-NEXT: ori 9, 9, 15941 +; CHECK32_64-NEXT: clrlwi 7, 7, 27 +; CHECK32_64-NEXT: ori 12, 12, 46053 +; CHECK32_64-NEXT: mulhwu 10, 8, 9 +; CHECK32_64-NEXT: mullw 11, 8, 9 +; CHECK32_64-NEXT: mulhwu 0, 7, 12 +; CHECK32_64-NEXT: mullw 9, 7, 9 +; CHECK32_64-NEXT: mullw 7, 7, 12 +; CHECK32_64-NEXT: mulhwu 12, 8, 12 +; CHECK32_64-NEXT: addc 11, 12, 11 +; CHECK32_64-NEXT: addze 10, 10 +; CHECK32_64-NEXT: addc 7, 11, 7 +; CHECK32_64-NEXT: adde 7, 10, 0 +; CHECK32_64-NEXT: add 7, 7, 9 +; CHECK32_64-NEXT: mulli 7, 7, 37 +; CHECK32_64-NEXT: sub 7, 8, 7 +; CHECK32_64-NEXT: addi 8, 7, 27 +; CHECK32_64-NEXT: andi. 
7, 8, 32 +; CHECK32_64-NEXT: rotlwi 7, 6, 27 +; CHECK32_64-NEXT: rlwimi 7, 5, 27, 0, 4 +; CHECK32_64-NEXT: mr 5, 7 ; CHECK32_64-NEXT: beq 0, .LBB11_2 ; CHECK32_64-NEXT: # %bb.1: -; CHECK32_64-NEXT: mr 4, 29 +; CHECK32_64-NEXT: mr 5, 4 ; CHECK32_64-NEXT: .LBB11_2: -; CHECK32_64-NEXT: clrlwi 6, 3, 27 -; CHECK32_64-NEXT: srw 3, 4, 6 +; CHECK32_64-NEXT: clrlwi 8, 8, 27 +; CHECK32_64-NEXT: srw 10, 5, 8 ; CHECK32_64-NEXT: beq 0, .LBB11_4 ; CHECK32_64-NEXT: # %bb.3: -; CHECK32_64-NEXT: mr 29, 28 +; CHECK32_64-NEXT: mr 4, 3 ; CHECK32_64-NEXT: .LBB11_4: -; CHECK32_64-NEXT: subfic 7, 6, 32 -; CHECK32_64-NEXT: slw 8, 29, 7 -; CHECK32_64-NEXT: or 3, 8, 3 +; CHECK32_64-NEXT: subfic 9, 8, 32 +; CHECK32_64-NEXT: slw 3, 4, 9 +; CHECK32_64-NEXT: or 3, 3, 10 ; CHECK32_64-NEXT: bne 0, .LBB11_6 ; CHECK32_64-NEXT: # %bb.5: -; CHECK32_64-NEXT: slwi 5, 30, 27 +; CHECK32_64-NEXT: slwi 7, 6, 27 ; CHECK32_64-NEXT: .LBB11_6: -; CHECK32_64-NEXT: srw 5, 5, 6 -; CHECK32_64-NEXT: slw 4, 4, 7 -; CHECK32_64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: or 4, 4, 5 -; CHECK32_64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 28, 16(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 27, 12(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: lwz 0, 36(1) -; CHECK32_64-NEXT: addi 1, 1, 32 -; CHECK32_64-NEXT: mtlr 0 +; CHECK32_64-NEXT: srw 4, 7, 8 +; CHECK32_64-NEXT: slw 5, 5, 9 +; CHECK32_64-NEXT: or 4, 5, 4 ; CHECK32_64-NEXT: blr ; ; CHECK64-LABEL: fshr_i37: diff --git a/llvm/test/CodeGen/PowerPC/urem-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-lkk.ll index 03fd0c0c7e8e2..c6e623938d819 100644 --- a/llvm/test/CodeGen/PowerPC/urem-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-lkk.ll @@ -86,23 +86,85 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) { } define i64 @fold_urem_i64(i64 %x) { -; CHECK-LABEL: fold_urem_i64: -; CHECK: # %bb.0: -; CHECK-NEXT: mflr 0 -; CHECK-NEXT: stwu 1, -16(1) -; CHECK-NEXT: stw 0, 20(1) -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset lr, 4 -; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: li 6, 98 -; CHECK-NEXT: bl __umoddi3 -; CHECK-NEXT: lwz 0, 20(1) -; CHECK-NEXT: addi 1, 1, 16 -; CHECK-NEXT: mtlr 0 -; CHECK-NEXT: blr +; PPC32-LABEL: fold_urem_i64: +; PPC32: # %bb.0: +; PPC32-NEXT: lis 5, 21399 +; PPC32-NEXT: lis 8, -17388 +; PPC32-NEXT: rotlwi 11, 4, 31 +; PPC32-NEXT: ori 5, 5, 33436 +; PPC32-NEXT: srwi 6, 3, 1 +; PPC32-NEXT: ori 8, 8, 58849 +; PPC32-NEXT: rlwimi 11, 3, 31, 0, 0 +; PPC32-NEXT: mulhwu 7, 6, 5 +; PPC32-NEXT: mulhwu 9, 6, 8 +; PPC32-NEXT: mullw 10, 6, 8 +; PPC32-NEXT: mullw 6, 6, 5 +; PPC32-NEXT: mulhwu 12, 11, 5 +; PPC32-NEXT: mullw 5, 11, 5 +; PPC32-NEXT: mulhwu 8, 11, 8 +; PPC32-NEXT: addc 5, 8, 5 +; PPC32-NEXT: addze 11, 12 +; PPC32-NEXT: addc 5, 5, 10 +; PPC32-NEXT: adde 5, 11, 9 +; PPC32-NEXT: addze 7, 7 +; PPC32-NEXT: addc 5, 5, 6 +; PPC32-NEXT: addze 6, 7 +; PPC32-NEXT: rotlwi 5, 5, 28 +; PPC32-NEXT: li 8, 98 +; PPC32-NEXT: rlwimi 5, 6, 28, 0, 3 +; PPC32-NEXT: mulhwu 7, 5, 8 +; PPC32-NEXT: mulli 5, 5, 98 +; PPC32-NEXT: subc 4, 4, 5 +; PPC32-NEXT: li 5, 0 +; PPC32-NEXT: srwi 6, 6, 4 +; PPC32-NEXT: addze 5, 5 +; PPC32-NEXT: mulli 6, 6, 98 +; PPC32-NEXT: cntlzw 5, 5 +; PPC32-NEXT: rlwinm 5, 5, 27, 31, 31 +; PPC32-NEXT: add 6, 7, 6 +; PPC32-NEXT: sub 3, 3, 5 +; PPC32-NEXT: sub 3, 3, 6 +; PPC32-NEXT: blr +; +; PPC64-LABEL: fold_urem_i64: +; PPC64: # %bb.0: +; PPC64-NEXT: lis 5, 21399 +; PPC64-NEXT: lis 8, -17388 +; PPC64-NEXT: rotlwi 10, 4, 31 +; PPC64-NEXT: ori 5, 5, 33436 +; PPC64-NEXT: srwi 6, 3, 1 +; PPC64-NEXT: ori 8, 8, 58849 +; PPC64-NEXT: rlwimi 
10, 3, 31, 0, 0 +; PPC64-NEXT: mulhwu 7, 6, 5 +; PPC64-NEXT: mulhwu 9, 6, 8 +; PPC64-NEXT: mulhwu 11, 10, 5 +; PPC64-NEXT: mullw 12, 6, 8 +; PPC64-NEXT: mullw 6, 6, 5 +; PPC64-NEXT: mullw 5, 10, 5 +; PPC64-NEXT: mulhwu 8, 10, 8 +; PPC64-NEXT: addc 5, 8, 5 +; PPC64-NEXT: addze 10, 11 +; PPC64-NEXT: addc 5, 5, 12 +; PPC64-NEXT: adde 5, 10, 9 +; PPC64-NEXT: addze 7, 7 +; PPC64-NEXT: addc 5, 5, 6 +; PPC64-NEXT: addze 6, 7 +; PPC64-NEXT: rotlwi 5, 5, 28 +; PPC64-NEXT: li 8, 98 +; PPC64-NEXT: rlwimi 5, 6, 28, 0, 3 +; PPC64-NEXT: mulhwu 7, 5, 8 +; PPC64-NEXT: srwi 6, 6, 4 +; PPC64-NEXT: mulli 5, 5, 98 +; PPC64-NEXT: subc 4, 4, 5 +; PPC64-NEXT: li 5, 0 +; PPC64-NEXT: addze 5, 5 +; PPC64-NEXT: cntlzw 5, 5 +; PPC64-NEXT: mulli 6, 6, 98 +; PPC64-NEXT: rlwinm 5, 5, 27, 31, 31 +; PPC64-NEXT: add 6, 7, 6 +; PPC64-NEXT: sub 3, 3, 5 +; PPC64-NEXT: sub 3, 3, 6 +; PPC64-NEXT: blr %1 = urem i64 %x, 98 ret i64 %1 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; PPC32: {{.*}} -; PPC64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll index 53c3f5841ba0f..6a04dcd7fb069 100644 --- a/llvm/test/CodeGen/RISCV/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll @@ -117,13 +117,48 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind { define i64 @udiv64_constant_add(i64 %a) nounwind { ; RV32-LABEL: udiv64_constant_add: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 7 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lui a2, 149797 +; RV32-NEXT: lui a3, 599186 +; RV32-NEXT: addi a2, a2, -1756 +; RV32-NEXT: addi a3, a3, 1171 +; RV32-NEXT: mul a4, a0, a2 +; RV32-NEXT: mulhu a5, a0, a3 +; RV32-NEXT: mul a6, a1, a3 +; RV32-NEXT: mulhu a7, a0, a2 +; RV32-NEXT: mulhu a3, a1, a3 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: add a6, a4, a6 +; RV32-NEXT: sltu a5, a4, a5 +; RV32-NEXT: sltu a4, a6, a4 +; RV32-NEXT: mul a6, a1, a2 +; RV32-NEXT: mulhu a2, a1, a2 +; RV32-NEXT: add a5, a7, a5 +; RV32-NEXT: add a3, a5, a3 +; RV32-NEXT: add a7, a3, a4 +; RV32-NEXT: sltu a3, a3, a5 +; RV32-NEXT: seqz a5, a7 +; RV32-NEXT: add a6, a7, a6 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: sltu a5, a6, a7 +; RV32-NEXT: sub a7, a0, a6 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: add a2, a5, a2 +; RV32-NEXT: sltu a0, a0, a7 +; RV32-NEXT: srli a4, a7, 1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: slli a0, a1, 31 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: or a0, a0, a4 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a6, a0, a6 +; RV32-NEXT: sltu a0, a6, a0 +; RV32-NEXT: srli a2, a6, 2 +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: slli a0, a1, 30 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: ret ; ; RV64-LABEL: udiv64_constant_add: diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll index eb70d7f43c0ef..96250a9c88240 100644 --- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll @@ -117,24 +117,94 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind { define iXLen2 @test_udiv_7(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_7: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 7 -; RV32-NEXT: li a3, 0 -; 
RV32-NEXT: call __udivdi3 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lui a2, 149797 +; RV32-NEXT: lui a3, 599186 +; RV32-NEXT: addi a2, a2, -1756 +; RV32-NEXT: addi a3, a3, 1171 +; RV32-NEXT: mul a4, a0, a2 +; RV32-NEXT: mulhu a5, a0, a3 +; RV32-NEXT: mul a6, a1, a3 +; RV32-NEXT: mulhu a7, a0, a2 +; RV32-NEXT: mulhu a3, a1, a3 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: add a6, a4, a6 +; RV32-NEXT: sltu a5, a4, a5 +; RV32-NEXT: sltu a4, a6, a4 +; RV32-NEXT: mul a6, a1, a2 +; RV32-NEXT: mulhu a2, a1, a2 +; RV32-NEXT: add a5, a7, a5 +; RV32-NEXT: add a3, a5, a3 +; RV32-NEXT: add a7, a3, a4 +; RV32-NEXT: sltu a3, a3, a5 +; RV32-NEXT: seqz a5, a7 +; RV32-NEXT: add a6, a7, a6 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: sltu a5, a6, a7 +; RV32-NEXT: sub a7, a0, a6 +; RV32-NEXT: or a3, a3, a4 +; RV32-NEXT: add a2, a5, a2 +; RV32-NEXT: sltu a0, a0, a7 +; RV32-NEXT: srli a4, a7, 1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: slli a0, a1, 31 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: or a0, a0, a4 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a6, a0, a6 +; RV32-NEXT: sltu a0, a6, a0 +; RV32-NEXT: srli a2, a6, 2 +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: slli a0, a1, 30 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_7: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 7 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __udivti3 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI2_0) +; RV64-NEXT: lui a3, %hi(.LCPI2_1) +; RV64-NEXT: ld a2, %lo(.LCPI2_0)(a2) +; RV64-NEXT: ld a3, %lo(.LCPI2_1)(a3) +; RV64-NEXT: mul a4, a0, a2 +; RV64-NEXT: mulhu a5, a0, a3 +; RV64-NEXT: mul a6, a1, a3 +; RV64-NEXT: mulhu a7, a0, a2 +; RV64-NEXT: mulhu a3, a1, a3 +; RV64-NEXT: add a4, a5, a4 +; RV64-NEXT: add a6, a4, a6 +; RV64-NEXT: sltu a5, a4, a5 +; RV64-NEXT: sltu a4, a6, a4 +; RV64-NEXT: mul a6, a1, a2 +; RV64-NEXT: mulhu a2, a1, a2 +; RV64-NEXT: add a5, a7, a5 +; RV64-NEXT: add a3, a5, a3 +; RV64-NEXT: add a7, a3, a4 +; RV64-NEXT: sltu a3, a3, a5 +; RV64-NEXT: seqz a5, a7 +; RV64-NEXT: add a6, a7, a6 +; RV64-NEXT: and a4, a5, a4 +; RV64-NEXT: sltu a5, a6, a7 +; RV64-NEXT: sub a7, a0, a6 +; RV64-NEXT: or a3, a3, a4 +; RV64-NEXT: add a2, a5, a2 +; RV64-NEXT: sltu a0, a0, a7 +; RV64-NEXT: srli a4, a7, 1 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: sub a1, a1, a2 +; RV64-NEXT: slli a0, a1, 63 +; RV64-NEXT: srli a1, a1, 1 +; RV64-NEXT: or a0, a0, a4 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a6, a0, a6 +; RV64-NEXT: sltu a0, a6, a0 +; RV64-NEXT: srli a2, a6, 2 +; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: slli a0, a1, 62 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 7 ret iXLen2 %a @@ -143,24 +213,70 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind { define iXLen2 @test_udiv_9(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_9: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 9 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lui a2, 932068 +; RV32-NEXT: lui a3, 582542 +; RV32-NEXT: addi a2, a2, -1821 +; RV32-NEXT: addi a3, a3, 911 +; RV32-NEXT: mul a4, a0, a2 +; RV32-NEXT: mulhu a5, a0, a3 +; RV32-NEXT: 
mul a6, a1, a3 +; RV32-NEXT: mulhu a0, a0, a2 +; RV32-NEXT: mulhu a3, a1, a3 +; RV32-NEXT: mul a7, a1, a2 +; RV32-NEXT: mulhu a1, a1, a2 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: add a6, a4, a6 +; RV32-NEXT: sltu a2, a4, a5 +; RV32-NEXT: sltu a4, a6, a4 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add a3, a0, a3 +; RV32-NEXT: add a2, a3, a4 +; RV32-NEXT: sltu a0, a3, a0 +; RV32-NEXT: seqz a3, a2 +; RV32-NEXT: add a7, a2, a7 +; RV32-NEXT: and a3, a3, a4 +; RV32-NEXT: sltu a2, a7, a2 +; RV32-NEXT: srli a4, a7, 3 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: slli a0, a1, 29 +; RV32-NEXT: or a0, a0, a4 +; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_9: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 9 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __udivti3 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI3_0) +; RV64-NEXT: lui a3, %hi(.LCPI3_1) +; RV64-NEXT: ld a2, %lo(.LCPI3_0)(a2) +; RV64-NEXT: ld a3, %lo(.LCPI3_1)(a3) +; RV64-NEXT: mul a4, a0, a2 +; RV64-NEXT: mulhu a5, a0, a3 +; RV64-NEXT: mul a6, a1, a3 +; RV64-NEXT: mulhu a0, a0, a2 +; RV64-NEXT: mulhu a3, a1, a3 +; RV64-NEXT: mul a7, a1, a2 +; RV64-NEXT: mulhu a1, a1, a2 +; RV64-NEXT: add a4, a5, a4 +; RV64-NEXT: add a6, a4, a6 +; RV64-NEXT: sltu a2, a4, a5 +; RV64-NEXT: sltu a4, a6, a4 +; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: add a3, a0, a3 +; RV64-NEXT: add a2, a3, a4 +; RV64-NEXT: sltu a0, a3, a0 +; RV64-NEXT: seqz a3, a2 +; RV64-NEXT: add a7, a2, a7 +; RV64-NEXT: and a3, a3, a4 +; RV64-NEXT: sltu a2, a7, a2 +; RV64-NEXT: srli a4, a7, 1 +; RV64-NEXT: or a0, a0, a3 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: slli a0, a1, 63 +; RV64-NEXT: or a0, a0, a4 +; RV64-NEXT: srli a1, a1, 1 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 9 ret iXLen2 %a diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll index bc4a99a00ac64..3f84e446166aa 100644 --- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll @@ -79,24 +79,118 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind { define iXLen2 @test_urem_7(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_7: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 7 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lui a2, 149797 +; RV32-NEXT: lui a3, 599186 +; RV32-NEXT: li a4, 7 +; RV32-NEXT: addi a2, a2, -1756 +; RV32-NEXT: addi a3, a3, 1171 +; RV32-NEXT: mul a5, a0, a2 +; RV32-NEXT: mulhu a6, a0, a3 +; RV32-NEXT: mul a7, a1, a3 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: add a7, a5, a7 +; RV32-NEXT: sltu a6, a5, a6 +; RV32-NEXT: sltu a5, a7, a5 +; RV32-NEXT: mulhu a7, a0, a2 +; RV32-NEXT: mulhu a3, a1, a3 +; RV32-NEXT: add a6, a7, a6 +; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: add a7, a3, a5 +; RV32-NEXT: sltu a3, a3, a6 +; RV32-NEXT: seqz a6, a7 +; RV32-NEXT: and a5, a6, a5 +; RV32-NEXT: mul a6, a1, a2 +; RV32-NEXT: mulhu a2, a1, a2 +; RV32-NEXT: add a6, a7, a6 +; RV32-NEXT: sltu a7, a6, a7 +; RV32-NEXT: or a3, a3, a5 +; RV32-NEXT: sub a5, a0, a6 +; RV32-NEXT: add a2, a7, a2 +; RV32-NEXT: sltu a7, a0, a5 +; RV32-NEXT: srli a5, a5, 1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sub a3, a1, a7 +; RV32-NEXT: sub a3, 
a3, a2 +; RV32-NEXT: slli a7, a3, 31 +; RV32-NEXT: srli a3, a3, 1 +; RV32-NEXT: or a5, a7, a5 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: add a6, a5, a6 +; RV32-NEXT: sltu a3, a6, a5 +; RV32-NEXT: srli a5, a6, 2 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: slli a3, a5, 3 +; RV32-NEXT: srli a6, a2, 2 +; RV32-NEXT: slli a2, a2, 30 +; RV32-NEXT: sub a3, a0, a3 +; RV32-NEXT: slli a7, a6, 3 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: sub a5, a7, a6 +; RV32-NEXT: mulhu a4, a2, a4 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: add a4, a4, a5 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: sub a1, a1, a4 +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_7: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 7 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI2_0) +; RV64-NEXT: lui a3, %hi(.LCPI2_1) +; RV64-NEXT: ld a2, %lo(.LCPI2_0)(a2) +; RV64-NEXT: ld a3, %lo(.LCPI2_1)(a3) +; RV64-NEXT: li a4, 7 +; RV64-NEXT: mul a5, a0, a2 +; RV64-NEXT: mulhu a6, a0, a3 +; RV64-NEXT: mul a7, a1, a3 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: add a7, a5, a7 +; RV64-NEXT: sltu a6, a5, a6 +; RV64-NEXT: sltu a5, a7, a5 +; RV64-NEXT: mulhu a7, a0, a2 +; RV64-NEXT: mulhu a3, a1, a3 +; RV64-NEXT: add a6, a7, a6 +; RV64-NEXT: add a3, a6, a3 +; RV64-NEXT: add a7, a3, a5 +; RV64-NEXT: sltu a3, a3, a6 +; RV64-NEXT: seqz a6, a7 +; RV64-NEXT: and a5, a6, a5 +; RV64-NEXT: mul a6, a1, a2 +; RV64-NEXT: mulhu a2, a1, a2 +; RV64-NEXT: add a6, a7, a6 +; RV64-NEXT: sltu a7, a6, a7 +; RV64-NEXT: or a3, a3, a5 +; RV64-NEXT: sub a5, a0, a6 +; RV64-NEXT: add a2, a7, a2 +; RV64-NEXT: sltu a7, a0, a5 +; RV64-NEXT: srli a5, a5, 1 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: sub a3, a1, a7 +; RV64-NEXT: sub a3, a3, a2 +; RV64-NEXT: slli a7, a3, 63 +; RV64-NEXT: srli a3, a3, 1 +; RV64-NEXT: or a5, a7, a5 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: add a6, a5, a6 +; RV64-NEXT: sltu a3, a6, a5 +; RV64-NEXT: srli a5, a6, 2 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: slli a3, a5, 3 +; RV64-NEXT: srli a6, a2, 2 +; RV64-NEXT: slli a2, a2, 62 +; RV64-NEXT: sub a3, a0, a3 +; RV64-NEXT: slli a7, a6, 3 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: sub a5, a7, a6 +; RV64-NEXT: mulhu a4, a2, a4 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: sub a1, a1, a4 +; RV64-NEXT: mv a0, a2 ; RV64-NEXT: ret %a = urem iXLen2 %x, 7 ret iXLen2 %a @@ -105,24 +199,94 @@ define iXLen2 @test_urem_7(iXLen2 %x) nounwind { define iXLen2 @test_urem_9(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_9: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 9 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lui a2, 932068 +; RV32-NEXT: lui a3, 582542 +; RV32-NEXT: li a4, 9 +; RV32-NEXT: addi a2, a2, -1821 +; RV32-NEXT: addi a3, a3, 911 +; RV32-NEXT: mul a5, a0, a2 +; RV32-NEXT: mulhu a6, a0, a3 +; RV32-NEXT: mul a7, a1, a3 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: add a7, a5, a7 +; RV32-NEXT: sltu a6, a5, a6 +; RV32-NEXT: sltu a5, a7, a5 +; RV32-NEXT: mulhu a7, a0, a2 +; RV32-NEXT: mulhu a3, a1, a3 +; RV32-NEXT: add a6, a7, a6 +; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: add a7, a3, a5 +; RV32-NEXT: sltu a3, a3, a6 +; RV32-NEXT: seqz a6, 
a7 +; RV32-NEXT: and a5, a6, a5 +; RV32-NEXT: mul a6, a1, a2 +; RV32-NEXT: mulhu a2, a1, a2 +; RV32-NEXT: add a6, a7, a6 +; RV32-NEXT: sltu a7, a6, a7 +; RV32-NEXT: or a3, a3, a5 +; RV32-NEXT: srli a5, a6, 3 +; RV32-NEXT: andi a6, a6, -8 +; RV32-NEXT: add a2, a7, a2 +; RV32-NEXT: sub a6, a0, a6 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: srli a3, a2, 3 +; RV32-NEXT: andi a7, a2, -8 +; RV32-NEXT: slli a2, a2, 29 +; RV32-NEXT: add a3, a7, a3 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: mulhu a4, a2, a4 +; RV32-NEXT: sub a2, a6, a2 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: sub a1, a1, a3 +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_9: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 9 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI3_0) +; RV64-NEXT: lui a3, %hi(.LCPI3_1) +; RV64-NEXT: ld a2, %lo(.LCPI3_0)(a2) +; RV64-NEXT: ld a3, %lo(.LCPI3_1)(a3) +; RV64-NEXT: li a4, 9 +; RV64-NEXT: mul a5, a0, a2 +; RV64-NEXT: mulhu a6, a0, a3 +; RV64-NEXT: mul a7, a1, a3 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: add a7, a5, a7 +; RV64-NEXT: sltu a6, a5, a6 +; RV64-NEXT: sltu a5, a7, a5 +; RV64-NEXT: mulhu a7, a0, a2 +; RV64-NEXT: mulhu a3, a1, a3 +; RV64-NEXT: add a6, a7, a6 +; RV64-NEXT: add a3, a6, a3 +; RV64-NEXT: add a7, a3, a5 +; RV64-NEXT: sltu a3, a3, a6 +; RV64-NEXT: seqz a6, a7 +; RV64-NEXT: and a5, a6, a5 +; RV64-NEXT: mul a6, a1, a2 +; RV64-NEXT: mulhu a2, a1, a2 +; RV64-NEXT: add a6, a7, a6 +; RV64-NEXT: sltu a7, a6, a7 +; RV64-NEXT: srli a6, a6, 1 +; RV64-NEXT: or a3, a3, a5 +; RV64-NEXT: add a2, a7, a2 +; RV64-NEXT: slli a5, a6, 3 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: sub a3, a0, a5 +; RV64-NEXT: srli a5, a2, 1 +; RV64-NEXT: slli a2, a2, 63 +; RV64-NEXT: slli a7, a5, 3 +; RV64-NEXT: or a2, a2, a6 +; RV64-NEXT: add a5, a7, a5 +; RV64-NEXT: mulhu a4, a2, a4 +; RV64-NEXT: sub a2, a3, a2 +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: sub a1, a1, a4 +; RV64-NEXT: mv a0, a2 ; RV64-NEXT: ret %a = urem iXLen2 %x, 9 ret iXLen2 %a diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll index 017b2d36bdd58..ad6beaf47ed3e 100644 --- a/llvm/test/CodeGen/RISCV/urem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -232,13 +232,50 @@ define i64 @fold_urem_i64(i64 %x) nounwind { ; ; RV32IM-LABEL: fold_urem_i64: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -16 -; RV32IM-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IM-NEXT: li a2, 98 -; RV32IM-NEXT: li a3, 0 -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: srli a2, a0, 1 +; RV32IM-NEXT: slli a3, a1, 31 +; RV32IM-NEXT: lui a4, 342392 +; RV32IM-NEXT: lui a5, 770382 +; RV32IM-NEXT: srli a6, a1, 1 +; RV32IM-NEXT: or a2, a3, a2 +; RV32IM-NEXT: addi a3, a4, 668 +; RV32IM-NEXT: addi a4, a5, 1505 +; RV32IM-NEXT: mul a5, a2, a3 +; RV32IM-NEXT: mulhu a7, a2, a4 +; RV32IM-NEXT: mul t0, a6, a4 +; RV32IM-NEXT: mulhu a2, a2, a3 +; RV32IM-NEXT: mulhu a4, a6, a4 +; RV32IM-NEXT: mul t1, a6, a3 +; RV32IM-NEXT: mulhu a3, a6, a3 +; RV32IM-NEXT: add a5, a7, a5 +; RV32IM-NEXT: add t0, a5, t0 +; RV32IM-NEXT: sltu a6, a5, a7 +; RV32IM-NEXT: sltu a5, t0, a5 +; RV32IM-NEXT: li a7, 98 +; RV32IM-NEXT: add a2, a2, a6 +; RV32IM-NEXT: add a4, a2, a4 +; 
RV32IM-NEXT: add a6, a4, a5 +; RV32IM-NEXT: sltu a2, a4, a2 +; RV32IM-NEXT: seqz a4, a6 +; RV32IM-NEXT: add t1, a6, t1 +; RV32IM-NEXT: and a4, a4, a5 +; RV32IM-NEXT: sltu a5, t1, a6 +; RV32IM-NEXT: srli a6, t1, 4 +; RV32IM-NEXT: or a2, a2, a4 +; RV32IM-NEXT: add a3, a5, a3 +; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: srli a3, a2, 4 +; RV32IM-NEXT: slli a2, a2, 28 +; RV32IM-NEXT: mul a3, a3, a7 +; RV32IM-NEXT: or a2, a2, a6 +; RV32IM-NEXT: mulhu a4, a2, a7 +; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: add a3, a4, a3 +; RV32IM-NEXT: sub a2, a0, a2 +; RV32IM-NEXT: sltu a0, a0, a2 +; RV32IM-NEXT: sub a1, a1, a3 +; RV32IM-NEXT: sub a1, a1, a0 +; RV32IM-NEXT: mv a0, a2 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_urem_i64: diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index ec97e7a0ae558..e23ba1628bc9c 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -862,50 +862,162 @@ define <4 x i64> @fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw s1, 16(a1) -; RV32IM-NEXT: lw s2, 20(a1) -; RV32IM-NEXT: lw s3, 24(a1) -; RV32IM-NEXT: lw s4, 28(a1) -; RV32IM-NEXT: lw a3, 0(a1) -; RV32IM-NEXT: lw a4, 4(a1) -; RV32IM-NEXT: lw s5, 8(a1) -; RV32IM-NEXT: lw s6, 12(a1) +; RV32IM-NEXT: sw s9, 4(sp) # 4-byte Folded Spill +; RV32IM-NEXT: mv a2, a1 ; RV32IM-NEXT: mv s0, a0 +; RV32IM-NEXT: lw a4, 16(a1) +; RV32IM-NEXT: lw a3, 20(a1) +; RV32IM-NEXT: lw a6, 24(a1) +; RV32IM-NEXT: lw a7, 28(a1) +; RV32IM-NEXT: lw a0, 0(a1) +; RV32IM-NEXT: lw a1, 4(a1) +; RV32IM-NEXT: lw a5, 8(a2) +; RV32IM-NEXT: lw a2, 12(a2) +; RV32IM-NEXT: lui t0, 410312 +; RV32IM-NEXT: lui t1, 729444 +; RV32IM-NEXT: lui t2, 410452 +; RV32IM-NEXT: lui t4, 25653 +; RV32IM-NEXT: lui t5, 791991 +; RV32IM-NEXT: lui t6, 834723 +; RV32IM-NEXT: addi t3, t0, 1424 +; RV32IM-NEXT: addi s1, t1, 713 +; RV32IM-NEXT: addi s2, t2, -952 +; RV32IM-NEXT: addi t4, t4, 965 +; RV32IM-NEXT: addi t2, t5, 77 +; RV32IM-NEXT: addi t5, t6, -179 +; RV32IM-NEXT: mul t6, a4, t3 +; RV32IM-NEXT: mulhu s3, a4, s1 +; RV32IM-NEXT: mul s4, a3, s1 +; RV32IM-NEXT: mulhu s5, a4, t3 +; RV32IM-NEXT: srli t0, a5, 1 +; RV32IM-NEXT: slli t1, a2, 31 +; RV32IM-NEXT: srli s6, a2, 1 +; RV32IM-NEXT: or s7, t1, t0 +; RV32IM-NEXT: mul s8, s6, t4 +; RV32IM-NEXT: mulhu s9, s6, t4 +; RV32IM-NEXT: mul t1, s6, s2 +; RV32IM-NEXT: mulhu t0, s6, s2 +; RV32IM-NEXT: mul s6, s7, s2 +; RV32IM-NEXT: mulhu t4, s7, t4 +; RV32IM-NEXT: mulhu s2, s7, s2 +; RV32IM-NEXT: mul s7, a6, t2 +; RV32IM-NEXT: add t6, s3, t6 +; RV32IM-NEXT: add s4, t6, s4 +; RV32IM-NEXT: sltu s3, t6, s3 +; RV32IM-NEXT: sltu t6, s4, t6 +; RV32IM-NEXT: mulhu s4, a6, t5 +; RV32IM-NEXT: add s3, s5, s3 +; RV32IM-NEXT: mul s5, a7, t5 +; RV32IM-NEXT: add s7, s4, s7 +; RV32IM-NEXT: add s5, s7, s5 +; RV32IM-NEXT: sltu s4, s7, s4 +; RV32IM-NEXT: sltu s5, s5, s7 +; RV32IM-NEXT: mulhu s7, a6, t2 +; RV32IM-NEXT: add s4, s7, s4 +; RV32IM-NEXT: add s6, t4, s6 +; RV32IM-NEXT: add s8, s6, s8 +; RV32IM-NEXT: sltu t4, s6, t4 +; RV32IM-NEXT: sltu s6, s8, s6 +; RV32IM-NEXT: mulhu s1, a3, s1 +; RV32IM-NEXT: mulhu t5, a7, t5 +; RV32IM-NEXT: add s1, s3, s1 +; RV32IM-NEXT: add t5, s4, t5 +; RV32IM-NEXT: sltu s3, s1, s3 +; RV32IM-NEXT: add s1, s1, t6 +; RV32IM-NEXT: add t4, s2, t4 +; RV32IM-NEXT: add s2, t5, s5 +; RV32IM-NEXT: sltu t5, t5, s4 +; RV32IM-NEXT: seqz s4, s1 +; RV32IM-NEXT: and t6, s4, t6 +; 
RV32IM-NEXT: seqz s4, s2 +; RV32IM-NEXT: and s4, s4, s5 +; RV32IM-NEXT: or t6, s3, t6 +; RV32IM-NEXT: mul s3, a3, t3 +; RV32IM-NEXT: mulhu s5, a3, t3 +; RV32IM-NEXT: add t3, s1, s3 +; RV32IM-NEXT: sltu s1, t3, s1 +; RV32IM-NEXT: add s1, s1, s5 +; RV32IM-NEXT: or t5, t5, s4 +; RV32IM-NEXT: mul s3, a7, t2 +; RV32IM-NEXT: mulhu t2, a7, t2 +; RV32IM-NEXT: add s3, s2, s3 +; RV32IM-NEXT: sltu s2, s3, s2 +; RV32IM-NEXT: add t2, s2, t2 +; RV32IM-NEXT: add s9, t4, s9 +; RV32IM-NEXT: sltu t4, s9, t4 +; RV32IM-NEXT: add s9, s9, s6 +; RV32IM-NEXT: add t6, s1, t6 +; RV32IM-NEXT: seqz s1, s9 +; RV32IM-NEXT: and s1, s1, s6 +; RV32IM-NEXT: add t2, t2, t5 +; RV32IM-NEXT: or t4, t4, s1 +; RV32IM-NEXT: sub t5, a4, t3 +; RV32IM-NEXT: srli s1, s3, 12 +; RV32IM-NEXT: add t1, s9, t1 +; RV32IM-NEXT: sltu s2, t1, s9 +; RV32IM-NEXT: add t0, s2, t0 +; RV32IM-NEXT: sltu s2, a4, t5 +; RV32IM-NEXT: srli t5, t5, 1 +; RV32IM-NEXT: sub s2, a3, s2 +; RV32IM-NEXT: sub s2, s2, t6 +; RV32IM-NEXT: add t0, t0, t4 +; RV32IM-NEXT: slli t4, t2, 20 +; RV32IM-NEXT: or t4, t4, s1 +; RV32IM-NEXT: slli s1, s2, 31 +; RV32IM-NEXT: or t5, s1, t5 +; RV32IM-NEXT: lui s1, 1 +; RV32IM-NEXT: addi s1, s1, 1327 +; RV32IM-NEXT: srli t1, t1, 7 +; RV32IM-NEXT: srli t2, t2, 12 +; RV32IM-NEXT: srli s2, s2, 1 +; RV32IM-NEXT: mul t2, t2, s1 +; RV32IM-NEXT: add t6, s2, t6 +; RV32IM-NEXT: mulhu s2, t4, s1 +; RV32IM-NEXT: mul t4, t4, s1 +; RV32IM-NEXT: slli s1, t0, 25 +; RV32IM-NEXT: or t1, s1, t1 +; RV32IM-NEXT: li s1, 654 +; RV32IM-NEXT: srli t0, t0, 7 +; RV32IM-NEXT: mul t0, t0, s1 +; RV32IM-NEXT: add t2, s2, t2 +; RV32IM-NEXT: mulhu s2, t1, s1 +; RV32IM-NEXT: mul t1, t1, s1 +; RV32IM-NEXT: sub a7, a7, t2 +; RV32IM-NEXT: add t3, t5, t3 +; RV32IM-NEXT: sltu t2, t3, t5 +; RV32IM-NEXT: add t2, t6, t2 +; RV32IM-NEXT: li t5, 23 +; RV32IM-NEXT: sub s1, a6, t4 +; RV32IM-NEXT: srli t3, t3, 4 +; RV32IM-NEXT: sltu a6, a6, s1 +; RV32IM-NEXT: add t0, s2, t0 +; RV32IM-NEXT: sub s2, a5, t1 +; RV32IM-NEXT: srli t1, t2, 4 +; RV32IM-NEXT: slli t2, t2, 28 +; RV32IM-NEXT: sltu a5, a5, s2 +; RV32IM-NEXT: sub a2, a2, t0 +; RV32IM-NEXT: mul t0, t1, t5 +; RV32IM-NEXT: or t1, t2, t3 +; RV32IM-NEXT: sub s3, a2, a5 +; RV32IM-NEXT: mulhu a2, t1, t5 +; RV32IM-NEXT: mul a5, t1, t5 +; RV32IM-NEXT: add a2, a2, t0 +; RV32IM-NEXT: sub s4, a4, a5 +; RV32IM-NEXT: sltu a4, a4, s4 +; RV32IM-NEXT: sub a3, a3, a2 +; RV32IM-NEXT: sub s5, a3, a4 +; RV32IM-NEXT: sub s6, a7, a6 ; RV32IM-NEXT: li a2, 1 -; RV32IM-NEXT: mv a0, a3 -; RV32IM-NEXT: mv a1, a4 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: mv s7, a0 -; RV32IM-NEXT: mv s8, a1 -; RV32IM-NEXT: li a2, 654 -; RV32IM-NEXT: mv a0, s5 -; RV32IM-NEXT: mv a1, s6 -; RV32IM-NEXT: li a3, 0 -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: mv s5, a0 -; RV32IM-NEXT: mv s6, a1 -; RV32IM-NEXT: li a2, 23 -; RV32IM-NEXT: mv a0, s1 -; RV32IM-NEXT: mv a1, s2 -; RV32IM-NEXT: li a3, 0 -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: mv s1, a0 -; RV32IM-NEXT: mv s2, a1 -; RV32IM-NEXT: lui a0, 1 -; RV32IM-NEXT: addi a2, a0, 1327 -; RV32IM-NEXT: mv a0, s3 -; RV32IM-NEXT: mv a1, s4 -; RV32IM-NEXT: li a3, 0 -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: sw s1, 16(s0) -; RV32IM-NEXT: sw s2, 20(s0) -; RV32IM-NEXT: sw a0, 24(s0) -; RV32IM-NEXT: sw a1, 28(s0) -; RV32IM-NEXT: sw s7, 0(s0) -; RV32IM-NEXT: sw s8, 4(s0) -; RV32IM-NEXT: sw s5, 8(s0) -; RV32IM-NEXT: sw s6, 12(s0) +; RV32IM-NEXT: sw s4, 16(s0) +; RV32IM-NEXT: sw s5, 20(s0) +; RV32IM-NEXT: sw s1, 24(s0) +; RV32IM-NEXT: sw s6, 28(s0) +; RV32IM-NEXT: sw a0, 0(s0) +; RV32IM-NEXT: sw a1, 4(s0) 
+; RV32IM-NEXT: sw s2, 8(s0) +; RV32IM-NEXT: sw s3, 12(s0) ; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -916,6 +1028,7 @@ define <4 x i64> @fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: lw s6, 16(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s7, 12(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s8, 8(sp) # 4-byte Folded Reload +; RV32IM-NEXT: lw s9, 4(sp) # 4-byte Folded Reload ; RV32IM-NEXT: addi sp, sp, 48 ; RV32IM-NEXT: ret ; diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll index 14bcc22880697..d73c33ea506b0 100644 --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -294,19 +294,77 @@ entry: define i64 @PR23590(i64 %x) nounwind { ; X86-LABEL: PR23590: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12345 # imm = 0x3039 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll __umoddi3 -; X86-NEXT: addl $16, %esp -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $7 -; X86-NEXT: pushl %edx +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: calll __udivdi3 -; X86-NEXT: addl $28, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1425045447, %edx # imm = 0x54F077C7 +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl $417841695, %edx # imm = 0x18E7C21F +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $1425045447, %edx # imm = 0x54F077C7 +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $417841695, %edx # imm = 0x18E7C21F +; X86-NEXT: mull %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: shrdl $12, %ebx, %edi +; X86-NEXT: movl $12345, %edx # imm = 0x3039 +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $12, %ebx +; X86-NEXT: imull $12345, %ebx, %edi # imm = 0x3039 +; X86-NEXT: addl %edx, %edi +; X86-NEXT: subl %eax, %esi +; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: subl %edi, %ecx +; X86-NEXT: movl $613566756, %ebx # imm = 0x24924924 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl $-1840700269, %edx # imm = 0x92492493 +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $-1840700269, %ecx # imm = 0x92492493 +; X86-NEXT: mull %ecx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; 
X64-FAST-LABEL: PR23590: @@ -347,27 +405,43 @@ define { i64, i32 } @PR38622(i64) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $-294967296 # imm = 0xEE6B2800 -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: calll __udivdi3 -; X86-NEXT: addl $16, %esp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $-294967296 # imm = 0xEE6B2800 -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: calll __umoddi3 -; X86-NEXT: addl $16, %esp -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: shrdl $11, %edi, %ebx +; X86-NEXT: movl $1125899, %edx # imm = 0x112E0B +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl $-400107883, %edx # imm = 0xE826D695 +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: shrl $11, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl $1125899, %edx # imm = 0x112E0B +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl $-400107883, %edx # imm = 0xE826D695 +; X86-NEXT: mull %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: shrdl $9, %ebx, %esi +; X86-NEXT: imull $-294967296, %esi, %eax # imm = 0xEE6B2800 +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: shrl $9, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %edi, %edx -; X86-NEXT: addl $12, %esp +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1165,13 +1239,41 @@ entry: define i64 @udiv_i64_magic_large_postshift(i64 %x) nounwind { ; X86-LABEL: udiv_i64_magic_large_postshift: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll __udivdi3 -; X86-NEXT: addl $28, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl $-1431655766, %ecx # imm = 0xAAAAAAAA +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl $-1431655765, %esi # imm = 0xAAAAAAAB +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: shrl $31, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp 
; X86-NEXT: retl ; ; X64-LABEL: udiv_i64_magic_large_postshift: @@ -1190,13 +1292,44 @@ define i64 @udiv_i64_magic_large_postshift(i64 %x) nounwind { define i64 @urem_i64_magic_large_postshift(i64 %x) nounwind { ; X86-LABEL: urem_i64_magic_large_postshift: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll __umoddi3 -; X86-NEXT: addl $28, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl $-1431655766, %ebx # imm = 0xAAAAAAAA +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: mull %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: shrl %ebx +; X86-NEXT: andl $1073741824, %ebx # imm = 0x40000000 +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: urem_i64_magic_large_postshift: @@ -1217,13 +1350,24 @@ define i64 @urem_i64_magic_large_postshift(i64 %x) nounwind { define i64 @udiv_i64_magic_large_preshift(i64 %x) nounwind { ; X86-LABEL: udiv_i64_magic_large_preshift: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl $14 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll __udivdi3 -; X86-NEXT: addl $28, %esp +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrl %ecx +; X86-NEXT: movl $613566756, %edx # imm = 0x24924924 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl $-1840700269, %edx # imm = 0x92492493 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: udiv_i64_magic_large_preshift: @@ -1242,13 +1386,37 @@ define i64 @udiv_i64_magic_large_preshift(i64 %x) nounwind { define i64 @urem_i64_magic_large_preshift(i64 %x) nounwind { ; X86-LABEL: urem_i64_magic_large_preshift: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl $14 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll __umoddi3 -; X86-NEXT: addl $28, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: shrl %ebx +; X86-NEXT: movl $613566756, %edx # imm = 0x24924924 +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %esi +; 
X86-NEXT: movl $-1840700269, %edx # imm = 0x92492493 +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl $0, %eax +; X86-NEXT: addl %edi, %edx +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: shll $4, %eax +; X86-NEXT: subl %eax, %esi +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: urem_i64_magic_large_preshift: @@ -1270,13 +1438,56 @@ define i64 @urem_i64_magic_large_preshift(i64 %x) nounwind { define i64 @udiv_i64_magic_is_add(i64 %x) nounwind { ; X86-LABEL: udiv_i64_magic_is_add: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl $196608 # imm = 0x30000 -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll __udivdi3 -; X86-NEXT: addl $28, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1431626638, %edx # imm = 0x5554E38E +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7 +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $1431626638, %edx # imm = 0x5554E38E +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7 +; X86-NEXT: mull %edx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: leal (%edx,%ebx), %eax +; X86-NEXT: subl %eax, %esi +; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: shrdl $1, %ecx, %esi +; X86-NEXT: shrl %ecx +; X86-NEXT: addl %eax, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: shrl $17, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: udiv_i64_magic_is_add: @@ -1297,13 +1508,64 @@ define i64 @udiv_i64_magic_is_add(i64 %x) nounwind { define i64 @urem_i64_magic_is_add(i64 %x) nounwind { ; X86-LABEL: urem_i64_magic_is_add: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl $196608 # imm = 0x30000 -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll __umoddi3 -; X86-NEXT: addl $28, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl $1431626638, %ebx # imm = 0x5554E38E +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %esi, %eax +; 
X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7 +; X86-NEXT: mull %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: leal (%edx,%ebx), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: sbbl $0, %ebp +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: shrdl $1, %ebp, %ecx +; X86-NEXT: shrl %ebp +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: shrl $17, %ebp +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: shll $16, %ebp +; X86-NEXT: leal (%ebp,%ebp,2), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: urem_i64_magic_is_add: diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll index 9d54452404fb0..39c597b2d60e3 100644 --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -67,25 +67,93 @@ define i64 @div128(i128 %x) nounwind { define i64 @umod128(i128 %x) nounwind { ; X86-64-LABEL: umod128: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $11, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movabsq $8384883669867978007, %r10 # imm = 0x745D1745D1745D17 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: movq %rax, %rcx +; X86-64-NEXT: movq %rdx, %r8 +; X86-64-NEXT: movabsq $5030930201920786805, %r11 # imm = 0x45D1745D1745D175 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r11 +; X86-64-NEXT: movq %rdx, %r9 +; X86-64-NEXT: addq %rcx, %r9 +; X86-64-NEXT: adcq $0, %r8 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: movq %rdx, %rcx +; X86-64-NEXT: movq %rax, %r10 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r11 +; X86-64-NEXT: addq %r9, %rax +; X86-64-NEXT: adcq %r8, %rdx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: addq %r10, %rdx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: subq %rdx, %rax +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: subq %rcx, %rsi +; X86-64-NEXT: movl %esi, %r8d +; X86-64-NEXT: shrl %r8d +; X86-64-NEXT: shldq $63, %rax, %rsi +; X86-64-NEXT: xorl %eax, %eax +; X86-64-NEXT: addq %rdx, %rsi +; X86-64-NEXT: setb %al +; X86-64-NEXT: addl %r8d, %ecx +; X86-64-NEXT: addl %eax, %ecx +; X86-64-NEXT: shldq $61, %rsi, %rcx +; X86-64-NEXT: leaq (%rcx,%rcx,4), %rax +; X86-64-NEXT: leaq (%rcx,%rax,2), %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: retq ; ; WIN64-LABEL: umod128: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: pushq %rdi +; WIN64-NEXT: 
movq %rdx, %r8 +; WIN64-NEXT: movabsq $8384883669867978007, %rsi # imm = 0x745D1745D1745D17 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: movq %rax, %r9 +; WIN64-NEXT: movq %rdx, %r10 +; WIN64-NEXT: movabsq $5030930201920786805, %rdi # imm = 0x45D1745D1745D175 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: addq %r9, %r11 +; WIN64-NEXT: adcq $0, %r10 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movq %rax, %rsi +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: addq %r11, %rax +; WIN64-NEXT: adcq %r10, %rdx +; WIN64-NEXT: adcq $0, %r9 +; WIN64-NEXT: addq %rsi, %rdx +; WIN64-NEXT: adcq $0, %r9 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: subq %rdx, %rax +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: subq %r9, %r8 +; WIN64-NEXT: movl %r8d, %r10d +; WIN64-NEXT: shrl %r10d +; WIN64-NEXT: shldq $63, %rax, %r8 +; WIN64-NEXT: xorl %eax, %eax +; WIN64-NEXT: addq %rdx, %r8 +; WIN64-NEXT: setb %al +; WIN64-NEXT: addl %r10d, %r9d +; WIN64-NEXT: addl %eax, %r9d +; WIN64-NEXT: shldq $61, %r8, %r9 +; WIN64-NEXT: leaq (%r9,%r9,4), %rax +; WIN64-NEXT: leaq (%r9,%rax,2), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: popq %rdi +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq @@ -1018,27 +1086,70 @@ entry: define i128 @udiv_magic_preshift_and_postshift(i128 %x) nounwind { ; X86-64-LABEL: udiv_magic_preshift_and_postshift: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $22, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: shrdq $1, %rsi, %rdi +; X86-64-NEXT: movabsq $-5030930201920786805, %r9 # imm = 0xBA2E8BA2E8BA2E8B +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: movq %rax, %r8 +; X86-64-NEXT: movq %rdx, %rcx +; X86-64-NEXT: movabsq $-6707906935894382405, %r10 # imm = 0xA2E8BA2E8BA2E8BB +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: movq %rdx, %rdi +; X86-64-NEXT: addq %r8, %rdi +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: shrq %rsi +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: movq %rdx, %r8 +; X86-64-NEXT: movq %rax, %r9 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %rdx, %rcx +; X86-64-NEXT: adcq $0, %r8 +; X86-64-NEXT: addq %r9, %rcx +; X86-64-NEXT: adcq $0, %r8 +; X86-64-NEXT: shrdq $3, %r8, %rcx +; X86-64-NEXT: shrq $3, %r8 +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: movq %r8, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_magic_preshift_and_postshift: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $22, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r9 +; WIN64-NEXT: shrdq $1, %rdx, %r9 +; WIN64-NEXT: movabsq $-5030930201920786805, %r11 # imm = 0xBA2E8BA2E8BA2E8B +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %r11 +; WIN64-NEXT: movq %rax, %r10 +; WIN64-NEXT: movq %rdx, %rcx +; WIN64-NEXT: movabsq $-6707906935894382405, %rsi # imm = 
0xA2E8BA2E8BA2E8BB +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: addq %r10, %r9 +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: shrq %r8 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %r11 +; WIN64-NEXT: movq %rdx, %r10 +; WIN64-NEXT: movq %rax, %r11 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: addq %r9, %rax +; WIN64-NEXT: adcq %rdx, %rcx +; WIN64-NEXT: adcq $0, %r10 +; WIN64-NEXT: addq %r11, %rcx +; WIN64-NEXT: adcq $0, %r10 +; WIN64-NEXT: shrdq $3, %r10, %rcx +; WIN64-NEXT: shrq $3, %r10 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: movq %r10, %rdx +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq %ret = udiv i128 %x, 22 ret i128 %ret @@ -1048,27 +1159,99 @@ define i128 @udiv_magic_preshift_and_postshift(i128 %x) nounwind { define i128 @urem_magic_preshift_and_postshift(i128 %x) nounwind { ; X86-64-LABEL: urem_magic_preshift_and_postshift: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax +; X86-64-NEXT: pushq %rbx +; X86-64-NEXT: movq %rdi, %r8 +; X86-64-NEXT: shrdq $1, %rsi, %r8 +; X86-64-NEXT: movabsq $-5030930201920786805, %r11 # imm = 0xBA2E8BA2E8BA2E8B +; X86-64-NEXT: movq %r8, %rax +; X86-64-NEXT: mulq %r11 +; X86-64-NEXT: movq %rax, %r9 +; X86-64-NEXT: movq %rdx, %rcx +; X86-64-NEXT: movabsq $-6707906935894382405, %rbx # imm = 0xA2E8BA2E8BA2E8BB +; X86-64-NEXT: movq %r8, %rax +; X86-64-NEXT: mulq %rbx +; X86-64-NEXT: movq %rdx, %r10 +; X86-64-NEXT: addq %r9, %r10 +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movq %rsi, %r9 +; X86-64-NEXT: shrq %r9 +; X86-64-NEXT: movq %r9, %rax +; X86-64-NEXT: mulq %r11 +; X86-64-NEXT: movq %rdx, %r8 +; X86-64-NEXT: movq %rax, %r11 +; X86-64-NEXT: movq %r9, %rax +; X86-64-NEXT: mulq %rbx +; X86-64-NEXT: addq %r10, %rax +; X86-64-NEXT: adcq %rdx, %rcx +; X86-64-NEXT: adcq $0, %r8 +; X86-64-NEXT: addq %r11, %rcx +; X86-64-NEXT: adcq $0, %r8 +; X86-64-NEXT: shrdq $3, %r8, %rcx +; X86-64-NEXT: shrq $3, %r8 +; X86-64-NEXT: leaq (%r8,%r8,4), %rax +; X86-64-NEXT: leaq (%r8,%rax,4), %r9 ; X86-64-NEXT: movl $22, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %rdx +; X86-64-NEXT: addq %r8, %rdx +; X86-64-NEXT: addq %r9, %rdx +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: subq %rdx, %rsi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: movq %rsi, %rdx +; X86-64-NEXT: popq %rbx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_magic_preshift_and_postshift: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $22, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: pushq %rdi +; WIN64-NEXT: pushq %rbx +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r10 +; WIN64-NEXT: shrdq $1, %rdx, %r10 +; WIN64-NEXT: movabsq $-5030930201920786805, %rdi # imm = 0xBA2E8BA2E8BA2E8B +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rax, %r11 +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movabsq $-6707906935894382405, %rbx # imm = 0xA2E8BA2E8BA2E8BB +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rbx +; WIN64-NEXT: movq %rdx, %rsi +; WIN64-NEXT: 
addq %r11, %rsi +; WIN64-NEXT: adcq $0, %r9 +; WIN64-NEXT: movq %r8, %r11 +; WIN64-NEXT: shrq %r11 +; WIN64-NEXT: movq %r11, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %r10 +; WIN64-NEXT: movq %rax, %rdi +; WIN64-NEXT: movq %r11, %rax +; WIN64-NEXT: mulq %rbx +; WIN64-NEXT: addq %rsi, %rax +; WIN64-NEXT: adcq %rdx, %r9 +; WIN64-NEXT: adcq $0, %r10 +; WIN64-NEXT: addq %rdi, %r9 +; WIN64-NEXT: adcq $0, %r10 +; WIN64-NEXT: shrdq $3, %r10, %r9 +; WIN64-NEXT: shrq $3, %r10 +; WIN64-NEXT: leaq (%r10,%r10,4), %rax +; WIN64-NEXT: leaq (%r10,%rax,4), %r11 +; WIN64-NEXT: movl $22, %edx +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: addq %r10, %rdx +; WIN64-NEXT: addq %r11, %rdx +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: subq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: movq %r8, %rdx +; WIN64-NEXT: popq %rbx +; WIN64-NEXT: popq %rdi +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq %ret = urem i128 %x, 22 ret i128 %ret @@ -1078,28 +1261,37 @@ define i128 @urem_magic_preshift_and_postshift(i128 %x) nounwind { define i128 @udiv_magic_large_preshift(i128 %x) nounwind { ; X86-64-LABEL: udiv_magic_large_preshift: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000 +; X86-64-NEXT: shrq $36, %rsi +; X86-64-NEXT: movabsq $1676976733973595601, %rcx # imm = 0x1745D1745D1745D1 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rax, %rcx +; X86-64-NEXT: movq %rdx, %rdi +; X86-64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rdx +; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: xorl %edx, %edx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_magic_large_preshift: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000 -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %rcx +; WIN64-NEXT: shrq $36, %rcx +; WIN64-NEXT: movabsq $1676976733973595601, %rdx # imm = 0x1745D1745D1745D1 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: movq %rax, %r8 +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: addq %r8, %rdx +; WIN64-NEXT: adcq $0, %r9 +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq %ret = udiv i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100 ret i128 %ret @@ -1109,28 +1301,45 @@ define i128 @udiv_magic_large_preshift(i128 %x) nounwind { define i128 @urem_magic_large_preshift(i128 %x) nounwind { ; X86-64-LABEL: urem_magic_large_preshift: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000 -; X86-64-NEXT: xorl %edx, %edx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rsi, %rcx +; X86-64-NEXT: shrq $36, %rcx +; X86-64-NEXT: movabsq 
$1676976733973595601, %rdx # imm = 0x1745D1745D1745D1 +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %rdx +; X86-64-NEXT: movq %rax, %r8 +; X86-64-NEXT: movq %rdx, %r9 +; X86-64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18 +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %rdx +; X86-64-NEXT: addq %r8, %rdx +; X86-64-NEXT: adcq $0, %r9 +; X86-64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000 +; X86-64-NEXT: imulq %r9, %rax +; X86-64-NEXT: subq %rax, %rsi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: movq %rsi, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_magic_large_preshift: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: shrq $36, %r9 +; WIN64-NEXT: movabsq $1676976733973595601, %rdx # imm = 0x1745D1745D1745D1 +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: movq %rax, %r10 +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18 +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: addq %r10, %rdx +; WIN64-NEXT: adcq $0, %r11 ; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000 -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: imulq %r11, %rax +; WIN64-NEXT: subq %rax, %r8 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: movq %r8, %rdx ; WIN64-NEXT: retq %ret = urem i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100 ret i128 %ret @@ -1140,27 +1349,39 @@ define i128 @urem_magic_large_preshift(i128 %x) nounwind { define i128 @udiv_magic_large_postshift(i128 %x) nounwind { ; X86-64-LABEL: udiv_magic_large_postshift: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $1, %edx -; X86-64-NEXT: movl $1, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq $-1, %r9 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: movq %rdx, %rcx +; X86-64-NEXT: movq %rax, %r8 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: addq %rsi, %rax +; X86-64-NEXT: adcq $0, %rdx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: addq %r8, %rdx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_magic_large_postshift: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq $-1, %r11 +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: mulq %r11 +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movq %rax, %r10 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r11 +; WIN64-NEXT: addq %r8, %rax +; WIN64-NEXT: adcq $0, %rdx +; WIN64-NEXT: adcq $0, %r9 +; WIN64-NEXT: addq %r10, %rdx +; 
WIN64-NEXT: adcq $0, %r9 +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq %ret = udiv i128 %x, 18446744073709551617 ; = 2^64 + 1 ret i128 %ret @@ -1170,27 +1391,45 @@ define i128 @udiv_magic_large_postshift(i128 %x) nounwind { define i128 @urem_magic_large_postshift(i128 %x) nounwind { ; X86-64-LABEL: urem_magic_large_postshift: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $1, %edx -; X86-64-NEXT: movl $1, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq $-1, %r9 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: movq %rdx, %rcx +; X86-64-NEXT: movq %rax, %r8 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: addq %rsi, %rax +; X86-64-NEXT: adcq $0, %rdx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: addq %r8, %rdx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: subq %rcx, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: subq %rcx, %rsi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: movq %rsi, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_magic_large_postshift: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq $-1, %r11 +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: mulq %r11 +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movq %rax, %r10 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r11 +; WIN64-NEXT: addq %r8, %rax +; WIN64-NEXT: adcq $0, %rdx +; WIN64-NEXT: adcq $0, %r9 +; WIN64-NEXT: addq %r10, %rdx +; WIN64-NEXT: adcq $0, %r9 +; WIN64-NEXT: subq %r9, %rcx +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: subq %r9, %r8 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: movq %r8, %rdx ; WIN64-NEXT: retq %ret = urem i128 %x, 18446744073709551617 ; = 2^64 + 1 ret i128 %ret @@ -1200,28 +1439,85 @@ define i128 @urem_magic_large_postshift(i128 %x) nounwind { define i128 @udiv_magic_is_add(i128 %x) nounwind { ; X86-64-LABEL: udiv_magic_is_add: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X86-64-NEXT: movl $1, %edx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq $-1, %r8 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: movq %rax, %rcx +; X86-64-NEXT: movq %rdx, %r9 +; X86-64-NEXT: movq $-3, %r11 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r11 +; X86-64-NEXT: movq %rdx, %r10 +; X86-64-NEXT: addq %rcx, %r10 +; X86-64-NEXT: adcq $0, %r9 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: movq %rdx, %rcx +; X86-64-NEXT: movq %rax, %r8 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r11 +; X86-64-NEXT: addq %r10, %rax +; X86-64-NEXT: adcq %r9, %rdx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: leaq (%rdx,%r8), %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: addq %r8, %rax +; X86-64-NEXT: sbbq %rcx, %rsi +; X86-64-NEXT: shrdq $1, %rsi, %rdi +; X86-64-NEXT: shrq %rsi +; X86-64-NEXT: addq %rax, %rdi +; X86-64-NEXT: adcq $0, %rsi +; X86-64-NEXT: addq %r8, %rdx +; 
X86-64-NEXT: adcq %rsi, %rcx +; X86-64-NEXT: shrq $63, %rcx +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_magic_is_add: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: pushq %rdi +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq $-1, %r10 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: movq %rax, %r9 +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq $-3, %rdi +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %rsi +; WIN64-NEXT: addq %r9, %rsi +; WIN64-NEXT: adcq $0, %r11 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movq %rax, %r10 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: addq %rsi, %rax +; WIN64-NEXT: adcq %r11, %rdx +; WIN64-NEXT: adcq $0, %r9 +; WIN64-NEXT: leaq (%rdx,%r10), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: addq %r10, %rax +; WIN64-NEXT: sbbq %r9, %r8 +; WIN64-NEXT: shrdq $1, %r8, %rcx +; WIN64-NEXT: shrq %r8 +; WIN64-NEXT: addq %rax, %rcx +; WIN64-NEXT: adcq $0, %r8 +; WIN64-NEXT: addq %r10, %rdx +; WIN64-NEXT: adcq %r9, %r8 +; WIN64-NEXT: shrq $63, %r8 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: xorl %edx, %edx +; WIN64-NEXT: popq %rdi +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq %ret = udiv i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1 ret i128 %ret @@ -1231,28 +1527,99 @@ define i128 @udiv_magic_is_add(i128 %x) nounwind { define i128 @urem_magic_is_add(i128 %x) nounwind { ; X86-64-LABEL: urem_magic_is_add: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movq $-1, %r8 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: movq %rax, %rcx +; X86-64-NEXT: movq %rdx, %r9 +; X86-64-NEXT: movq $-3, %r11 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r11 +; X86-64-NEXT: movq %rdx, %r10 +; X86-64-NEXT: addq %rcx, %r10 +; X86-64-NEXT: adcq $0, %r9 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: movq %rdx, %rcx +; X86-64-NEXT: movq %rax, %r8 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r11 +; X86-64-NEXT: addq %r10, %rax +; X86-64-NEXT: adcq %r9, %rdx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: leaq (%rdx,%r8), %rax +; X86-64-NEXT: movq %rdi, %r9 +; X86-64-NEXT: subq %rax, %r9 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: sbbq $0, %rax +; X86-64-NEXT: movq %rdx, %r10 +; X86-64-NEXT: addq %r8, %r10 +; X86-64-NEXT: sbbq %rcx, %rax +; X86-64-NEXT: shrdq $1, %rax, %r9 +; X86-64-NEXT: shrq %rax +; X86-64-NEXT: addq %r10, %r9 +; X86-64-NEXT: adcq $0, %rax +; X86-64-NEXT: addq %r8, %rdx +; X86-64-NEXT: adcq %rcx, %rax ; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X86-64-NEXT: movl $1, %edx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: andq %rax, %rcx +; X86-64-NEXT: shrq $63, %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: 
sbbq $0, %rsi +; X86-64-NEXT: subq %rcx, %rsi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: movq %rsi, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_magic_is_add: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: pushq %rdi +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq $-1, %r10 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: movq %rax, %r9 +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq $-3, %rdi +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %rsi +; WIN64-NEXT: addq %r9, %rsi +; WIN64-NEXT: adcq $0, %r11 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movq %rax, %r10 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: addq %rsi, %rax +; WIN64-NEXT: adcq %r11, %rdx +; WIN64-NEXT: adcq $0, %r9 +; WIN64-NEXT: leaq (%rdx,%r10), %rax +; WIN64-NEXT: movq %rcx, %r11 +; WIN64-NEXT: subq %rax, %r11 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: sbbq $0, %rax +; WIN64-NEXT: movq %rdx, %rsi +; WIN64-NEXT: addq %r10, %rsi +; WIN64-NEXT: sbbq %r9, %rax +; WIN64-NEXT: shrdq $1, %rax, %r11 +; WIN64-NEXT: shrq %rax +; WIN64-NEXT: addq %rsi, %r11 +; WIN64-NEXT: adcq $0, %rax +; WIN64-NEXT: addq %r10, %rdx +; WIN64-NEXT: adcq %r9, %rax +; WIN64-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000 +; WIN64-NEXT: andq %rax, %rdx +; WIN64-NEXT: shrq $63, %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: subq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: movq %r8, %rdx +; WIN64-NEXT: popq %rdi +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq %ret = urem i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1 ret i128 %ret diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index df97f49440f74..a4b06d6af19bf 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -152,40 +152,57 @@ declare i37 @llvm.fshl.i37(i37, i37, i37) define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind { ; X86-SSE2-LABEL: fshl_i37: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: andl $31, %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SSE2-NEXT: andl $31, %esi +; X86-SSE2-NEXT: movl $116080197, %edx # imm = 0x6EB3E45 +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: mull %edx +; X86-SSE2-NEXT: movl %eax, %ebx +; X86-SSE2-NEXT: movl %edx, %edi +; X86-SSE2-NEXT: movl $812561381, %edx # imm = 0x306EB3E5 +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: mull %edx +; X86-SSE2-NEXT: movl %edx, %ebp +; X86-SSE2-NEXT: addl %ebx, %ebp +; X86-SSE2-NEXT: adcl $0, %edi +; X86-SSE2-NEXT: movl %esi, %eax +; X86-SSE2-NEXT: movl $812561381, %edx # imm = 0x306EB3E5 +; X86-SSE2-NEXT: mull %edx 
+; X86-SSE2-NEXT: addl %ebp, %eax +; X86-SSE2-NEXT: adcl %edi, %edx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: imull $116080197, %esi, %esi # imm = 0x6EB3E45 +; X86-SSE2-NEXT: addl %edx, %esi +; X86-SSE2-NEXT: leal (%esi,%esi,8), %edx +; X86-SSE2-NEXT: leal (%esi,%edx,4), %edx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: shldl $27, %ebx, %edi -; X86-SSE2-NEXT: pushl $0 -; X86-SSE2-NEXT: pushl $37 -; X86-SSE2-NEXT: pushl %eax -; X86-SSE2-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: calll __umoddi3 -; X86-SSE2-NEXT: addl $16, %esp -; X86-SSE2-NEXT: movl %eax, %ecx +; X86-SSE2-NEXT: subl %edx, %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE2-NEXT: shldl $27, %edi, %esi ; X86-SSE2-NEXT: testb $32, %cl ; X86-SSE2-NEXT: jne .LBB3_1 ; X86-SSE2-NEXT: # %bb.2: -; X86-SSE2-NEXT: movl %edi, %ebx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE2-NEXT: movl %esi, %edi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE2-NEXT: movl %eax, %esi ; X86-SSE2-NEXT: jmp .LBB3_3 ; X86-SSE2-NEXT: .LBB3_1: -; X86-SSE2-NEXT: shll $27, %ebx +; X86-SSE2-NEXT: shll $27, %edi +; X86-SSE2-NEXT: movl %eax, %edx ; X86-SSE2-NEXT: .LBB3_3: -; X86-SSE2-NEXT: movl %edi, %eax -; X86-SSE2-NEXT: shldl %cl, %ebx, %eax +; X86-SSE2-NEXT: movl %esi, %eax +; X86-SSE2-NEXT: shldl %cl, %edi, %eax ; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SSE2-NEXT: shldl %cl, %edi, %esi -; X86-SSE2-NEXT: movl %esi, %edx +; X86-SSE2-NEXT: shldl %cl, %esi, %edx ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X64-AVX-LABEL: fshl_i37: @@ -318,41 +335,58 @@ declare i37 @llvm.fshr.i37(i37, i37, i37) define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind { ; X86-SSE2-LABEL: fshr_i37: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: andl $31, %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE2-NEXT: shldl $27, %ebx, %esi -; X86-SSE2-NEXT: pushl $0 -; X86-SSE2-NEXT: pushl $37 -; X86-SSE2-NEXT: pushl %eax -; X86-SSE2-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: calll __umoddi3 -; X86-SSE2-NEXT: addl $16, %esp -; X86-SSE2-NEXT: movl %eax, %ecx -; X86-SSE2-NEXT: addl $27, %ecx +; X86-SSE2-NEXT: andl $31, %esi +; X86-SSE2-NEXT: movl $116080197, %edx # imm = 0x6EB3E45 +; X86-SSE2-NEXT: movl %ebp, %eax +; X86-SSE2-NEXT: mull %edx +; X86-SSE2-NEXT: movl %eax, %ebx +; X86-SSE2-NEXT: movl %edx, %edi +; X86-SSE2-NEXT: movl $812561381, %ecx # imm = 0x306EB3E5 +; X86-SSE2-NEXT: movl %ebp, %eax +; X86-SSE2-NEXT: mull %ecx +; X86-SSE2-NEXT: movl %edx, %ebp +; X86-SSE2-NEXT: addl %ebx, %ebp +; X86-SSE2-NEXT: adcl $0, %edi +; X86-SSE2-NEXT: movl %esi, %eax +; X86-SSE2-NEXT: mull %ecx +; X86-SSE2-NEXT: movl %edx, %ebx +; X86-SSE2-NEXT: addl %ebp, %eax +; X86-SSE2-NEXT: adcl %edi, %ebx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: imull $116080197, %esi, %eax # imm = 0x6EB3E45 +; X86-SSE2-NEXT: addl %ebx, %eax +; X86-SSE2-NEXT: leal (%eax,%eax,8), %ecx +; X86-SSE2-NEXT: leal (%eax,%ecx,4), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE2-NEXT: negl %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: leal 27(%ecx,%eax), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), 
%eax +; X86-SSE2-NEXT: shldl $27, %edi, %eax ; X86-SSE2-NEXT: testb $32, %cl ; X86-SSE2-NEXT: je .LBB10_1 ; X86-SSE2-NEXT: # %bb.2: -; X86-SSE2-NEXT: movl %edi, %edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE2-NEXT: jmp .LBB10_3 ; X86-SSE2-NEXT: .LBB10_1: -; X86-SSE2-NEXT: shll $27, %ebx -; X86-SSE2-NEXT: movl %esi, %edx -; X86-SSE2-NEXT: movl %ebx, %esi +; X86-SSE2-NEXT: shll $27, %edi +; X86-SSE2-NEXT: movl %edx, %esi +; X86-SSE2-NEXT: movl %eax, %edx +; X86-SSE2-NEXT: movl %edi, %eax ; X86-SSE2-NEXT: .LBB10_3: -; X86-SSE2-NEXT: shrdl %cl, %edx, %esi +; X86-SSE2-NEXT: shrdl %cl, %edx, %eax ; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SSE2-NEXT: shrdl %cl, %edi, %edx -; X86-SSE2-NEXT: movl %esi, %eax +; X86-SSE2-NEXT: shrdl %cl, %esi, %edx ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X64-AVX-LABEL: fshr_i37: diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll index 9011832421326..a59861de08fdb 100644 --- a/llvm/test/CodeGen/X86/i128-udiv.ll +++ b/llvm/test/CodeGen/X86/i128-udiv.ll @@ -42,11 +42,19 @@ define i128 @test2(i128 %x) nounwind { ; ; X64-LABEL: test2: ; X64: # %bb.0: -; X64-NEXT: pushq %rax +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: movl $4, %ecx +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movl $17, %edx +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rdx +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: xorl %edx, %edx -; X64-NEXT: movq $-4, %rcx -; X64-NEXT: callq __udivti3@PLT -; X64-NEXT: popq %rcx ; X64-NEXT: retq %tmp = udiv i128 %x, -73786976294838206464 ret i128 %tmp @@ -59,11 +67,31 @@ define i128 @test3(i128 %x) nounwind { ; ; X64-LABEL: test3: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movq $-3, %rdx -; X64-NEXT: movq $-5, %rcx -; X64-NEXT: callq __udivti3@PLT -; X64-NEXT: popq %rcx +; X64-NEXT: movabsq $4611686018427387905, %r9 # imm = 0x4000000000000001 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movl $5, %r10d +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq $0, %r8 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: adcq %r8, %rdx +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: addq %r9, %rdx +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: shrq $62, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq %tmp = udiv i128 %x, -73786976294838206467 ret i128 %tmp