diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index bcda188d4c2cb..772e48efb8607 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1574,7 +1574,7 @@ def ProcessorFeatures {
                                                   FeatureVPCLMULQDQ];
   list<SubtargetFeature> ZN3AdditionalTuning = [TuningMacroFusion];
   list<SubtargetFeature> ZN3Tuning =
-    !listconcat(ZN2Tuning, ZN3AdditionalTuning);
+    !listremove(!listconcat(ZN2Tuning, ZN3AdditionalTuning), [TuningSlowSHLD]);
   list<SubtargetFeature> ZN3Features =
     !listconcat(ZN2Features, ZN3AdditionalFeatures);

diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index c7da04171e6a1..756019d0e98a0 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -48,46 +48,20 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) {
 ;
 ; ZNVER4-LABEL: shl_i512_1:
 ; ZNVER4:         # %bb.0:
-; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; ZNVER4-NEXT:    vmovq %xmm0, %rdx
-; ZNVER4-NEXT:    vpextrq $1, %xmm0, %r9
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %rax
-; ZNVER4-NEXT:    vmovq %xmm1, %rcx
 ; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; ZNVER4-NEXT:    shrq $63, %rdx
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %rsi
-; ZNVER4-NEXT:    vmovq %xmm1, %rdi
-; ZNVER4-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT:    leaq (%rdx,%r9,2), %rdx
-; ZNVER4-NEXT:    shrq $63, %r9
-; ZNVER4-NEXT:    vpsllq $1, %xmm0, %xmm0
-; ZNVER4-NEXT:    vmovq %xmm1, %r10
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %r8
-; ZNVER4-NEXT:    leaq (%r9,%r10,2), %r9
-; ZNVER4-NEXT:    shrq $63, %r10
-; ZNVER4-NEXT:    vmovq %rdx, %xmm4
-; ZNVER4-NEXT:    leaq (%r10,%r8,2), %r10
-; ZNVER4-NEXT:    shrq $63, %r8
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; ZNVER4-NEXT:    leaq (%r8,%rdi,2), %r8
-; ZNVER4-NEXT:    shrq $63, %rdi
-; ZNVER4-NEXT:    leaq (%rdi,%rsi,2), %rdi
-; ZNVER4-NEXT:    shrq $63, %rsi
-; ZNVER4-NEXT:    leaq (%rsi,%rcx,2), %rsi
-; ZNVER4-NEXT:    shrq $63, %rcx
-; ZNVER4-NEXT:    vmovq %r8, %xmm3
-; ZNVER4-NEXT:    leaq (%rcx,%rax,2), %rax
-; ZNVER4-NEXT:    vmovq %rsi, %xmm2
-; ZNVER4-NEXT:    vmovq %rax, %xmm1
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ZNVER4-NEXT:    vmovq %rdi, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; ZNVER4-NEXT:    vmovq %r10, %xmm3
+; ZNVER4-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; ZNVER4-NEXT:    vpsllq $1, %xmm0, %xmm4
 ; ZNVER4-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ZNVER4-NEXT:    vmovq %r9, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ZNVER4-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ZNVER4-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ZNVER4-NEXT:    vpshldq $1, %xmm3, %xmm2, %xmm3
+; ZNVER4-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ZNVER4-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; ZNVER4-NEXT:    vpshldq $1, %ymm1, %ymm2, %ymm1
+; ZNVER4-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; ZNVER4-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; ZNVER4-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; ZNVER4-NEXT:    vpshldq $1, %zmm0, %zmm3, %zmm0
+; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
 ; ZNVER4-NEXT:    retq
   %d = bitcast <8 x i64> %a to i512
   %s = shl i512 %d, 1
@@ -142,65 +116,21 @@ define <8 x i64> @lshr_i512_1(<8 x i64> %a) {
 ;
 ; ZNVER4-LABEL: lshr_i512_1:
 ; ZNVER4:         # %bb.0:
-; ZNVER4-NEXT:    pushq %rbx
-; ZNVER4-NEXT:    .cfi_def_cfa_offset 16
-; ZNVER4-NEXT:    .cfi_offset %rbx, -16
+; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; ZNVER4-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT:    vmovq %xmm0, %r10
-; ZNVER4-NEXT:    vpextrq $1, %xmm0, %rsi
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %rcx
-; ZNVER4-NEXT:    vmovq %xmm1, %r9
-; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; ZNVER4-NEXT:    shrq %r10
-; ZNVER4-NEXT:    vpextrq $1, %xmm0, %rax
-; ZNVER4-NEXT:    vmovq %xmm0, %rdx
-; ZNVER4-NEXT:    vmovq %xmm1, %rdi
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %r11
-; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; ZNVER4-NEXT:    movq %rdx, %r8
-; ZNVER4-NEXT:    shrq %r8
-; ZNVER4-NEXT:    shlq $63, %rax
-; ZNVER4-NEXT:    movq %rdi, %rbx
-; ZNVER4-NEXT:    shrq %rbx
-; ZNVER4-NEXT:    shlq $63, %rdx
-; ZNVER4-NEXT:    shlq $63, %rdi
-; ZNVER4-NEXT:    vpsrlq $1, %xmm0, %xmm0
-; ZNVER4-NEXT:    orq %r8, %rax
-; ZNVER4-NEXT:    movq %r11, %r8
-; ZNVER4-NEXT:    shlq $63, %r8
-; ZNVER4-NEXT:    shrq %r11
-; ZNVER4-NEXT:    orq %rbx, %r8
-; ZNVER4-NEXT:    movq %r9, %rbx
-; ZNVER4-NEXT:    orq %r11, %rdx
-; ZNVER4-NEXT:    movq %rsi, %r11
-; ZNVER4-NEXT:    shrq %r11
-; ZNVER4-NEXT:    shlq $63, %rbx
-; ZNVER4-NEXT:    shrq %r9
-; ZNVER4-NEXT:    shlq $63, %rsi
-; ZNVER4-NEXT:    vmovq %rax, %xmm4
-; ZNVER4-NEXT:    orq %r11, %rbx
-; ZNVER4-NEXT:    movq %rcx, %r11
-; ZNVER4-NEXT:    shlq $63, %r11
-; ZNVER4-NEXT:    shrq %rcx
-; ZNVER4-NEXT:    orq %r10, %rsi
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; ZNVER4-NEXT:    orq %r9, %r11
-; ZNVER4-NEXT:    orq %rdi, %rcx
-; ZNVER4-NEXT:    vmovq %rbx, %xmm3
-; ZNVER4-NEXT:    vmovq %rcx, %xmm1
-; ZNVER4-NEXT:    vmovq %r11, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ZNVER4-NEXT:    vmovq %rsi, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ZNVER4-NEXT:    vmovq %r8, %xmm3
-; ZNVER4-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ZNVER4-NEXT:    vmovq %rdx, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; ZNVER4-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; ZNVER4-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; ZNVER4-NEXT:    popq %rbx
-; ZNVER4-NEXT:    .cfi_def_cfa_offset 8
+; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; ZNVER4-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; ZNVER4-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; ZNVER4-NEXT:    vpshldq $63, %xmm4, %xmm2, %xmm4
+; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; ZNVER4-NEXT:    vpshldq $63, %ymm3, %ymm1, %ymm1
+; ZNVER4-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; ZNVER4-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; ZNVER4-NEXT:    vpshldq $63, %zmm0, %zmm3, %zmm0
+; ZNVER4-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
+; ZNVER4-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; ZNVER4-NEXT:    retq
   %d = bitcast <8 x i64> %a to i512
   %s = lshr i512 %d, 1
@@ -255,65 +185,21 @@ define <8 x i64> @ashr_i512_1(<8 x i64> %a) {
 ;
 ; ZNVER4-LABEL: ashr_i512_1:
 ; ZNVER4:         # %bb.0:
-; ZNVER4-NEXT:    pushq %rbx
-; ZNVER4-NEXT:    .cfi_def_cfa_offset 16
-; ZNVER4-NEXT:    .cfi_offset %rbx, -16
+; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; ZNVER4-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT:    vmovq %xmm0, %r10
-; ZNVER4-NEXT:    vpextrq $1, %xmm0, %rsi
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %rcx
-; ZNVER4-NEXT:    vmovq %xmm1, %r9
-; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; ZNVER4-NEXT:    shrq %r10
-; ZNVER4-NEXT:    vpextrq $1, %xmm0, %rax
-; ZNVER4-NEXT:    vmovq %xmm0, %rdx
-; ZNVER4-NEXT:    vmovq %xmm1, %rdi
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %r11
-; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; ZNVER4-NEXT:    movq %rdx, %r8
-; ZNVER4-NEXT:    shrq %r8
-; ZNVER4-NEXT:    shlq $63, %rax
-; ZNVER4-NEXT:    movq %rdi, %rbx
-; ZNVER4-NEXT:    shrq %rbx
-; ZNVER4-NEXT:    shlq $63, %rdx
-; ZNVER4-NEXT:    shlq $63, %rdi
-; ZNVER4-NEXT:    vpsraq $1, %xmm0, %xmm0
-; ZNVER4-NEXT:    orq %r8, %rax
-; ZNVER4-NEXT:    movq %r11, %r8
-; ZNVER4-NEXT:    shlq $63, %r8
-; ZNVER4-NEXT:    shrq %r11
-; ZNVER4-NEXT:    orq %rbx, %r8
-; ZNVER4-NEXT:    movq %r9, %rbx
-; ZNVER4-NEXT:    orq %r11, %rdx
-; ZNVER4-NEXT:    movq %rsi, %r11
-; ZNVER4-NEXT:    shrq %r11
-; ZNVER4-NEXT:    shlq $63, %rbx
-; ZNVER4-NEXT:    shrq %r9
-; ZNVER4-NEXT:    shlq $63, %rsi
-; ZNVER4-NEXT:    vmovq %rax, %xmm4
-; ZNVER4-NEXT:    orq %r11, %rbx
-; ZNVER4-NEXT:    movq %rcx, %r11
-; ZNVER4-NEXT:    shlq $63, %r11
-; ZNVER4-NEXT:    shrq %rcx
-; ZNVER4-NEXT:    orq %r10, %rsi
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; ZNVER4-NEXT:    orq %r9, %r11
-; ZNVER4-NEXT:    orq %rdi, %rcx
-; ZNVER4-NEXT:    vmovq %rbx, %xmm3
-; ZNVER4-NEXT:    vmovq %rcx, %xmm1
-; ZNVER4-NEXT:    vmovq %r11, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ZNVER4-NEXT:    vmovq %rsi, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ZNVER4-NEXT:    vmovq %r8, %xmm3
-; ZNVER4-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ZNVER4-NEXT:    vmovq %rdx, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; ZNVER4-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; ZNVER4-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; ZNVER4-NEXT:    popq %rbx
-; ZNVER4-NEXT:    .cfi_def_cfa_offset 8
+; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; ZNVER4-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; ZNVER4-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; ZNVER4-NEXT:    vpshldq $63, %xmm4, %xmm2, %xmm4
+; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; ZNVER4-NEXT:    vpshldq $63, %ymm3, %ymm1, %ymm1
+; ZNVER4-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; ZNVER4-NEXT:    vpsraq $1, %xmm2, %xmm2
+; ZNVER4-NEXT:    vpshldq $63, %zmm0, %zmm3, %zmm0
+; ZNVER4-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
+; ZNVER4-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; ZNVER4-NEXT:    retq
   %d = bitcast <8 x i64> %a to i512
   %s = ashr i512 %d, 1
diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll
index 58f6a66aeff79..c5e879c0135f4 100644
--- a/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll
+++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll
@@ -12,12 +12,12 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s --check-prefixes=BMI
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=BMI
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=BMI
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=BMI2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=BMI2-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=BMI2-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=BMI2-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=BMI2-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=BMI2-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=BMI2-FAST

 ; Verify that for the X86_64 processors that are known to have poor latency
 ; double precision shift instructions we do not generate 'shld' or 'shrd'
@@ -53,15 +53,23 @@ define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
 ; BMI-NEXT:    orq %rdi, %rax
 ; BMI-NEXT:    retq
 ;
-; BMI2-LABEL: lshift:
-; BMI2:         # %bb.0: # %entry
-; BMI2-NEXT:    # kill: def $edx killed $edx def $rdx
-; BMI2-NEXT:    shlxq %rdx, %rdi, %rcx
-; BMI2-NEXT:    notb %dl
-; BMI2-NEXT:    shrq %rsi
-; BMI2-NEXT:    shrxq %rdx, %rsi, %rax
-; BMI2-NEXT:    orq %rcx, %rax
-; BMI2-NEXT:    retq
+; BMI2-SLOW-LABEL: lshift:
+; BMI2-SLOW:         # %bb.0: # %entry
+; BMI2-SLOW-NEXT:    # kill: def $edx killed $edx def $rdx
+; BMI2-SLOW-NEXT:    shlxq %rdx, %rdi, %rcx
+; BMI2-SLOW-NEXT:    notb %dl
+; BMI2-SLOW-NEXT:    shrq %rsi
+; BMI2-SLOW-NEXT:    shrxq %rdx, %rsi, %rax
+; BMI2-SLOW-NEXT:    orq %rcx, %rax
+; BMI2-SLOW-NEXT:    retq
+;
+; BMI2-FAST-LABEL: lshift:
+; BMI2-FAST:         # %bb.0: # %entry
+; BMI2-FAST-NEXT:    movl %edx, %ecx
+; BMI2-FAST-NEXT:    movq %rdi, %rax
+; BMI2-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
+; BMI2-FAST-NEXT:    shldq %cl, %rsi, %rax
+; BMI2-FAST-NEXT:    retq
 entry:
   %sh_prom = zext i32 %c to i64
   %shl = shl i64 %a, %sh_prom
@@ -100,15 +108,23 @@ define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
 ; BMI-NEXT:    orq %rdi, %rax
 ; BMI-NEXT:    retq
 ;
-; BMI2-LABEL: rshift:
-; BMI2:         # %bb.0: # %entry
-; BMI2-NEXT:    # kill: def $edx killed $edx def $rdx
-; BMI2-NEXT:    shrxq %rdx, %rdi, %rcx
-; BMI2-NEXT:    notb %dl
-; BMI2-NEXT:    addq %rsi, %rsi
-; BMI2-NEXT:    shlxq %rdx, %rsi, %rax
-; BMI2-NEXT:    orq %rcx, %rax
-; BMI2-NEXT:    retq
+; BMI2-SLOW-LABEL: rshift:
+; BMI2-SLOW:         # %bb.0: # %entry
+; BMI2-SLOW-NEXT:    # kill: def $edx killed $edx def $rdx
+; BMI2-SLOW-NEXT:    shrxq %rdx, %rdi, %rcx
+; BMI2-SLOW-NEXT:    notb %dl
+; BMI2-SLOW-NEXT:    addq %rsi, %rsi
+; BMI2-SLOW-NEXT:    shlxq %rdx, %rsi, %rax
+; BMI2-SLOW-NEXT:    orq %rcx, %rax
+; BMI2-SLOW-NEXT:    retq
+;
+; BMI2-FAST-LABEL: rshift:
+; BMI2-FAST:         # %bb.0: # %entry
+; BMI2-FAST-NEXT:    movl %edx, %ecx
+; BMI2-FAST-NEXT:    movq %rdi, %rax
+; BMI2-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
+; BMI2-FAST-NEXT:    shrdq %cl, %rsi, %rax
+; BMI2-FAST-NEXT:    retq
 entry:
   %sh_prom = zext i32 %c to i64
   %shr = lshr i64 %a, %sh_prom