Commit 14c57ed

[X86] Allow znver3/4/5 targets to use double-shift instructions by default
While still not as fast as on Intel targets, the double-shift instructions on recent AMD CPUs (znver3 and later) are no longer as heavily microcoded and bottlenecked as on earlier AMD targets (now only ~2cy reciprocal throughput), so they improve on the 3*shift+not+or sequence we otherwise expand as an alternative. Noticed while triaging #132601
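
For context, the pattern affected is a variable-amount double shift. A minimal, hypothetical IR sketch (the function and value names are illustrative, not taken from the test file) of the kind of input that hits this lowering:

; Hypothetical example: left double shift of the %a:%b pair by a variable amount %c.
define i64 @double_shift_left(i64 %a, i64 %b, i32 %c) nounwind readnone {
entry:
  %amt = zext i32 %c to i64
  %hi = shl i64 %a, %amt            ; high half shifted left by c
  %inv = sub i32 64, %c
  %inv.zext = zext i32 %inv to i64
  %lo = lshr i64 %b, %inv.zext      ; low half contributes its top (64 - c) bits
  %res = or i64 %lo, %hi
  ret i64 %res
}

With llc -mcpu=znver2 this still expands to the shlxq/notb/shrq/shrxq/orq sequence, while with -mcpu=znver3 (TuningSlowSHLD now removed) it can be matched to a single shldq, as the BMI2-SLOW/BMI2-FAST checks in the test diff below show.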
1 parent d536d13 commit 14c57ed

2 files changed: +41 −25 lines

llvm/lib/Target/X86/X86.td

Lines changed: 1 addition & 1 deletion
@@ -1570,7 +1570,7 @@ def ProcessorFeatures {
                                      FeatureVPCLMULQDQ];
   list<SubtargetFeature> ZN3AdditionalTuning = [TuningMacroFusion];
   list<SubtargetFeature> ZN3Tuning =
-      !listconcat(ZN2Tuning, ZN3AdditionalTuning);
+      !listremove(!listconcat(ZN2Tuning, ZN3AdditionalTuning), [TuningSlowSHLD]);
   list<SubtargetFeature> ZN3Features =
       !listconcat(ZN2Features, ZN3AdditionalFeatures);

llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll

Lines changed: 40 additions & 24 deletions
@@ -12,12 +12,12 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s --check-prefixes=BMI
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=BMI
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=BMI
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=BMI2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=BMI2-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=BMI2-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=BMI2-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=BMI2-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=BMI2-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=BMI2-FAST
 
 ; Verify that for the X86_64 processors that are known to have poor latency
 ; double precision shift instructions we do not generate 'shld' or 'shrd'
@@ -53,15 +53,23 @@ define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
 ; BMI-NEXT: orq %rdi, %rax
 ; BMI-NEXT: retq
 ;
-; BMI2-LABEL: lshift:
-; BMI2: # %bb.0: # %entry
-; BMI2-NEXT: # kill: def $edx killed $edx def $rdx
-; BMI2-NEXT: shlxq %rdx, %rdi, %rcx
-; BMI2-NEXT: notb %dl
-; BMI2-NEXT: shrq %rsi
-; BMI2-NEXT: shrxq %rdx, %rsi, %rax
-; BMI2-NEXT: orq %rcx, %rax
-; BMI2-NEXT: retq
+; BMI2-SLOW-LABEL: lshift:
+; BMI2-SLOW: # %bb.0: # %entry
+; BMI2-SLOW-NEXT: # kill: def $edx killed $edx def $rdx
+; BMI2-SLOW-NEXT: shlxq %rdx, %rdi, %rcx
+; BMI2-SLOW-NEXT: notb %dl
+; BMI2-SLOW-NEXT: shrq %rsi
+; BMI2-SLOW-NEXT: shrxq %rdx, %rsi, %rax
+; BMI2-SLOW-NEXT: orq %rcx, %rax
+; BMI2-SLOW-NEXT: retq
+;
+; BMI2-FAST-LABEL: lshift:
+; BMI2-FAST: # %bb.0: # %entry
+; BMI2-FAST-NEXT: movl %edx, %ecx
+; BMI2-FAST-NEXT: movq %rdi, %rax
+; BMI2-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
+; BMI2-FAST-NEXT: shldq %cl, %rsi, %rax
+; BMI2-FAST-NEXT: retq
 entry:
   %sh_prom = zext i32 %c to i64
   %shl = shl i64 %a, %sh_prom
@@ -100,15 +108,23 @@ define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
 ; BMI-NEXT: orq %rdi, %rax
 ; BMI-NEXT: retq
 ;
-; BMI2-LABEL: rshift:
-; BMI2: # %bb.0: # %entry
-; BMI2-NEXT: # kill: def $edx killed $edx def $rdx
-; BMI2-NEXT: shrxq %rdx, %rdi, %rcx
-; BMI2-NEXT: notb %dl
-; BMI2-NEXT: addq %rsi, %rsi
-; BMI2-NEXT: shlxq %rdx, %rsi, %rax
-; BMI2-NEXT: orq %rcx, %rax
-; BMI2-NEXT: retq
+; BMI2-SLOW-LABEL: rshift:
+; BMI2-SLOW: # %bb.0: # %entry
+; BMI2-SLOW-NEXT: # kill: def $edx killed $edx def $rdx
+; BMI2-SLOW-NEXT: shrxq %rdx, %rdi, %rcx
+; BMI2-SLOW-NEXT: notb %dl
+; BMI2-SLOW-NEXT: addq %rsi, %rsi
+; BMI2-SLOW-NEXT: shlxq %rdx, %rsi, %rax
+; BMI2-SLOW-NEXT: orq %rcx, %rax
+; BMI2-SLOW-NEXT: retq
+;
+; BMI2-FAST-LABEL: rshift:
+; BMI2-FAST: # %bb.0: # %entry
+; BMI2-FAST-NEXT: movl %edx, %ecx
+; BMI2-FAST-NEXT: movq %rdi, %rax
+; BMI2-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
+; BMI2-FAST-NEXT: shrdq %cl, %rsi, %rax
+; BMI2-FAST-NEXT: retq
 entry:
   %sh_prom = zext i32 %c to i64
   %shr = lshr i64 %a, %sh_prom
