Skip to content

Commit c1a3960

Browse files
[X86] Add APX imulzu support. (llvm#116806)
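Add patterns to select the 16-bit imulzu instruction when the APX ZU feature is enabled (-mapx-features=zu), including folding a zero-extend of the result into the multiply. IsDesirableToPromoteOp is changed to leave a 16-bit multiply by a constant un-promoted when ZU is enabled and the multiply's only user is a zero-extend to i32 or i64, as imulzu will not cause partial-write stalls.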
1 parent 2ab84a6 commit c1a3960

File tree

4 files changed

+256
-1
lines changed

4 files changed

+256
-1
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59285,6 +59285,15 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
5928559285
return Ld->getBasePtr() == St->getBasePtr();
5928659286
};
5928759287

59288+
// Returns true when Op has exactly one use and that single user is an
// ISD::ZERO_EXTEND node whose result type is i32 or i64. Returns false for
// values with zero or multiple uses, and for any other kind of user.
auto IsFoldableZext = [](SDValue Op) {
59289+
if (!Op.hasOneUse())
59290+
return false;
59291+
// Op has exactly one use here, so the first use is the only user.
SDNode *User = *Op->use_begin();
59292+
EVT VT = User->getValueType(0);
59293+
return (User->getOpcode() == ISD::ZERO_EXTEND &&
59294+
(VT == MVT::i32 || VT == MVT::i64));
59295+
};
59296+
5928859297
bool Commute = false;
5928959298
switch (Op.getOpcode()) {
5929059299
default: return false;
@@ -59301,8 +59310,15 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
5930159310
return false;
5930259311
break;
5930359312
}
59304-
case ISD::ADD:
5930559313
case ISD::MUL:
59314+
// When ZU is enabled, we prefer to not promote for MUL by a constant
59315+
// when there is an opportunity to fold a zext with imulzu.
59316+
if (Subtarget.hasZU() && IsFoldableZext(Op) &&
59317+
(isa<ConstantSDNode>(Op.getOperand(0)) ||
59318+
isa<ConstantSDNode>(Op.getOperand(1))))
59319+
return false;
59320+
[[fallthrough]];
59321+
case ISD::ADD:
5930659322
case ISD::AND:
5930759323
case ISD::OR:
5930859324
case ISD::XOR:

llvm/lib/Target/X86/X86InstrCompiler.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2184,6 +2184,18 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
21842184
defm : EFLAGSDefiningPats<"", NoNDD>;
21852185
defm : EFLAGSDefiningPats<"_ND", HasNDD>;
21862186

2187+
// Selection patterns that apply only under the HasZU predicate.
// Each pattern matches a zero-extend (to i32 or i64) of a 16-bit multiply by
// an immediate and selects one IMULZU16 instruction: the rri form for a GR16
// register source, the rmi form for a loadi16 memory source. The 16-bit
// result is placed in the wider register with SUBREG_TO_REG on sub_16bit,
// so the pattern does not emit a separate zero-extend instruction.
let Predicates = [HasZU] in {
2188+
// zext (mul reg/mem, imm) -> imulzu
2189+
def : Pat<(i32 (zext (i16 (mul GR16:$src1, imm:$src2)))),
2190+
(SUBREG_TO_REG (i32 0), (IMULZU16rri GR16:$src1, imm:$src2), sub_16bit)>;
2191+
def : Pat<(i32 (zext (i16 (mul (loadi16 addr:$src1), imm:$src2)))),
2192+
(SUBREG_TO_REG (i32 0), (IMULZU16rmi addr:$src1, imm:$src2), sub_16bit)>;
2193+
def : Pat<(i64 (zext (i16 (mul GR16:$src1, imm:$src2)))),
2194+
(SUBREG_TO_REG (i64 0), (IMULZU16rri GR16:$src1, imm:$src2), sub_16bit)>;
2195+
def : Pat<(i64 (zext (i16 (mul (loadi16 addr:$src1), imm:$src2)))),
2196+
(SUBREG_TO_REG (i64 0), (IMULZU16rmi addr:$src1, imm:$src2), sub_16bit)>;
2197+
}
2198+
21872199
// mul reg, imm
21882200
def : Pat<(mul GR16:$src1, imm:$src2),
21892201
(IMUL16rri GR16:$src1, imm:$src2)>;

llvm/lib/Target/X86/X86InstrPredicates.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def NoEGPR : Predicate<"!Subtarget->hasEGPR()">;
4545
// entries, so that the NDD variant can be selected first to benefit RA.
4646
def HasNDD : Predicate<"Subtarget->hasNDD()">;
4747
def NoNDD : Predicate<"!Subtarget->hasNDD()">;
48+
def HasZU : Predicate<"Subtarget->hasZU()">;
4849
def HasCF : Predicate<"Subtarget->hasCF()">;
4950
def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
5051
def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+zu | FileCheck %s --check-prefixes=CHECK,ZU
3+
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s --check-prefixes=CHECK,NOZU
4+
5+
; Test generation of 16b imulzu when -mattr=+zu is specified.
6+
; The mulzu_* tests check for basic generation, which is limited to cases where a
7+
; zero-extend of the result can be folded into imulzu.
8+
; The remaining tests are modifications of selected test/CodeGen/X86/imul.ll tests with
9+
; 16b multiplies, to check that common strength reductions in ISel are still performed
10+
; when -mattr=+zu is in effect.
11+
;
12+
; FIXME: several cases from imul.ll covering DAG combines, in particular those using LEA,
13+
; are not ported as X86's IsDesirableToPromoteOp has no way to accurately identify when
14+
; promotion will permit a better sequence than an unpromoted imulzu.
15+
; These cases should be added once IsDesirableToPromoteOp is able to make that determination.
16+
17+
define i32 @mulzu_16_32(i16 %A) {
18+
; ZU-LABEL: mulzu_16_32:
19+
; ZU: # %bb.0:
20+
; ZU-NEXT: imulzuw $1234, %di, %ax # imm = 0x4D2
21+
; ZU-NEXT: retq
22+
;
23+
; NOZU-LABEL: mulzu_16_32:
24+
; NOZU: # %bb.0:
25+
; NOZU-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
26+
; NOZU-NEXT: movzwl %ax, %eax
27+
; NOZU-NEXT: retq
28+
%mul = mul i16 %A, 1234
29+
%r = zext i16 %mul to i32
30+
ret i32 %r
31+
}
32+
33+
define i64 @mulzu_16_64(i16 %A) {
34+
; ZU-LABEL: mulzu_16_64:
35+
; ZU: # %bb.0:
36+
; ZU-NEXT: imulzuw $1234, %di, %ax # imm = 0x4D2
37+
; ZU-NEXT: retq
38+
;
39+
; NOZU-LABEL: mulzu_16_64:
40+
; NOZU: # %bb.0:
41+
; NOZU-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
42+
; NOZU-NEXT: movzwl %ax, %eax
43+
; NOZU-NEXT: retq
44+
%mul = mul i16 %A, 1234
45+
%r = zext i16 %mul to i64
46+
ret i64 %r
47+
}
48+
49+
define i32 @mulzu_16_32_mem(ptr %P) {
50+
; ZU-LABEL: mulzu_16_32_mem:
51+
; ZU: # %bb.0:
52+
; ZU-NEXT: imulzuw $1234, (%rdi), %ax # imm = 0x4D2
53+
; ZU-NEXT: retq
54+
;
55+
; NOZU-LABEL: mulzu_16_32_mem:
56+
; NOZU: # %bb.0:
57+
; NOZU-NEXT: movzwl (%rdi), %eax
58+
; NOZU-NEXT: imull $1234, %eax, %eax # imm = 0x4D2
59+
; NOZU-NEXT: movzwl %ax, %eax
60+
; NOZU-NEXT: retq
61+
%gep = getelementptr i16, ptr %P, i64 0
62+
%A = load i16, ptr %gep
63+
%mul = mul i16 %A, 1234
64+
%r = zext i16 %mul to i32
65+
ret i32 %r
66+
}
67+
68+
define i64 @mulzu_16_64_mem(ptr %P) {
69+
; ZU-LABEL: mulzu_16_64_mem:
70+
; ZU: # %bb.0:
71+
; ZU-NEXT: imulzuw $1234, (%rdi), %ax # imm = 0x4D2
72+
; ZU-NEXT: retq
73+
;
74+
; NOZU-LABEL: mulzu_16_64_mem:
75+
; NOZU: # %bb.0:
76+
; NOZU-NEXT: movzwl (%rdi), %eax
77+
; NOZU-NEXT: imull $1234, %eax, %eax # imm = 0x4D2
78+
; NOZU-NEXT: movzwl %ax, %eax
79+
; NOZU-NEXT: retq
80+
%gep = getelementptr i16, ptr %P, i64 0
81+
%A = load i16, ptr %gep
82+
%mul = mul i16 %A, 1234
83+
%r = zext i16 %mul to i64
84+
ret i64 %r
85+
}
86+
87+
; The following mulzu cases check that imulzu is not
88+
; generated in the absence of a single zext user. The ZU/NOZU
89+
; cases should match.
90+
91+
define void @mulzu_16_store(i16 %A, ptr %R) {
92+
; CHECK-LABEL: mulzu_16_store:
93+
; CHECK: # %bb.0:
94+
; CHECK-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
95+
; CHECK-NEXT: movw %ax, (%rsi)
96+
; CHECK-NEXT: retq
97+
%gep = getelementptr i16, ptr %R, i64 0
98+
%mul = mul i16 %A, 1234
99+
store i16 %mul, ptr %gep
100+
ret void
101+
}
102+
103+
define i32 @mulzu_16_store_32(i16 %A, ptr %R) {
104+
; CHECK-LABEL: mulzu_16_store_32:
105+
; CHECK: # %bb.0:
106+
; CHECK-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
107+
; CHECK-NEXT: movw %ax, (%rsi)
108+
; CHECK-NEXT: movzwl %ax, %eax
109+
; CHECK-NEXT: retq
110+
%gep = getelementptr i16, ptr %R, i64 0
111+
%mul = mul i16 %A, 1234
112+
store i16 %mul, ptr %gep
113+
%r = zext i16 %mul to i32
114+
ret i32 %r
115+
}
116+
117+
define i64 @mulzu_16_store_64(i16 %A, ptr %R) {
118+
; CHECK-LABEL: mulzu_16_store_64:
119+
; CHECK: # %bb.0:
120+
; CHECK-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
121+
; CHECK-NEXT: movw %ax, (%rsi)
122+
; CHECK-NEXT: movzwl %ax, %eax
123+
; CHECK-NEXT: retq
124+
%gep = getelementptr i16, ptr %R, i64 0
125+
%mul = mul i16 %A, 1234
126+
store i16 %mul, ptr %gep
127+
%r = zext i16 %mul to i64
128+
ret i64 %r
129+
}
130+
131+
define i32 @mulzu_sext_16_32(i16 %A) {
132+
; CHECK-LABEL: mulzu_sext_16_32:
133+
; CHECK: # %bb.0:
134+
; CHECK-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
135+
; CHECK-NEXT: cwtl
136+
; CHECK-NEXT: retq
137+
%mul = mul i16 %A, 1234
138+
%r = sext i16 %mul to i32
139+
ret i32 %r
140+
}
141+
142+
define i64 @mulzu_sext_16_64(i16 %A) {
143+
; CHECK-LABEL: mulzu_sext_16_64:
144+
; CHECK: # %bb.0:
145+
; CHECK-NEXT: imull $1234, %edi, %eax # imm = 0x4D2
146+
; CHECK-NEXT: movswq %ax, %rax
147+
; CHECK-NEXT: retq
148+
%mul = mul i16 %A, 1234
149+
%r = sext i16 %mul to i64
150+
ret i64 %r
151+
}
152+
153+
; Tests ported from test/CodeGen/X86/imul.ll follow from this point.
154+
; The generated code, which strength-reduces multiplies by certain
155+
; constants, should be unaffected by enabling zu.
156+
157+
define i16 @mul4_16(i16 %A) {
158+
;
159+
; CHECK-LABEL: mul4_16:
160+
; CHECK: # %bb.0:
161+
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
162+
; CHECK-NEXT: leal (,%rdi,4), %eax
163+
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
164+
; CHECK-NEXT: retq
165+
%mul = mul i16 %A, 4
166+
ret i16 %mul
167+
}
168+
169+
define i16 @mul4096_16(i16 %A) {
170+
;
171+
; CHECK-LABEL: mul4096_16:
172+
; CHECK: # %bb.0:
173+
; CHECK-NEXT: movl %edi, %eax
174+
; CHECK-NEXT: shll $12, %eax
175+
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
176+
; CHECK-NEXT: retq
177+
%mul = mul i16 %A, 4096
178+
ret i16 %mul
179+
}
180+
181+
define i16 @mulmin4096_16(i16 %A) {
182+
;
183+
; CHECK-LABEL: mulmin4096_16:
184+
; CHECK: # %bb.0:
185+
; CHECK-NEXT: movl %edi, %eax
186+
; CHECK-NEXT: shll $12, %eax
187+
; CHECK-NEXT: negl %eax
188+
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
189+
; CHECK-NEXT: retq
190+
%mul = mul i16 %A, -4096
191+
ret i16 %mul
192+
}
193+
194+
define i16 @mul4_16_minsize(i16 %A) minsize {
195+
;
196+
; CHECK-LABEL: mul4_16_minsize:
197+
; CHECK: # %bb.0:
198+
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
199+
; CHECK-NEXT: leal (,%rdi,4), %eax
200+
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
201+
; CHECK-NEXT: retq
202+
%mul = mul i16 %A, 4
203+
ret i16 %mul
204+
}
205+
206+
define i16 @mul0_16(i16 %A) {
207+
;
208+
; CHECK-LABEL: mul0_16:
209+
; CHECK: # %bb.0:
210+
; CHECK-NEXT: xorl %eax, %eax
211+
; CHECK-NEXT: retq
212+
%mul = mul i16 %A, 0
213+
ret i16 %mul
214+
}
215+
216+
define i16 @mul4294967295_16(i16 %A) {
217+
;
218+
; CHECK-LABEL: mul4294967295_16:
219+
; CHECK: # %bb.0:
220+
; CHECK-NEXT: movl %edi, %eax
221+
; CHECK-NEXT: negl %eax
222+
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
223+
; CHECK-NEXT: retq
224+
%mul = mul i16 %A, 4294967295
225+
ret i16 %mul
226+
}

0 commit comments

Comments
 (0)