Skip to content

Commit ee6edcf

Browse files
[X86] Support vectorized llvm.minimum/llvm.maximum.vXf16 lowering
Support lowering of vectorized FMINIMUM and FMAXIMUM to vminph and vmaxph for types v8f16 and v16f16 when both the AVX512FP16 and AVX512VL features are enabled, and for type v32f16 when the AVX512FP16 feature is enabled.
1 parent dcb7f44 commit ee6edcf

File tree

2 files changed

+66
-25
lines changed

2 files changed

+66
-25
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2333,6 +2333,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
23332333

23342334
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
23352335
setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2336+
2337+
setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2338+
setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
23362339
}
23372340

23382341
if (Subtarget.hasVLX()) {
@@ -2377,6 +2380,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
23772380
// Need to custom widen these to prevent scalarization.
23782381
setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
23792382
setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2383+
2384+
setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2385+
setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2386+
2387+
setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2388+
setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
23802389
}
23812390
}
23822391

llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll

Lines changed: 57 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ declare half @llvm.minimum.f16(half, half)
55
declare half @llvm.maximum.f16(half, half)
66
declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
77
declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
8+
declare <16 x half> @llvm.minimum.v16f16(<16 x half>, <16 x half>)
9+
declare <16 x half> @llvm.maximum.v16f16(<16 x half>, <16 x half>)
10+
declare <32 x half> @llvm.minimum.v32f16(<32 x half>, <32 x half>)
11+
declare <32 x half> @llvm.maximum.v32f16(<32 x half>, <32 x half>)
812

913
define half @test_fminimum(half %x, half %y) {
1014
; CHECK-LABEL: test_fminimum:
@@ -25,20 +29,10 @@ define half @test_fminimum(half %x, half %y) {
2529
ret half %z
2630
}
2731

28-
define <8 x half> @test_fminimum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
29-
; CHECK-LABEL: test_fminimum_scalarize:
32+
define <8 x half> @test_fminimum_v8f16(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
33+
; CHECK-LABEL: test_fminimum_v8f16:
3034
; CHECK: # %bb.0:
31-
; CHECK-NEXT: vcmpltph %xmm1, %xmm0, %k1
32-
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm2 {%k1}
33-
; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
34-
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm0, %k1
35-
; CHECK-NEXT: vpblendmw %xmm0, %xmm2, %xmm0 {%k1}
36-
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
37-
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
38-
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
39-
; CHECK-NEXT: vcmpeqph %xmm1, %xmm2, %k1
40-
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm2 {%k1}
41-
; CHECK-NEXT: vmovdqa %xmm2, %xmm0
35+
; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0
4236
; CHECK-NEXT: retq
4337
%r = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y)
4438
ret <8 x half> %r
@@ -113,19 +107,10 @@ define half @test_fmaximum(half %x, half %y) {
113107
ret half %r
114108
}
115109

116-
define <8 x half> @test_fmaximum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
117-
; CHECK-LABEL: test_fmaximum_scalarize:
110+
define <8 x half> @test_fmaximum_v8f16(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
111+
; CHECK-LABEL: test_fmaximum_v8f16:
118112
; CHECK: # %bb.0:
119-
; CHECK-NEXT: vcmpltph %xmm0, %xmm1, %k1
120-
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm2 {%k1}
121-
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
122-
; CHECK-NEXT: vpblendmw %xmm0, %xmm2, %xmm0 {%k1}
123-
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
124-
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
125-
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
126-
; CHECK-NEXT: vcmpeqph %xmm1, %xmm2, %k1
127-
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm2 {%k1}
128-
; CHECK-NEXT: vmovdqa %xmm2, %xmm0
113+
; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0
129114
; CHECK-NEXT: retq
130115
%r = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y)
131116
ret <8 x half> %r
@@ -186,3 +171,50 @@ define half @test_fmaximum_combine_cmps(half %x, half %y) {
186171
%2 = tail call half @llvm.maximum.f16(half %x, half %1)
187172
ret half %2
188173
}
174+
175+
define <16 x half> @test_fminimum_v16f16(<16 x half> %x, <16 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
176+
; CHECK-LABEL: test_fminimum_v16f16:
177+
; CHECK: # %bb.0:
178+
; CHECK-NEXT: vminph %ymm1, %ymm0, %ymm0
179+
; CHECK-NEXT: retq
180+
%r = call <16 x half> @llvm.minimum.v16f16(<16 x half> %x, <16 x half> %y)
181+
ret <16 x half> %r
182+
}
183+
184+
define <16 x half> @test_fmaximum_v16f16_nans(<16 x half> %x, <16 x half> %y) "no-signed-zeros-fp-math"="true" {
185+
; CHECK-LABEL: test_fmaximum_v16f16_nans:
186+
; CHECK: # %bb.0:
187+
; CHECK-NEXT: vmaxph %ymm1, %ymm0, %ymm1
188+
; CHECK-NEXT: vcmpunordph %ymm0, %ymm0, %k1
189+
; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
190+
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
191+
; CHECK-NEXT: retq
192+
%r = call <16 x half> @llvm.maximum.v16f16(<16 x half> %x, <16 x half> %y)
193+
ret <16 x half> %r
194+
}
195+
196+
define <32 x half> @test_fminimum_v32f16_szero(<32 x half> %x, <32 x half> %y) "no-nans-fp-math"="true" {
197+
; CHECK-LABEL: test_fminimum_v32f16_szero:
198+
; CHECK: # %bb.0:
199+
; CHECK-NEXT: vpmovw2m %zmm0, %k1
200+
; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm2 {%k1}
201+
; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
202+
; CHECK-NEXT: vminph %zmm2, %zmm0, %zmm0
203+
; CHECK-NEXT: retq
204+
%r = call <32 x half> @llvm.minimum.v32f16(<32 x half> %x, <32 x half> %y)
205+
ret <32 x half> %r
206+
}
207+
208+
define <32 x half> @test_fmaximum_v32f16_nans_szero(<32 x half> %x, <32 x half> %y) {
209+
; CHECK-LABEL: test_fmaximum_v32f16_nans_szero:
210+
; CHECK: # %bb.0:
211+
; CHECK-NEXT: vpmovw2m %zmm0, %k1
212+
; CHECK-NEXT: vpblendmw %zmm1, %zmm0, %zmm2 {%k1}
213+
; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
214+
; CHECK-NEXT: vmaxph %zmm2, %zmm1, %zmm0
215+
; CHECK-NEXT: vcmpunordph %zmm1, %zmm1, %k1
216+
; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
217+
; CHECK-NEXT: retq
218+
%r = call <32 x half> @llvm.maximum.v32f16(<32 x half> %x, <32 x half> %y)
219+
ret <32 x half> %r
220+
}

0 commit comments

Comments
 (0)