Skip to content

Commit 761be78

Browse files
authored
[WebAssembly] recognize saturating truncation (#155470)
Fixes llvm/llvm-project#153838, using the same approach as llvm/llvm-project#155377. Recognize a manual saturating truncation and select the corresponding instruction. This is useful in general, but came up specifically in https://github.com/rust-lang/stdarch because it will allow us to drop more target-specific intrinsics in favor of cross-platform ones.
1 parent 40fbe32 commit 761be78

File tree

3 files changed

+132
-46
lines changed

3 files changed

+132
-46
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1445,6 +1445,49 @@ def : Pat<(v16i8 (wasm_narrow_u (v8i16 V128:$left), (v8i16 V128:$right))),
14451445
def : Pat<(v8i16 (wasm_narrow_u (v4i32 V128:$left), (v4i32 V128:$right))),
14461446
(NARROW_U_I16x8 $left, $right)>;
14471447

1448+
// Recognize a saturating truncation and convert into the corresponding
1449+
// narrow_TYPE_s or narrow_TYPE_u instruction.
//
// SignedSaturatingTruncate matches a wasm_narrow_u whose two operands are each
// a signed clamp of an `input` vector to [minval, maxval] followed by an AND
// with `mask`, and selects `narrow` directly on the unclamped operands $a, $b.
//   input / output - source and result vector value types
//   narrow         - instruction emitted in place of the whole expression
//   minval, maxval - splatted clamp bounds the smax / smin must use
//   mask           - splatted constant the AND must use
// NOTE(review): dropping the clamp and mask presumably relies on the signed
// narrow instruction saturating to the same range itself; confirm against the
// WebAssembly SIMD specification.
1450+
multiclass SignedSaturatingTruncate<ValueType input, ValueType output,
1451+
Instruction narrow, int minval,
1452+
int maxval, int mask> {
1453+
// Clamp written as smin(smax(x, minval), maxval) on both operands.
def : Pat<
1454+
(output (wasm_narrow_u
1455+
(and (smin (smax (input V128:$a), (splat_vector (i32 minval))),
1456+
(splat_vector (i32 maxval))), (splat_vector (i32 mask))),
1457+
(and (smin (smax (input V128:$b), (splat_vector (i32 minval))),
1458+
(splat_vector (i32 maxval))), (splat_vector (i32 mask)))
1459+
)),
1460+
(narrow V128:$a, V128:$b)
1461+
>;
1462+
1463+
// Same clamp with the operations in the other order:
// smax(smin(x, maxval), minval) on both operands.
def : Pat<
1464+
(output (wasm_narrow_u
1465+
(and (smax (smin (input V128:$a), (splat_vector (i32 maxval))),
1466+
(splat_vector (i32 minval))), (splat_vector (i32 mask))),
1467+
(and (smax (smin (input V128:$b), (splat_vector (i32 maxval))),
1468+
(splat_vector (i32 minval))), (splat_vector (i32 mask)))
1469+
)),
1470+
(narrow V128:$a, V128:$b)
1471+
>;
1472+
}
1473+
1474+
// v8i16 -> v16i8: clamp bounds [-128, 127], mask 0xFF.
defm : SignedSaturatingTruncate<v8i16, v16i8, NARROW_S_I8x16, -128, 127, 0xFF>;
1475+
// v4i32 -> v8i16: clamp bounds [-32768, 32767], mask 0xFFFF.
defm : SignedSaturatingTruncate<v4i32, v8i16, NARROW_S_I16x8, -32768, 32767, 0xFFFF>;
1476+
1477+
// UnsignedSaturatingTruncate matches a wasm_narrow_u whose two operands are
// each an unsigned minimum of an `input` vector with a splat of `maxval`, and
// selects `narrow` directly on the unclamped operands $a and $b.
//   input / output - source and result vector value types
//   narrow         - instruction emitted in place of the whole expression
//   maxval         - splatted upper bound the umin must use
// NOTE(review): dropping the umin is only sound if `narrow` yields the same
// result for every lane value the umin would have clamped; confirm against the
// WebAssembly SIMD specification for the narrow_*_u instructions.
multiclass UnsignedSaturatingTruncate<ValueType input, ValueType output,
1478+
Instruction narrow, int maxval> {
1479+
def : Pat<
1480+
(output (wasm_narrow_u
1481+
(umin (input V128:$a), (splat_vector (i32 maxval))),
1482+
(umin (input V128:$b), (splat_vector (i32 maxval)))
1483+
)),
1484+
(narrow V128:$a, V128:$b)
1485+
>;
1486+
}
1487+
1488+
// v8i16 -> v16i8 with upper bound 0xFF.
defm : UnsignedSaturatingTruncate<v8i16, v16i8, NARROW_U_I8x16, 0xFF>;
1489+
// v4i32 -> v8i16 with upper bound 0xFFFF.
defm : UnsignedSaturatingTruncate<v4i32, v8i16, NARROW_U_I16x8, 0xFFFF>;
1490+
14481491
// Bitcasts are nops
14491492
// Matching bitcast t1 to t1 causes strange errors, so avoid repeating types
14501493
foreach t1 = AllVecs in

llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll

Lines changed: 2 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,6 @@ entry:
434434
define <8 x i16> @stest_f16i16(<8 x half> %x) {
435435
; CHECK-LABEL: stest_f16i16:
436436
; CHECK: .functype stest_f16i16 (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
437-
; CHECK-NEXT: .local v128, v128, v128
438437
; CHECK-NEXT: # %bb.0: # %entry
439438
; CHECK-NEXT: local.get 5
440439
; CHECK-NEXT: call __truncsfhf2
@@ -474,15 +473,6 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
474473
; CHECK-NEXT: call __extendhfsf2
475474
; CHECK-NEXT: i32.trunc_sat_f32_s
476475
; CHECK-NEXT: i32x4.replace_lane 3
477-
; CHECK-NEXT: v128.const 32767, 32767, 32767, 32767
478-
; CHECK-NEXT: local.tee 8
479-
; CHECK-NEXT: i32x4.min_s
480-
; CHECK-NEXT: v128.const -32768, -32768, -32768, -32768
481-
; CHECK-NEXT: local.tee 9
482-
; CHECK-NEXT: i32x4.max_s
483-
; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
484-
; CHECK-NEXT: local.tee 10
485-
; CHECK-NEXT: v128.and
486476
; CHECK-NEXT: local.get 4
487477
; CHECK-NEXT: i32.trunc_sat_f32_s
488478
; CHECK-NEXT: i32x4.splat
@@ -495,13 +485,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
495485
; CHECK-NEXT: local.get 7
496486
; CHECK-NEXT: i32.trunc_sat_f32_s
497487
; CHECK-NEXT: i32x4.replace_lane 3
498-
; CHECK-NEXT: local.get 8
499-
; CHECK-NEXT: i32x4.min_s
500-
; CHECK-NEXT: local.get 9
501-
; CHECK-NEXT: i32x4.max_s
502-
; CHECK-NEXT: local.get 10
503-
; CHECK-NEXT: v128.and
504-
; CHECK-NEXT: i16x8.narrow_i32x4_u
488+
; CHECK-NEXT: i16x8.narrow_i32x4_s
505489
; CHECK-NEXT: # fallthrough-return
506490
entry:
507491
%conv = fptosi <8 x half> %x to <8 x i32>
@@ -516,7 +500,6 @@ entry:
516500
define <8 x i16> @utest_f16i16(<8 x half> %x) {
517501
; CHECK-LABEL: utest_f16i16:
518502
; CHECK: .functype utest_f16i16 (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
519-
; CHECK-NEXT: .local v128
520503
; CHECK-NEXT: # %bb.0: # %entry
521504
; CHECK-NEXT: local.get 5
522505
; CHECK-NEXT: call __truncsfhf2
@@ -556,9 +539,6 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) {
556539
; CHECK-NEXT: call __extendhfsf2
557540
; CHECK-NEXT: i32.trunc_sat_f32_u
558541
; CHECK-NEXT: i32x4.replace_lane 3
559-
; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
560-
; CHECK-NEXT: local.tee 8
561-
; CHECK-NEXT: i32x4.min_u
562542
; CHECK-NEXT: local.get 4
563543
; CHECK-NEXT: i32.trunc_sat_f32_u
564544
; CHECK-NEXT: i32x4.splat
@@ -571,8 +551,6 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) {
571551
; CHECK-NEXT: local.get 7
572552
; CHECK-NEXT: i32.trunc_sat_f32_u
573553
; CHECK-NEXT: i32x4.replace_lane 3
574-
; CHECK-NEXT: local.get 8
575-
; CHECK-NEXT: i32x4.min_u
576554
; CHECK-NEXT: i16x8.narrow_i32x4_u
577555
; CHECK-NEXT: # fallthrough-return
578556
entry:
@@ -1861,7 +1839,6 @@ entry:
18611839
define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
18621840
; CHECK-LABEL: stest_f16i16_mm:
18631841
; CHECK: .functype stest_f16i16_mm (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
1864-
; CHECK-NEXT: .local v128, v128, v128
18651842
; CHECK-NEXT: # %bb.0: # %entry
18661843
; CHECK-NEXT: local.get 5
18671844
; CHECK-NEXT: call __truncsfhf2
@@ -1901,15 +1878,6 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
19011878
; CHECK-NEXT: call __extendhfsf2
19021879
; CHECK-NEXT: i32.trunc_sat_f32_s
19031880
; CHECK-NEXT: i32x4.replace_lane 3
1904-
; CHECK-NEXT: v128.const 32767, 32767, 32767, 32767
1905-
; CHECK-NEXT: local.tee 8
1906-
; CHECK-NEXT: i32x4.min_s
1907-
; CHECK-NEXT: v128.const -32768, -32768, -32768, -32768
1908-
; CHECK-NEXT: local.tee 9
1909-
; CHECK-NEXT: i32x4.max_s
1910-
; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
1911-
; CHECK-NEXT: local.tee 10
1912-
; CHECK-NEXT: v128.and
19131881
; CHECK-NEXT: local.get 4
19141882
; CHECK-NEXT: i32.trunc_sat_f32_s
19151883
; CHECK-NEXT: i32x4.splat
@@ -1922,13 +1890,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
19221890
; CHECK-NEXT: local.get 7
19231891
; CHECK-NEXT: i32.trunc_sat_f32_s
19241892
; CHECK-NEXT: i32x4.replace_lane 3
1925-
; CHECK-NEXT: local.get 8
1926-
; CHECK-NEXT: i32x4.min_s
1927-
; CHECK-NEXT: local.get 9
1928-
; CHECK-NEXT: i32x4.max_s
1929-
; CHECK-NEXT: local.get 10
1930-
; CHECK-NEXT: v128.and
1931-
; CHECK-NEXT: i16x8.narrow_i32x4_u
1893+
; CHECK-NEXT: i16x8.narrow_i32x4_s
19321894
; CHECK-NEXT: # fallthrough-return
19331895
entry:
19341896
%conv = fptosi <8 x half> %x to <8 x i32>
@@ -1941,7 +1903,6 @@ entry:
19411903
define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
19421904
; CHECK-LABEL: utest_f16i16_mm:
19431905
; CHECK: .functype utest_f16i16_mm (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
1944-
; CHECK-NEXT: .local v128
19451906
; CHECK-NEXT: # %bb.0: # %entry
19461907
; CHECK-NEXT: local.get 5
19471908
; CHECK-NEXT: call __truncsfhf2
@@ -1981,9 +1942,6 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
19811942
; CHECK-NEXT: call __extendhfsf2
19821943
; CHECK-NEXT: i32.trunc_sat_f32_u
19831944
; CHECK-NEXT: i32x4.replace_lane 3
1984-
; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
1985-
; CHECK-NEXT: local.tee 8
1986-
; CHECK-NEXT: i32x4.min_u
19871945
; CHECK-NEXT: local.get 4
19881946
; CHECK-NEXT: i32.trunc_sat_f32_u
19891947
; CHECK-NEXT: i32x4.splat
@@ -1996,8 +1954,6 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
19961954
; CHECK-NEXT: local.get 7
19971955
; CHECK-NEXT: i32.trunc_sat_f32_u
19981956
; CHECK-NEXT: i32x4.replace_lane 3
1999-
; CHECK-NEXT: local.get 8
2000-
; CHECK-NEXT: i32x4.min_u
20011957
; CHECK-NEXT: i16x8.narrow_i32x4_u
20021958
; CHECK-NEXT: # fallthrough-return
20031959
entry:
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
3+
; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
4+
5+
target triple = "wasm32-unknown-unknown"
6+
7+
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) #2
8+
declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) #2
9+
10+
; Signed saturating truncation from i16 lanes to i8 lanes: %a and %b are
; concatenated into one <16 x i16>, clamped to [-128, 127] with smax then smin,
; and truncated. The CHECK lines expect this to become a single
; i8x16.narrow_i16x8_s applied to the two arguments.
define <16 x i8> @i16_signed(<8 x i16> %a, <8 x i16> %b) {
11+
; CHECK-LABEL: i16_signed:
12+
; CHECK: .functype i16_signed (v128, v128) -> (v128)
13+
; CHECK-NEXT: # %bb.0: # %bb2
14+
; CHECK-NEXT: local.get 0
15+
; CHECK-NEXT: local.get 1
16+
; CHECK-NEXT: i8x16.narrow_i16x8_s
17+
; CHECK-NEXT: # fallthrough-return
18+
bb2:
19+
%0 = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
20+
%1 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %0, <16 x i16> splat (i16 -128))
21+
%2 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %1, <16 x i16> splat (i16 127))
22+
%3 = trunc nsw <16 x i16> %2 to <16 x i8>
23+
ret <16 x i8> %3
24+
}
26+
27+
; Signed saturating truncation from i32 lanes to i16 lanes: %a and %b are
; concatenated into one <8 x i32>, clamped to [-32768, 32767] with smax then
; smin, and truncated. The CHECK lines expect this to become a single
; i16x8.narrow_i32x4_s applied to the two arguments.
define <8 x i16> @i32_signed(<4 x i32> %a, <4 x i32> %b) {
28+
; CHECK-LABEL: i32_signed:
29+
; CHECK: .functype i32_signed (v128, v128) -> (v128)
30+
; CHECK-NEXT: # %bb.0: # %bb2
31+
; CHECK-NEXT: local.get 0
32+
; CHECK-NEXT: local.get 1
33+
; CHECK-NEXT: i16x8.narrow_i32x4_s
34+
; CHECK-NEXT: # fallthrough-return
35+
bb2:
36+
%0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
37+
%1 = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %0, <8 x i32> splat (i32 -32768))
38+
%2 = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %1, <8 x i32> splat (i32 32767))
39+
%3 = trunc nsw <8 x i32> %2 to <8 x i16>
40+
ret <8 x i16> %3
41+
}
42+
43+
; Variant of the signed i32 -> i16 saturating truncation with the clamp written
; the other way round: smin with 32767 is applied first, then smax with -32768,
; and the splat constant is the first intrinsic argument instead of the second.
; The CHECK lines expect the same single i16x8.narrow_i32x4_s.
define <8 x i16> @i32_signed_flipped(<4 x i32> %a, <4 x i32> %b) {
44+
; CHECK-LABEL: i32_signed_flipped:
45+
; CHECK: .functype i32_signed_flipped (v128, v128) -> (v128)
46+
; CHECK-NEXT: # %bb.0: # %bb2
47+
; CHECK-NEXT: local.get 0
48+
; CHECK-NEXT: local.get 1
49+
; CHECK-NEXT: i16x8.narrow_i32x4_s
50+
; CHECK-NEXT: # fallthrough-return
51+
bb2:
52+
%0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
53+
%1 = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> splat (i32 32767), <8 x i32> %0)
54+
%2 = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> splat (i32 -32768), <8 x i32> %1)
55+
%3 = trunc nsw <8 x i32> %2 to <8 x i16>
56+
ret <8 x i16> %3
57+
}
58+
59+
; Unsigned saturating truncation from i16 lanes to i8 lanes: %a and %b are
; concatenated into one <16 x i16>, clamped from above with umin against 255,
; and truncated with the nuw flag. The CHECK lines expect this to become a
; single i8x16.narrow_i16x8_u applied to the two arguments.
define <16 x i8> @i16_unsigned(<8 x i16> %a, <8 x i16> %b) {
60+
; CHECK-LABEL: i16_unsigned:
61+
; CHECK: .functype i16_unsigned (v128, v128) -> (v128)
62+
; CHECK-NEXT: # %bb.0: # %bb2
63+
; CHECK-NEXT: local.get 0
64+
; CHECK-NEXT: local.get 1
65+
; CHECK-NEXT: i8x16.narrow_i16x8_u
66+
; CHECK-NEXT: # fallthrough-return
67+
bb2:
68+
%0 = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
69+
%1 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %0, <16 x i16> splat (i16 255))
70+
%2 = trunc nuw <16 x i16> %1 to <16 x i8>
71+
ret <16 x i8> %2
72+
}
73+
74+
; Unsigned saturating truncation from i32 lanes to i16 lanes: %a and %b are
; concatenated into one <8 x i32>, clamped from above with umin against 65535,
; and truncated. The CHECK lines expect this to become a single
; i16x8.narrow_i32x4_u applied to the two arguments.
define <8 x i16> @i32_unsigned(<4 x i32> %a, <4 x i32> %b) {
75+
; CHECK-LABEL: i32_unsigned:
76+
; CHECK: .functype i32_unsigned (v128, v128) -> (v128)
77+
; CHECK-NEXT: # %bb.0: # %bb2
78+
; CHECK-NEXT: local.get 0
79+
; CHECK-NEXT: local.get 1
80+
; CHECK-NEXT: i16x8.narrow_i32x4_u
81+
; CHECK-NEXT: # fallthrough-return
82+
bb2:
83+
%0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
84+
%1 = tail call <8 x i32> @llvm.umin.v8i32(<8 x i32> %0, <8 x i32> splat (i32 65535))
85+
; nuw, not nsw: after the umin every lane fits in 16 unsigned bits, but lanes in
; 32768..65535 do not survive a signed truncation check and would be poison.
%2 = trunc nuw <8 x i32> %1 to <8 x i16>
86+
ret <8 x i16> %2
87+
}

0 commit comments

Comments
 (0)