Skip to content

Commit 5aa1275

Browse files
authored
[X86] Support SM4 EVEX version intrinsics/instructions. (#113402)
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
1 parent 39ac64c commit 5aa1275

File tree

17 files changed

+1232
-0
lines changed

17 files changed

+1232
-0
lines changed

clang/docs/ReleaseNotes.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -628,6 +628,10 @@ X86 Support
628628
* Supported MINMAX intrinsics of ``*_(mask(z)))_minmax(ne)_p[s|d|h|bh]`` and
629629
``*_(mask(z)))_minmax_s[s|d|h]``.
630630

631+
- Supported intrinsics for ``SM4 and AVX10.2``.
632+
* Supported SM4 intrinsics of ``_mm512_sm4key4_epi32`` and
633+
``_mm512_sm4rnds4_epi32``.
634+
631635
- All intrinsics in adcintrin.h can now be used in constant expressions.
632636

633637
- All intrinsics in adxintrin.h can now be used in constant expressions.

clang/include/clang/Basic/BuiltinsX86.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2179,6 +2179,10 @@ TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
21792179
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
21802180
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
21812181

2182+
// SM4_EVEX
2183+
TARGET_BUILTIN(__builtin_ia32_vsm4key4512, "V16UiV16UiV16Ui", "nV:512:", "avx10.2-512,sm4")
2184+
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4512, "V16UiV16UiV16Ui", "nV:512:", "avx10.2-512,sm4")
2185+
21822186
// AVX10 MINMAX
21832187
TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16128, "V8yV8yV8yIi", "nV:128:", "avx10.2-256")
21842188
TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16256, "V16yV16yV16yIi", "nV:256:", "avx10.2-256")

clang/lib/Headers/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ set(x86_files
243243
shaintrin.h
244244
sm3intrin.h
245245
sm4intrin.h
246+
sm4evexintrin.h
246247
smmintrin.h
247248
tbmintrin.h
248249
tmmintrin.h

clang/lib/Headers/immintrin.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,6 +677,11 @@ _storebe_i64(void * __P, long long __D) {
677677
#include <avx10_2_512satcvtintrin.h>
678678
#endif
679679

680+
#if !defined(__SCE__) || __has_feature(modules) || \
681+
(defined(__AVX10_2_512__) && defined(__SM4__))
682+
#include <sm4evexintrin.h>
683+
#endif
684+
680685
#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
681686
#include <enqcmdintrin.h>
682687
#endif

clang/lib/Headers/sm4evexintrin.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*===--------------- sm4evexintrin.h - SM4 EVEX intrinsics -----------------===
2+
*
3+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
* See https://llvm.org/LICENSE.txt for license information.
5+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
*
7+
*===----------------------------------------------------------------------===
8+
*/
9+
#ifndef __IMMINTRIN_H
10+
#error "Never use <sm4evexintrin.h> directly; include <immintrin.h> instead."
11+
#endif // __IMMINTRIN_H
12+
13+
#ifndef __SM4EVEXINTRIN_H
14+
#define __SM4EVEXINTRIN_H
15+
16+
#define __DEFAULT_FN_ATTRS512 \
17+
__attribute__((__always_inline__, __nodebug__, \
18+
__target__("sm4,avx10.2-512"), __min_vector_width__(512)))
19+
20+
static __inline__ __m512i __DEFAULT_FN_ATTRS512
21+
_mm512_sm4key4_epi32(__m512i __A, __m512i __B) {
22+
return (__m512i)__builtin_ia32_vsm4key4512((__v16su)__A, (__v16su)__B);
23+
}
24+
25+
static __inline__ __m512i __DEFAULT_FN_ATTRS512
26+
_mm512_sm4rnds4_epi32(__m512i __A, __m512i __B) {
27+
return (__m512i)__builtin_ia32_vsm4rnds4512((__v16su)__A, (__v16su)__B);
28+
}
29+
30+
#undef __DEFAULT_FN_ATTRS512
31+
32+
#endif // __SM4EVEXINTRIN_H
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-- -target-feature +sm4 \
2+
// RUN: -target-feature +avx10.2-512 -emit-llvm -o - -Wall -Werror | FileCheck %s
3+
// RUN: %clang_cc1 %s -ffreestanding -triple=i386-- -target-feature +sm4 \
4+
// RUN: -target-feature +avx10.2-512 -emit-llvm -o - -Wall -Werror | FileCheck %s
5+
6+
#include <immintrin.h>
7+
#include <stddef.h>
8+
9+
__m512i test_mm512_sm4key4_epi32(__m512i __A, __m512i __B) {
10+
// CHECK-LABEL: @test_mm512_sm4key4_epi32(
11+
// CHECK: call <16 x i32> @llvm.x86.vsm4key4512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
12+
return _mm512_sm4key4_epi32(__A, __B);
13+
}
14+
15+
__m512i test_mm512_sm4rnds4_epi32(__m512i __A, __m512i __B) {
16+
// CHECK-LABEL: @test_mm512_sm4rnds4_epi32(
17+
// CHECK: call <16 x i32> @llvm.x86.vsm4rnds4512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
18+
return _mm512_sm4rnds4_epi32(__A, __B);
19+
}

llvm/docs/ReleaseNotes.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,8 @@ Changes to the X86 Backend
219219

220220
* Supported instructions of `MOVRS AND AVX10.2`
221221

222+
* Supported ISA of `SM4(EVEX)`.
223+
222224
Changes to the OCaml bindings
223225
-----------------------------
224226

llvm/include/llvm/IR/IntrinsicsX86.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6099,6 +6099,11 @@ let TargetPrefix = "x86" in {
60996099
DefaultAttrsIntrinsic<[llvm_v8i32_ty],
61006100
[llvm_v8i32_ty, llvm_v8i32_ty],
61016101
[IntrNoMem]>;
6102+
def int_x86_vsm4key4512
6103+
: ClangBuiltin<"__builtin_ia32_vsm4key4512">,
6104+
DefaultAttrsIntrinsic<[llvm_v16i32_ty],
6105+
[llvm_v16i32_ty, llvm_v16i32_ty],
6106+
[IntrNoMem]>;
61026107
def int_x86_vsm4rnds4128
61036108
: ClangBuiltin<"__builtin_ia32_vsm4rnds4128">,
61046109
DefaultAttrsIntrinsic<[llvm_v4i32_ty],
@@ -6109,6 +6114,11 @@ let TargetPrefix = "x86" in {
61096114
DefaultAttrsIntrinsic<[llvm_v8i32_ty],
61106115
[llvm_v8i32_ty, llvm_v8i32_ty],
61116116
[IntrNoMem]>;
6117+
def int_x86_vsm4rnds4512
6118+
: ClangBuiltin<"__builtin_ia32_vsm4rnds4512">,
6119+
DefaultAttrsIntrinsic<[llvm_v16i32_ty],
6120+
[llvm_v16i32_ty, llvm_v16i32_ty],
6121+
[IntrNoMem]>;
61126122
}
61136123
//===----------------------------------------------------------------------===//
61146124
// RAO-INT intrinsics

llvm/lib/Target/X86/X86InstrAVX10.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1675,3 +1675,17 @@ defm VMOVRSD : vmovrs_p_vl<0x6f, "vmovrsd", avx512vl_i32_info>,
16751675
T_MAP5, XS, EVEX_CD8<32, CD8VF>, Sched<[WriteVecLoad]>;
16761676
defm VMOVRSQ : vmovrs_p_vl<0x6f, "vmovrsq", avx512vl_i64_info>,
16771677
T_MAP5, XS, REX_W, EVEX_CD8<64, CD8VF>, Sched<[WriteVecLoad]>;
1678+
1679+
// SM4(EVEX)
1680+
multiclass avx10_sm4_base<string OpStr> {
1681+
// SM4_Base is in X86InstrSSE.td.
1682+
let Predicates = [HasSM4, HasAVX10_2], AddedComplexity = 1 in {
1683+
defm Z128 : SM4_Base<OpStr, VR128X, "128", loadv4i32, i128mem>, EVEX_V128;
1684+
defm Z256 : SM4_Base<OpStr, VR256X, "256", loadv8i32, i256mem>, EVEX_V256;
1685+
}
1686+
let Predicates = [HasSM4, HasAVX10_2_512] in
1687+
defm Z : SM4_Base<OpStr, VR512, "512", loadv16i32, i512mem>, EVEX_V512;
1688+
}
1689+
1690+
defm VSM4KEY4 : avx10_sm4_base<"vsm4key4">, T8, XS, EVEX, VVVV;
1691+
defm VSM4RNDS4 : avx10_sm4_base<"vsm4rnds4">, T8, XD, EVEX, VVVV;
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-- --show-mc-encoding -mattr=+sm4,+avx10.2-512 | FileCheck %s
3+
; RUN: llc < %s -verify-machineinstrs -mtriple=i686-- --show-mc-encoding -mattr=+sm4,+avx10.2-512 | FileCheck %s
4+
5+
define <4 x i32> @test_int_x86_vsm4key4128(<4 x i32> %A, <4 x i32> %B) {
6+
; CHECK-LABEL: test_int_x86_vsm4key4128:
7+
; CHECK: # %bb.0:
8+
; CHECK-NEXT: vsm4key4 %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0xda,0xc1]
9+
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
10+
%ret = call <4 x i32> @llvm.x86.vsm4key4128(<4 x i32> %A, <4 x i32> %B)
11+
ret <4 x i32> %ret
12+
}
13+
declare <4 x i32> @llvm.x86.vsm4key4128(<4 x i32> %A, <4 x i32> %B)
14+
15+
define <8 x i32> @test_int_x86_vsm4key4256(<8 x i32> %A, <8 x i32> %B) {
16+
; CHECK-LABEL: test_int_x86_vsm4key4256:
17+
; CHECK: # %bb.0:
18+
; CHECK-NEXT: vsm4key4 %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7e,0xda,0xc1]
19+
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
20+
%ret = call <8 x i32> @llvm.x86.vsm4key4256(<8 x i32> %A, <8 x i32> %B)
21+
ret <8 x i32> %ret
22+
}
23+
declare <8 x i32> @llvm.x86.vsm4key4256(<8 x i32> %A, <8 x i32> %B)
24+
25+
define <16 x i32> @test_int_x86_vsm4key4512(<16 x i32> %A, <16 x i32> %B) {
26+
; CHECK-LABEL: test_int_x86_vsm4key4512:
27+
; CHECK: # %bb.0:
28+
; CHECK-NEXT: vsm4key4 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7e,0x48,0xda,0xc1]
29+
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
30+
%ret = call <16 x i32> @llvm.x86.vsm4key4512(<16 x i32> %A, <16 x i32> %B)
31+
ret <16 x i32> %ret
32+
}
33+
declare <16 x i32> @llvm.x86.vsm4key4512(<16 x i32> %A, <16 x i32> %B)
34+
35+
define <4 x i32> @test_int_x86_vsm4rnds4128(<4 x i32> %A, <4 x i32> %B) {
36+
; CHECK-LABEL: test_int_x86_vsm4rnds4128:
37+
; CHECK: # %bb.0:
38+
; CHECK-NEXT: vsm4rnds4 %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0xda,0xc1]
39+
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
40+
%ret = call <4 x i32> @llvm.x86.vsm4rnds4128(<4 x i32> %A, <4 x i32> %B)
41+
ret <4 x i32> %ret
42+
}
43+
declare <4 x i32> @llvm.x86.vsm4rnds4128(<4 x i32> %A, <4 x i32> %B)
44+
45+
define <8 x i32> @test_int_x86_vsm4rnds4256(<8 x i32> %A, <8 x i32> %B) {
46+
; CHECK-LABEL: test_int_x86_vsm4rnds4256:
47+
; CHECK: # %bb.0:
48+
; CHECK-NEXT: vsm4rnds4 %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7f,0xda,0xc1]
49+
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
50+
%ret = call <8 x i32> @llvm.x86.vsm4rnds4256(<8 x i32> %A, <8 x i32> %B)
51+
ret <8 x i32> %ret
52+
}
53+
declare <8 x i32> @llvm.x86.vsm4rnds4256(<8 x i32> %A, <8 x i32> %B)
54+
55+
define <16 x i32> @test_int_x86_vsm4rnds4512(<16 x i32> %A, <16 x i32> %B) {
56+
; CHECK-LABEL: test_int_x86_vsm4rnds4512:
57+
; CHECK: # %bb.0:
58+
; CHECK-NEXT: vsm4rnds4 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7f,0x48,0xda,0xc1]
59+
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
60+
%ret = call <16 x i32> @llvm.x86.vsm4rnds4512(<16 x i32> %A, <16 x i32> %B)
61+
ret <16 x i32> %ret
62+
}
63+
declare <16 x i32> @llvm.x86.vsm4rnds4512(<16 x i32> %A, <16 x i32> %B)
64+

0 commit comments

Comments
 (0)