
Commit 00faf91

[X86][CodeGen] - Use shift operators instead of built-ins for SSE emulation of MMX intrinsics
When performing constant-value shifts, the code generated by the SSE-emulation built-ins is less efficient than code using the standard left/right shift operators. Allow for better performance by using the operators instead of the built-ins when the shift count is a compile-time constant.
1 parent f303f37 commit 00faf91
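For illustration, a minimal caller exercising both paths of the updated intrinsics (a sketch; the function names are hypothetical and not part of the commit):

#include <mmintrin.h>

// Constant count: __builtin_constant_p(__count) can evaluate to true in the
// inlined body, so the shift can compile down to a plain 64-bit shift
// instead of going through the SSE2 pslli emulation.
__m64 shift_const(__m64 v) { return _mm_slli_si64(v, 3); }

// Runtime count: __builtin_constant_p is false, so the original
// __builtin_ia32_psllqi128 emulation path is used.
__m64 shift_var(__m64 v, int n) { return _mm_slli_si64(v, n); }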

File tree

2 files changed (+24, -12 lines)

clang/lib/Headers/mmintrin.h

Lines changed: 10 additions & 10 deletions
@@ -880,11 +880,11 @@ _mm_sll_si64(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the left-shifted value. If
 ///    \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_slli_si64(__m64 __m, int __count)
-{
-  return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m),
-                                            __count));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_si64(__m64 __m,
+                                                              int __count) {
+  if (__builtin_constant_p(__count))
+    return (__m64)((__count > 63) ? 0 : ((long long)__m << __count));
+  return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m), __count));
 }
 
 /// Right-shifts each 16-bit integer element of the first parameter,
@@ -1115,11 +1115,11 @@ _mm_srl_si64(__m64 __m, __m64 __count)
 /// \param __count
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_srli_si64(__m64 __m, int __count)
-{
-  return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m),
-                                            __count));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_si64(__m64 __m,
+                                                              int __count) {
+  if (__builtin_constant_p(__count))
+    return (__m64)((__count > 63) ? 0 : ((long long)__m >> __count));
+  return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m), __count));
 }
 
 /// Performs a bitwise AND of two 64-bit integer vectors.
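Worth noting when reading the new fast path: __m64 is reinterpreted as a signed long long, so the >> in _mm_srli_si64 is an arithmetic right shift, which is what the updated test below checks for (ashr). A self-contained scalar model of that branch (the function name is illustrative only):

// Scalar model of the constant-count branch in _mm_srli_si64 above.
// Because the operand is signed, >> is an arithmetic shift; counts
// greater than 63 yield 0, as in the header code.
static long long srli_si64_model(long long m, int count) {
  return (count > 63) ? 0 : (m >> count);
}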

clang/test/CodeGen/X86/mmx-builtins.c

Lines changed: 14 additions & 2 deletions
@@ -563,7 +563,13 @@ __m64 test_mm_slli_pi32(__m64 a) {
 
 __m64 test_mm_slli_si64(__m64 a) {
   // CHECK-LABEL: test_mm_slli_si64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q(
+  // CHECK: %__m.addr.i = alloca <1 x i64>, align 8
+  // CHECK: %__count.addr.i = alloca i32, align 4
+  // CHECK: %5 = load <1 x i64>, ptr %__m.addr.i, align 8
+  // CHECK: %6 = bitcast <1 x i64> %5 to i64
+  // CHECK: %7 = load i32, ptr %__count.addr.i, align 4
+  // CHECK: %sh_prom.i = zext i32 %7 to i64
+  // CHECK: %shl.i = shl i64 %6, %sh_prom.i
   return _mm_slli_si64(a, 3);
 }
 
@@ -623,7 +629,13 @@ __m64 test_mm_srli_pi32(__m64 a) {
 
 __m64 test_mm_srli_si64(__m64 a) {
   // CHECK-LABEL: test_mm_srli_si64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q(
+  // CHECK: %__m.addr.i = alloca <1 x i64>, align 8
+  // CHECK: %__count.addr.i = alloca i32, align 4
+  // CHECK: %5 = load <1 x i64>, ptr %__m.addr.i, align 8
+  // CHECK: %6 = bitcast <1 x i64> %5 to i64
+  // CHECK: %7 = load i32, ptr %__count.addr.i, align 4
+  // CHECK: %sh_prom.i = zext i32 %7 to i64
+  // CHECK: %shr.i = ashr i64 %6, %sh_prom.i
   return _mm_srli_si64(a, 3);
 }
 
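To inspect the corresponding IR outside the lit test, a caller like the ones above can be compiled straight to LLVM IR on an x86-64 target (file name hypothetical; the flags are standard clang options):

// shifts.c
#include <mmintrin.h>
__m64 f(__m64 a) { return _mm_slli_si64(a, 3); }

$ clang -O0 -S -emit-llvm -o - shifts.c

The unoptimized output is expected to include the i64 shl matched by the CHECK lines above; at higher optimization levels the constant-count branch should fold down to a single shift.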
