Skip to content

Commit 6cb518d

Browse files
Orvid authored and
facebook-github-bot committed
Eliminate the inline assembly in folly::Instructions
Summary: These were previously inline assembly to get GCC & Clang to inline them even when we weren't compiling with the required instruction set enabled, however these are now all included in the baseline instruction set we assume is present, so the intrinsics all get inlined properly. Switching over to the intrinsics also has the added benefit of allowing compile time folding that isn't possible with the inline assembly. This is particularly noticeable with the bextr instruction, as the inline assembly version was actually generating worse code :( Reviewed By: Gownta Differential Revision: D74504609 fbshipit-source-id: d1a9ceac8522bd68213c9ff3cdfdc8cfa2ff1537
1 parent bd1f288 commit 6cb518d

File tree

1 file changed

+7
-41
lines changed

1 file changed

+7
-41
lines changed

folly/compression/Instructions.h

Lines changed: 7 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,17 @@
1818

1919
#include <glog/logging.h>
2020

21-
#ifdef _MSC_VER
22-
#include <immintrin.h>
23-
#endif
24-
2521
#include <string_view>
2622

2723
#include <folly/CpuId.h>
2824
#include <folly/Portability.h>
2925
#include <folly/lang/Assume.h>
3026
#include <folly/portability/Builtins.h>
3127

28+
#if FOLLY_X64 || defined(__i386__)
29+
#include <immintrin.h>
30+
#endif
31+
3232
namespace folly {
3333
namespace compression {
3434
namespace instructions {
@@ -93,15 +93,8 @@ struct Nehalem : public Default {
9393
}
9494

9595
static FOLLY_ALWAYS_INLINE uint64_t popcount(uint64_t value) {
96-
// POPCNT is supported starting with Intel Nehalem, AMD K10.
97-
#if defined(__GNUC__)
98-
// GCC and Clang won't inline the intrinsics.
99-
uint64_t result;
100-
asm("popcntq %1, %0" : "=r"(result) : "r"(value));
101-
return result;
102-
#else
96+
// POPCNT is supported starting with Intel Nehalem, AMD K10.
10397
return uint64_t(_mm_popcnt_u64(value));
104-
#endif
10598
}
10699
};
107100

@@ -113,45 +106,18 @@ struct Haswell : public Nehalem {
113106
}
114107

115108
static FOLLY_ALWAYS_INLINE uint64_t blsr(uint64_t value) {
116-
// BMI1 is supported starting with Intel Haswell, AMD Piledriver.
117-
// BLSR combines two instructions into one and reduces register pressure.
118-
#if defined(__GNUC__)
119-
// GCC and Clang won't inline the intrinsics.
120-
uint64_t result;
121-
asm("blsrq %1, %0" : "=r"(result) : "r"(value));
122-
return result;
123-
#else
109+
// BMI1 is supported starting with Intel Haswell, AMD Piledriver.
110+
// BLSR combines two instructions into one and reduces register pressure.
124111
return _blsr_u64(value);
125-
#endif
126112
}
127113

128114
static FOLLY_ALWAYS_INLINE uint64_t
129115
bextr(uint64_t value, uint32_t start, uint32_t length) {
130-
#if defined(__GNUC__)
131-
// GCC and Clang won't inline the intrinsics.
132-
// Encode parameters in `pattern` where `pattern[0:7]` is `start` and
133-
// `pattern[8:15]` is `length`.
134-
// Ref: Intel Advanced Vector Extensions Programming Reference
135-
uint64_t pattern = start & 0xFF;
136-
pattern = pattern | ((length & 0xFF) << 8);
137-
uint64_t result;
138-
asm("bextrq %2, %1, %0" : "=r"(result) : "r"(value), "r"(pattern));
139-
return result;
140-
#else
141116
return _bextr_u64(value, start, length);
142-
#endif
143117
}
144118

145119
static FOLLY_ALWAYS_INLINE uint64_t bzhi(uint64_t value, uint32_t index) {
146-
#if defined(__GNUC__)
147-
// GCC and Clang won't inline the intrinsics.
148-
const uint64_t index64 = index;
149-
uint64_t result;
150-
asm("bzhiq %2, %1, %0" : "=r"(result) : "r"(value), "r"(index64));
151-
return result;
152-
#else
153120
return _bzhi_u64(value, index);
154-
#endif
155121
}
156122
};
157123
#endif

0 commit comments

Comments
 (0)