Skip to content

Commit 593303b

Browse files
committed
Add MSVC intrinsics to the ARM core, fix incorrect carry flags in the fallback paths, and replace unnecessary 64-bit math with 32-bit equivalents
1 parent 7c2749f commit 593303b

File tree

1 file changed

+100
-30
lines changed

1 file changed

+100
-30
lines changed

core/arm/armcpu.c

Lines changed: 100 additions & 30 deletions
Original file line number · Diff line number · Diff line change
@@ -7,6 +7,9 @@
77
#include <assert.h>
88
#include <stdlib.h>
99
#include <string.h>
10+
#if defined(_MSC_VER) && !defined(__clang__)
11+
#include <intrin.h>
12+
#endif
1013

1114
#ifndef __has_builtin
1215
# define __has_builtin(builtin) 0
@@ -20,9 +23,15 @@
2023
# define DEBUG_BREAK asm("int3")
2124
#endif
2225

26+
#if (defined(__GNUC__) || defined(__clang__))
27+
# define EXTENDED_ASM 1
28+
#else
29+
# define EXTENDED_ASM 0
30+
#endif
31+
2332
#if defined(__GCC_ASM_FLAG_OUTPUTS__) && \
2433
(defined(__i386) || defined(__x86_64__) || \
25-
defined(_M_IX86) || defined(_M_IX64))
34+
defined(_M_IX86) || defined(_M_X64))
2635
# define FLAGS_FROM_EXTENDED_X86_ASM 1
2736
#else
2837
# define FLAGS_FROM_EXTENDED_X86_ASM 0
@@ -34,6 +43,12 @@
3443
# define FLAGS_FROM_OVERFLOW_BUILTINS 0
3544
#endif
3645

46+
#if _MSC_VER >= 1937 && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
47+
# define FLAGS_FROM_MSVC_INTRINSICS 1
48+
#else
49+
# define FLAGS_FROM_MSVC_INTRINSICS 0
50+
#endif
51+
3752
#if __has_builtin(__builtin_constant_p) || __GNUC__ >= 3 // Not sure, so conservative
3853
# define HAVE_BUILTIN_CONSTANT_P 1
3954
#else
@@ -43,10 +58,21 @@
4358
static uint8_t bitcount9(uint32_t x) {
4459
#if __has_builtin(__builtin_popcount) || __GNUC__ >= 4
4560
return __builtin_popcount(x & 0777);
61+
#elif UINT32_MAX >= UINTPTR_MAX
62+
uint32_t res = (x &= 0777), mask = UINT32_C(0x11111111);
63+
res *= UINT32_C(0001001001001);
64+
res >>= 3;
65+
# if EXTENDED_ASM && (defined(__i386__) || defined(_M_IX86))
66+
__asm__("andl\t%1, %0\nimull\t%1, %0" : "+r"(res) : "r"(mask) : "cc");
67+
# else
68+
res &= mask;
69+
res *= mask;
70+
#endif
71+
return (res >> 28) + (x >> 8);
4672
#else
4773
uint64_t res = x & 0777, mask = UINT64_C(0x1111111111111111);
4874
res *= UINT64_C(0001001001001);
49-
# if defined(__x86_64__) || defined(_M_IX64)
75+
# if EXTENDED_ASM && (defined(__x86_64__) || defined(_M_X64))
5076
__asm__("andq\t%1, %0\nimulq\t%1, %0" : "+r"(res) : "r"(mask) : "cc");
5177
# else
5278
res &= mask;
@@ -60,17 +86,24 @@ static uint8_t lowestsetbit32(uint32_t x) {
6086
assert(x && "invalid argument");
6187
#if __has_builtin(__builtin_ctz) || __GNUC__ >= 4
6288
return __builtin_ctz(x);
89+
#elif defined(_MSC_VER) && !defined(__clang__)
90+
unsigned long index;
91+
_BitScanForward(&index, x);
92+
return index;
6393
#else
94+
# if EXTENDED_ASM && \
95+
(defined(__i386) || defined(__x86_64__) || \
96+
defined(_M_IX86) || defined(_M_X64))
6497
uint32_t res;
65-
# if defined(__i386) || defined(__x86_64__) || \
66-
defined(_M_IX86) || defined(_M_IX64)
6798
__asm__("bsfl\t%1, %0" : "+r"(res) : "r"(x) : "cc");
6899
# else
69-
res = 0;
70-
while (!(x & 1)) {
71-
x >>= 1;
72-
++res;
73-
}
100+
uint8_t res = 0;
101+
x &= -x;
102+
if (x & UINT32_C(0xAAAAAAAA)) res += 1;
103+
if (x & UINT32_C(0xCCCCCCCC)) res += 2;
104+
if (x & UINT32_C(0xF0F0F0F0)) res += 4;
105+
if (x & UINT32_C(0xFF00FF00)) res += 8;
106+
if (x & UINT32_C(0xFFFF0000)) res += 16;
74107
# endif
75108
return res;
76109
#endif
@@ -209,13 +242,17 @@ static uint32_t arm_negs(arm_cpu_t *cpu, uint32_t x) {
209242
#elif FLAGS_FROM_OVERFLOW_BUILTINS
210243
int32_t res;
211244
cpu->v = __builtin_sub_overflow(0, (int32_t)x, &res);
212-
cpu->c = x;
213-
return arm_movs(cpu, -x);
245+
cpu->c = !res;
246+
return arm_movs(cpu, res);
247+
#elif FLAGS_FROM_MSVC_INTRINSICS
248+
int32_t res;
249+
cpu->v = _sub_overflow_i32(0, 0, x, &res);
250+
cpu->c = !res;
251+
return arm_movs(cpu, res);
214252
#else
215-
int64_t res = UINT64_C(0) - (int32_t)x;
216-
cpu->v = res != (int32_t)res;
217-
cpu->c = (uint32_t)res <= 0;
218-
//cpu->c = 0 >= x;
253+
uint32_t res = -x;
254+
cpu->v = (x & res) >> 31;
255+
cpu->c = !res;
219256
return arm_movs(cpu, res);
220257
#endif
221258
}
@@ -229,10 +266,18 @@ static uint32_t arm_adds(arm_cpu_t *cpu, uint32_t x, uint32_t y) {
229266
cpu->v = __builtin_add_overflow((int32_t)x, (int32_t)y, &res);
230267
cpu->c = __builtin_add_overflow(x, y, &x);
231268
return arm_movs(cpu, x);
269+
#elif FLAGS_FROM_MSVC_INTRINSICS
270+
int32_t res;
271+
cpu->v = _add_overflow_i32(0, x, y, &res);
272+
cpu->c = _addcarry_u32(0, x, y, &x);
273+
return arm_movs(cpu, x);
232274
#else
233-
int64_t res = (int64_t)(int32_t)x + (int32_t)y;
234-
cpu->v = res != (int32_t)res;
235-
cpu->c = (uint32_t)res < x;
275+
uint32_t res = x + y;
276+
flags->v = ((res ^ x) & (res ^ y)) >> 31;
277+
flags->c = res < x;
278+
//int64_t res = (int64_t)(int32_t)x + (int32_t)y;
279+
//cpu->v = res != (int32_t)res;
280+
//cpu->c = (uint32_t)res < x;
236281
//cpu->c = x > ~y;
237282
return arm_movs(cpu, res);
238283
#endif
@@ -247,10 +292,18 @@ static uint32_t arm_subs(arm_cpu_t *cpu, uint32_t x, uint32_t y) {
247292
cpu->v = __builtin_sub_overflow((int32_t)x, (int32_t)y, &res);
248293
cpu->c = !__builtin_sub_overflow(x, y, &x);
249294
return arm_movs(cpu, x);
295+
#elif FLAGS_FROM_MSVC_INTRINSICS
296+
int32_t res;
297+
cpu->v = _sub_overflow_i32(0, x, y, &res);
298+
cpu->c = !_subborrow_u32(0, x, y, &x);
299+
return arm_movs(cpu, x);
250300
#else
251-
int64_t res = (int64_t)(int32_t)x - (int32_t)y;
252-
cpu->v = res != (int32_t)res;
253-
cpu->c = (uint32_t)res <= x;
301+
uint32_t res = x - y;
302+
cpu->v = ((x ^ y) & (res ^ x)) >> 31;
303+
cpu->c = res <= x;
304+
//int64_t res = (int64_t)(int32_t)x - (int32_t)y;
305+
//cpu->v = res != (int32_t)res;
306+
//cpu->c = (uint32_t)res <= x;
254307
//cpu->c = x >= y;
255308
return arm_movs(cpu, res);
256309
#endif
@@ -268,10 +321,20 @@ static uint32_t arm_adcs(arm_cpu_t *cpu, uint32_t x, uint32_t y) {
268321
cpu->c = __builtin_add_overflow(x, y, &x);
269322
cpu->c |= __builtin_add_overflow(x, carry, &x);
270323
return arm_movs(cpu, x);
324+
#elif FLAGS_FROM_MSVC_INTRINSICS
325+
bool carry = cpu->c;
326+
int32_t res;
327+
cpu->v = _add_overflow_i32(carry, x, y, &res);
328+
cpu->c = _addcarry_u32(carry, x, y, &x);
329+
return arm_movs(cpu, x);
271330
#else
272-
int64_t res = (uint64_t)(int32_t)x + (int32_t)y + cpu->c;
273-
cpu->v = res != (int32_t)res;
274-
cpu->c = ((uint64_t)x + y + cpu->c) >> 32;
331+
uint32_t res = x + y + cpu->c;
332+
uint32_t carries = (x | y) ^ ((x ^ y) & res);
333+
cpu->c = carries >> 31;
334+
cpu->v = cpu->c ^ (carries >> 30 & 1);
335+
//int64_t res = (uint64_t)(int32_t)x + (int32_t)y + cpu->c;
336+
//cpu->v = res != (int32_t)res;
337+
//cpu->c = ((uint64_t)x + y + cpu->c) >> 32;
275338
return arm_movs(cpu, res);
276339
#endif
277340
}
@@ -285,14 +348,21 @@ static uint32_t arm_sbcs(arm_cpu_t *cpu, uint32_t x, uint32_t y) {
285348
int32_t res;
286349
cpu->v = __builtin_sub_overflow(x, y, &res);
287350
cpu->v |= __builtin_sub_overflow(res, borrow, &res);
288-
cpu->c = __builtin_sub_overflow(x, y, &x);
289-
cpu->c |= __builtin_sub_overflow(x, borrow, &x);
351+
cpu->c = !__builtin_sub_overflow(x, y, &x);
352+
cpu->c &= !__builtin_sub_overflow(x, borrow, &x);
353+
return arm_movs(cpu, x);
354+
#elif FLAGS_FROM_MSVC_INTRINSICS
355+
bool borrow = !cpu->c;
356+
int32_t res;
357+
cpu->v = _sub_overflow_i32(borrow, x, y, &res);
358+
cpu->c = !_subborrow_u32(borrow, x, y, &x);
290359
return arm_movs(cpu, x);
291360
#else
292-
int64_t res = (uint64_t)(int32_t)x - (int32_t)y - !cpu->c;
293-
cpu->v = res != (int32_t)res;
294-
cpu->c = ((uint64_t)x - y - !cpu->c) >> 32;
295-
return arm_movs(cpu, res);
361+
return arm_adcs(cpu, x, ~y);
362+
//int64_t res = (uint64_t)(int32_t)x - (int32_t)y - !cpu->c;
363+
//cpu->v = res != (int32_t)res;
364+
//cpu->c = !(((uint64_t)x - y - !cpu->c) >> 32);
365+
//return arm_movs(cpu, res);
296366
#endif
297367
}
298368

0 commit comments

Comments (0)