Refactor BCMath _bc_do_sub (#14132)

SakiTakamachi · web-flow · commit 02732007f700 · 2024-05-07T11:39:31.000+09:00
_bc_do_sub now uses SIMD to perform calculations at high speed.

Moved the macros used for SIMD to `private.h`, and added some constants
and macros.
diff --git a/ext/bcmath/libbcmath/src/convert.c b/ext/bcmath/libbcmath/src/convert.c
@@ -16,16 +16,11 @@
 
 #include "bcmath.h"
 #include "convert.h"
+#include "private.h"
 #ifdef __SSE2__
 # include <emmintrin.h>
 #endif
 
-/* This will be 0x01010101 for 32-bit and 0x0101010101010101 */
-#define SWAR_ONES (~((size_t) 0) / 0xFF)
-/* This repeats a byte `x` into an entire 32/64-bit word.
- * Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
-#define SWAR_REPEAT(x) (SWAR_ONES * (x))
-
 static char *bc_copy_and_shift_numbers(char *restrict dest, const char *source, const char *source_end, unsigned char shift, bool add)
 {
 	size_t bulk_shift = SWAR_REPEAT(shift);
diff --git a/ext/bcmath/libbcmath/src/doaddsub.c b/ext/bcmath/libbcmath/src/doaddsub.c
@@ -124,27 +124,26 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
 bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 {
 	bc_num diff;
-	size_t diff_scale, diff_len;
-	size_t min_scale, min_len;
-	size_t borrow, count;
+	/* The caller guarantees that n1 is always the larger number. */
+	size_t diff_len = EXPECTED(n1->n_len >= n2->n_len) ? n1->n_len : n2->n_len;
+	size_t diff_scale = MAX(n1->n_scale, n2->n_scale);
+	/* Same condition as EXPECTED before, but using EXPECTED again will make it slower. */
+	size_t min_len = n1->n_len >= n2->n_len ? n2->n_len : n1->n_len;
+	size_t min_scale = MIN(n1->n_scale, n2->n_scale);
+	size_t min_bytes = min_len + min_scale;
+	size_t borrow = 0;
+	size_t count;
 	int val;
 	char *n1ptr, *n2ptr, *diffptr;
 
 	/* Allocate temporary storage. */
-	diff_len = MAX(n1->n_len, n2->n_len);
-	diff_scale = MAX(n1->n_scale, n2->n_scale);
-	min_len = MIN(n1->n_len, n2->n_len);
-	min_scale = MIN(n1->n_scale, n2->n_scale);
 	diff = bc_new_num (diff_len, MAX(diff_scale, scale_min));
 
 	/* Initialize the subtract. */
 	n1ptr = (char *) (n1->n_value + n1->n_len + n1->n_scale - 1);
 	n2ptr = (char *) (n2->n_value + n2->n_len + n2->n_scale - 1);
 	diffptr = (char *) (diff->n_value + diff_len + diff_scale - 1);
 
-	/* Subtract the numbers. */
-	borrow = 0;
-
 	/* Take care of the longer scaled number. */
 	if (n1->n_scale != min_scale) {
 		/* n1 has the longer scale */
@@ -166,7 +165,59 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 	}
 
 	/* Now do the equal length scale and integer parts. */
-	for (count = 0; count < min_len + min_scale; count++) {
+	count = 0;
+	/* Use SWAR (SIMD within a register) to subtract sizeof(BC_UINT_T) bytes per iteration. */
+	if (min_bytes >= sizeof(BC_UINT_T)) {
+		diffptr++;
+		n1ptr++;
+		n2ptr++;
+		while (count + sizeof(BC_UINT_T) <= min_bytes) {
+			diffptr -= sizeof(BC_UINT_T);
+			n1ptr -= sizeof(BC_UINT_T);
+			n2ptr -= sizeof(BC_UINT_T);
+
+			BC_UINT_T n1bytes;
+			BC_UINT_T n2bytes;
+			memcpy(&n1bytes, n1ptr, sizeof(n1bytes));
+			memcpy(&n2bytes, n2ptr, sizeof(n2bytes));
+
+#if BC_LITTLE_ENDIAN
+			/* Little endian requires changing the order of bytes. */
+			n1bytes = BC_BSWAP(n1bytes);
+			n2bytes = BC_BSWAP(n2bytes);
+#endif
+
+			n1bytes -= n2bytes + borrow;
+			/* If the most significant bit is 1, the whole word borrowed from the next higher byte. */
+			bool tmp_borrow = n1bytes & ((BC_UINT_T) 1 << (8 * sizeof(BC_UINT_T) - 1));
+
+			/*
+			 * Check the most significant bit of each byte: if it is 1, that byte borrowed from the
+			 * next higher one. Because the subtraction is done in binary, the lower 4 bits of a byte
+			 * that borrowed hold the digit plus 16 instead of plus 10, i.e. they are 6 too large.
+			 * Therefore, clear the upper 4 bits of every byte and subtract 6 from the lower 4 bits
+			 * of each byte that borrowed, which yields the correct decimal digit.
+			 */
+			BC_UINT_T borrow_mask = ((n1bytes & SWAR_REPEAT(0x80)) >> 7) * 0x06;
+			n1bytes = (n1bytes & SWAR_REPEAT(0x0F)) - borrow_mask;
+
+#if BC_LITTLE_ENDIAN
+			/* Little endian requires changing the order of bytes back. */
+			n1bytes = BC_BSWAP(n1bytes);
+#endif
+
+			memcpy(diffptr, &n1bytes, sizeof(n1bytes));
+
+			borrow = tmp_borrow;
+			count += sizeof(BC_UINT_T);
+		}
+		diffptr--;
+		n1ptr--;
+		n2ptr--;
+	}
+
+	/* Calculate the remaining bytes that are less than the size of BC_UINT_T using a normal loop. */
+	for (; count < min_bytes; count++) {
 		val = *n1ptr-- - *n2ptr-- - borrow;
 		if (val < 0) {
 			val += BASE;
diff --git a/ext/bcmath/libbcmath/src/private.h b/ext/bcmath/libbcmath/src/private.h
@@ -34,6 +34,68 @@
 #include <stdbool.h>
 #include <stddef.h>
 
+/* A size_t with every byte set to 0x01: 0x01010101 for 32-bit and 0x0101010101010101 for 64-bit size_t. */
+#define SWAR_ONES (~((size_t) 0) / 0xFF)
+/* Repeats a single byte value `x` (0x00..0xFF) into every byte of a size_t-wide word.
+ * Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
+#define SWAR_REPEAT(x) (SWAR_ONES * (x))
+
+/* Byte swap: map BSWAP32/BSWAP64 to a compiler intrinsic where one is known to exist. */
+#if defined(_MSC_VER)
+#  include <stdlib.h> /* for the _byteswap_* intrinsics used below */
+#  define BSWAP32(u) _byteswap_ulong(u)
+#  define BSWAP64(u) _byteswap_uint64(u)
+#else
+#  ifdef __has_builtin
+#    if __has_builtin(__builtin_bswap32)
+#      define BSWAP32(u) __builtin_bswap32(u)
+#    endif // __has_builtin(__builtin_bswap32)
+#    if __has_builtin(__builtin_bswap64)
+#      define BSWAP64(u) __builtin_bswap64(u)
+#    endif // __has_builtin(__builtin_bswap64)
+#  elif defined(__GNUC__) /* no __has_builtin to ask: use the builtins unconditionally */
+#    define BSWAP32(u) __builtin_bswap32(u)
+#    define BSWAP64(u) __builtin_bswap64(u)
+#  endif // __has_builtin
+#endif // defined(_MSC_VER)
+#ifndef BSWAP32
+/* Portable fallback (no intrinsic found): returns `u` with its four bytes reversed. `static` is needed
+ * because a plain C99 `inline` definition in a header provides no linkable function. */
+static inline uint32_t BSWAP32(uint32_t u)
+{
+  return ((u & 0xff000000) >> 24) | ((u & 0x00ff0000) >>  8)
+       | ((u & 0x0000ff00) <<  8) | ((u & 0x000000ff) << 24);
+}
+#endif
+#ifndef BSWAP64
+/* Portable fallback used when no compiler intrinsic was found above:
+ * returns `u` with its eight bytes in reverse order.
+ * `static` is required here: a plain C99 `inline` definition in a header
+ * provides no linkable function, so a call that is not inlined would fail to link. */
+static inline uint64_t BSWAP64(uint64_t u)
+{
+   return ((u & 0xff00000000000000ULL) >> 56) | ((u & 0x00ff000000000000ULL) >> 40)
+        | ((u & 0x0000ff0000000000ULL) >> 24) | ((u & 0x000000ff00000000ULL) >>  8)
+        | ((u & 0x00000000ff000000ULL) <<  8) | ((u & 0x0000000000ff0000ULL) << 24)
+        | ((u & 0x000000000000ff00ULL) << 40) | ((u & 0x00000000000000ffULL) << 56);
+}
+#endif
+
+#if SIZEOF_SIZE_T >= 8 /* BC_UINT_T: word handled per step; BC_BSWAP: byte swap of that same width */
+#define BC_BSWAP(u) BSWAP64(u)
+#define BC_UINT_T uint64_t
+#else
+#define BC_BSWAP(u) BSWAP32(u)
+#define BC_UINT_T uint32_t
+#endif
+
+#ifdef WORDS_BIGENDIAN /* BC_LITTLE_ENDIAN is 1 unless WORDS_BIGENDIAN is defined */
+#define BC_LITTLE_ENDIAN 0
+#else
+#define BC_LITTLE_ENDIAN 1
+#endif
+
+
 /* routines */
 int _bc_do_compare (bc_num n1, bc_num n2, bool use_sign);
 bc_num _bc_do_add (bc_num n1, bc_num n2, size_t scale_min);
diff --git a/ext/bcmath/libbcmath/src/str2num.c b/ext/bcmath/libbcmath/src/str2num.c
@@ -31,6 +31,7 @@
 
 #include "bcmath.h"
 #include "convert.h"
+#include "private.h"
 #include <stdbool.h>
 #include <stddef.h>
 #ifdef __SSE2__