@@ -124,27 +124,26 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
124124bc_num _bc_do_sub (bc_num n1 , bc_num n2 , size_t scale_min )
125125{
126126 bc_num diff ;
127- size_t diff_scale , diff_len ;
128- size_t min_scale , min_len ;
129- size_t borrow , count ;
127+ /* The caller guarantees that n1 is always the larger number. */
128+ size_t diff_len = EXPECTED (n1 -> n_len >= n2 -> n_len ) ? n1 -> n_len : n2 -> n_len ;
129+ size_t diff_scale = MAX (n1 -> n_scale , n2 -> n_scale );
130+ /* Same condition as EXPECTED before, but using EXPECTED again will make it slower. */
131+ size_t min_len = n1 -> n_len >= n2 -> n_len ? n2 -> n_len : n1 -> n_len ;
132+ size_t min_scale = MIN (n1 -> n_scale , n2 -> n_scale );
133+ size_t min_bytes = min_len + min_scale ;
134+ size_t borrow = 0 ;
135+ size_t count ;
130136 int val ;
131137 char * n1ptr , * n2ptr , * diffptr ;
132138
133139 /* Allocate temporary storage. */
134- diff_len = MAX (n1 -> n_len , n2 -> n_len );
135- diff_scale = MAX (n1 -> n_scale , n2 -> n_scale );
136- min_len = MIN (n1 -> n_len , n2 -> n_len );
137- min_scale = MIN (n1 -> n_scale , n2 -> n_scale );
138140 diff = bc_new_num (diff_len , MAX (diff_scale , scale_min ));
139141
140142 /* Initialize the subtract. */
141143 n1ptr = (char * ) (n1 -> n_value + n1 -> n_len + n1 -> n_scale - 1 );
142144 n2ptr = (char * ) (n2 -> n_value + n2 -> n_len + n2 -> n_scale - 1 );
143145 diffptr = (char * ) (diff -> n_value + diff_len + diff_scale - 1 );
144146
145- /* Subtract the numbers. */
146- borrow = 0 ;
147-
148147 /* Take care of the longer scaled number. */
149148 if (n1 -> n_scale != min_scale ) {
150149 /* n1 has the longer scale */
@@ -166,7 +165,59 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
166165 }
167166
168167 /* Now do the equal length scale and integer parts. */
169- for (count = 0 ; count < min_len + min_scale ; count ++ ) {
168+ count = 0 ;
169+ /* Uses SWAR (SIMD within a register) to process sizeof(BC_UINT_T) digits per iteration. */
170+ if (min_bytes >= sizeof (BC_UINT_T )) {
171+ diffptr ++ ;
172+ n1ptr ++ ;
173+ n2ptr ++ ;
174+ while (count + sizeof (BC_UINT_T ) <= min_bytes ) {
175+ diffptr -= sizeof (BC_UINT_T );
176+ n1ptr -= sizeof (BC_UINT_T );
177+ n2ptr -= sizeof (BC_UINT_T );
178+
179+ BC_UINT_T n1bytes ;
180+ BC_UINT_T n2bytes ;
181+ memcpy (& n1bytes , n1ptr , sizeof (n1bytes ));
182+ memcpy (& n2bytes , n2ptr , sizeof (n2bytes ));
183+
184+ #if BC_LITTLE_ENDIAN
185+ /* Little endian requires changing the order of bytes. */
186+ n1bytes = BC_BSWAP (n1bytes );
187+ n2bytes = BC_BSWAP (n2bytes );
188+ #endif
189+
190+ n1bytes -= n2bytes + borrow ;
191+ /* If the most significant bit is 1, a borrow out of this word has occurred. */
192+ bool tmp_borrow = n1bytes & ((BC_UINT_T ) 1 << (8 * sizeof (BC_UINT_T ) - 1 ));
193+
194+ /*
195+ * Check the most significant bit of each byte; if it is 1, that byte borrowed from the
196+ * next higher byte. Because a byte wraps around in hexadecimal rather than in decimal,
197+ * the lower 4 bits of such a byte end up 6 too large.
198+ * Therefore, clear the upper 4 bits of every byte and, for each byte that borrowed, subtract
199+ * 6 from the lower 4 bits to adjust it to the correct value as a decimal digit.
200+ */
201+ BC_UINT_T borrow_mask = ((n1bytes & SWAR_REPEAT (0x80 )) >> 7 ) * 0x06 ;
202+ n1bytes = (n1bytes & SWAR_REPEAT (0x0F )) - borrow_mask ;
203+
204+ #if BC_LITTLE_ENDIAN
205+ /* Little endian requires changing the order of bytes back. */
206+ n1bytes = BC_BSWAP (n1bytes );
207+ #endif
208+
209+ memcpy (diffptr , & n1bytes , sizeof (n1bytes ));
210+
211+ borrow = tmp_borrow ;
212+ count += sizeof (BC_UINT_T );
213+ }
214+ diffptr -- ;
215+ n1ptr -- ;
216+ n2ptr -- ;
217+ }
218+
219+ /* Calculate the remaining bytes that are less than the size of BC_UINT_T using a normal loop. */
220+ for (; count < min_bytes ; count ++ ) {
170221 val = * n1ptr -- - * n2ptr -- - borrow ;
171222 if (val < 0 ) {
172223 val += BASE ;
0 commit comments