
Commit 4b8fa11

x86-64: word-at-a-time: improve byte count calculations
This switches x86-64 over to using 'tzcount' instead of the integer multiply trick to turn the bytemask information into actual byte counts. We even had a comment saying that a fast bit count instruction is better than a multiply, but x86 bit counting has traditionally been "questionably fast", and so avoiding it was the right thing back in the day.

Now, on any half-way modern core, using bit counting is cheaper and smaller than the large constant multiply, so let's just switch over.

Note that as part of switching over to counting bits, we also do it at a different point. We used to create the byte count from the final byte mask, but once you use the 'tzcount' instruction (aka 'bsf' on older CPUs), you can actually count the trailing zeroes using a value we have available earlier.

In fact, we can just use the very first mask of bits that tells us whether we have any zero bytes at all. The zero bytes in the word will have the high bit set, so just doing 'tzcount' on that value and dividing by 8 will give the number of bytes that precede the first NUL character, which is exactly what we want.

Note also that the input value to the tzcount is by definition not zero, since that is the condition that we already used to check the whole "do we have any zero bytes at all". So we don't need to worry about the legacy instruction behavior of pre-tzcount days, when 'bsf' didn't have a defined result for zero input.

The 32-bit code continues to use the simple bit op trick that is faster even on newer cores, but particularly on the older 32-bit-only ones.

Signed-off-by: Linus Torvalds <[email protected]>
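To make the arithmetic concrete, here is a minimal userspace sketch of the trick the message describes. This is not the kernel code: it assumes a 64-bit unsigned long, the constants mirror WORD_AT_A_TIME_CONSTANTS, the helper name zero_byte_bits and the test string are made up, and __builtin_ctzl() stands in for 'tzcount'.

#include <stdio.h>
#include <string.h>

#define ONE_BITS  0x0101010101010101ul  /* REPEAT_BYTE(0x01) */
#define HIGH_BITS 0x8080808080808080ul  /* REPEAT_BYTE(0x80) */

/* Same idea as the kernel's has_zero(): set the high bit of every byte
 * that is zero. Bytes above the first zero byte can be flagged
 * spuriously by the borrow, but the lowest set bit always lands in the
 * first zero byte, which is all the trailing-zero count needs. */
static unsigned long zero_byte_bits(unsigned long a)
{
        return (a - ONE_BITS) & ~a & HIGH_BITS;
}

int main(void)
{
        unsigned long word;

        memcpy(&word, "abc\0defg", sizeof(word));  /* NUL at byte 3 */

        unsigned long bits = zero_byte_bits(word);
        if (bits)  /* guaranteed nonzero before counting, as in the commit */
                printf("bytes before NUL: %d\n", __builtin_ctzl(bits) >> 3);
        return 0;
}

On x86-64 a compiler will typically turn the __builtin_ctzl() call into a single 'tzcnt' (or 'bsf'), which is the instruction the commit message refers to; the `>> 3` is the divide-by-8 from bit index to byte index.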
1 parent 6ba59ff commit 4b8fa11

File tree

1 file changed (+23, -34 lines)


arch/x86/include/asm/word-at-a-time.h

Lines changed: 23 additions & 34 deletions
@@ -5,45 +5,12 @@
 #include <linux/bitops.h>
 #include <linux/wordpart.h>
 
-/*
- * This is largely generic for little-endian machines, but the
- * optimal byte mask counting is probably going to be something
- * that is architecture-specific. If you have a reliably fast
- * bit count instruction, that might be better than the multiply
- * and shift, for example.
- */
 struct word_at_a_time {
 	const unsigned long one_bits, high_bits;
 };
 
 #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
 
-#ifdef CONFIG_64BIT
-
-/*
- * Jan Achrenius on G+: microoptimized version of
- * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
- * that works for the bytemasks without having to
- * mask them first.
- */
-static inline long count_masked_bytes(unsigned long mask)
-{
-	return mask*0x0001020304050608ul >> 56;
-}
-
-#else /* 32-bit case */
-
-/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
-static inline long count_masked_bytes(long mask)
-{
-	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
-	long a = (0x0ff0001+mask) >> 23;
-	/* Fix the 1 for 00 case */
-	return a & mask;
-}
-
-#endif
-
 /* Return nonzero if it has a zero */
 static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
 {
@@ -57,6 +24,22 @@ static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits,
 	return bits;
 }
 
+#ifdef CONFIG_64BIT
+
+/* Keep the initial has_zero() value for both bitmask and size calc */
+#define create_zero_mask(bits) (bits)
+
+static inline unsigned long zero_bytemask(unsigned long bits)
+{
+	bits = (bits - 1) & ~bits;
+	return bits >> 7;
+}
+
+#define find_zero(bits) (__ffs(bits) >> 3)
+
+#else
+
+/* Create the final mask for both bytemask and size */
 static inline unsigned long create_zero_mask(unsigned long bits)
 {
 	bits = (bits - 1) & ~bits;
@@ -66,11 +49,17 @@ static inline unsigned long create_zero_mask(unsigned long bits)
 /* The mask we created is directly usable as a bytemask */
 #define zero_bytemask(mask) (mask)
 
+/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
 static inline unsigned long find_zero(unsigned long mask)
 {
-	return count_masked_bytes(mask);
+	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
+	long a = (0x0ff0001+mask) >> 23;
+	/* Fix the 1 for 00 case */
+	return a & mask;
 }
 
+#endif
+
 /*
  * Load an unaligned word from kernel space.
  *
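For the 32-bit side, the constant-add trick that this patch folds into find_zero() can be sanity-checked on its own. A standalone userspace sketch (the name find_zero32 is made up for the illustration) that runs the mask-to-count mapping from the patch's comment:

#include <stdio.h>

/* The 32-bit find_zero() body from the patch: create_zero_mask() turns
 * the has_zero() bits into 0x00000000 / 0x000000ff / 0x0000ffff /
 * 0x00ffffff, and this arithmetic maps those to 0 / 1 / 2 / 3 bytes
 * preceding the first NUL. */
static unsigned long find_zero32(unsigned long mask)
{
        /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
        long a = (0x0ff0001 + mask) >> 23;
        /* Fix the 1 for 00 case */
        return a & mask;
}

int main(void)
{
        const unsigned long masks[] = {
                0x00000000, 0x000000ff, 0x0000ffff, 0x00ffffff
        };

        for (int i = 0; i < 4; i++)  /* prints 0, 1, 2, 3 */
                printf("mask %08lx -> %lu\n", masks[i], find_zero32(masks[i]));
        return 0;
}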
