@@ -38,6 +38,147 @@ struct min_heap_callbacks {
 	void (*swp)(void *lhs, void *rhs, void *args);
 };
 
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be aligned as well unless we have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+	unsigned char lsbits = (unsigned char)size;
+
+	(void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+	return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static __always_inline
+void swap_words_32(void *a, void *b, size_t n)
+{
+	do {
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+	} while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static __always_inline
+void swap_words_64(void *a, void *b, size_t n)
+{
+	do {
+#ifdef CONFIG_64BIT
+		u64 t = *(u64 *)(a + (n -= 8));
+		*(u64 *)(a + n) = *(u64 *)(b + n);
+		*(u64 *)(b + n) = t;
+#else
+		/* Use two 32-bit transfers to avoid base+index+4 addressing */
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+
+		t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+#endif
+	} while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static __always_inline
+void swap_bytes(void *a, void *b, size_t n)
+{
+	do {
+		char t = ((char *)a)[--n];
+		((char *)a)[n] = ((char *)b)[n];
+		((char *)b)[n] = t;
+	} while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0)
+#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1)
+#define SWAP_BYTES ((void (*)(void *, void *, void *))2)
+
+/*
+ * Selects the appropriate swap function based on the element size and
+ * alignment.
+ */
+static __always_inline
+void *select_swap_func(const void *base, size_t size)
+{
+	if (is_aligned(base, size, 8))
+		return SWAP_WORDS_64;
+	else if (is_aligned(base, size, 4))
+		return SWAP_WORDS_32;
+	else
+		return SWAP_BYTES;
+}
+
+static __always_inline
+void do_swap(void *a, void *b, size_t size, void (*swap_func)(void *lhs, void *rhs, void *args),
+	     void *priv)
+{
+	if (swap_func == SWAP_WORDS_64)
+		swap_words_64(a, b, size);
+	else if (swap_func == SWAP_WORDS_32)
+		swap_words_32(a, b, size);
+	else if (swap_func == SWAP_BYTES)
+		swap_bytes(a, b, size);
+	else
+		swap_func(a, b, priv);
+}
+
 /**
  * parent - given the offset of the child, find the offset of the parent.
  * @i: the offset of the heap element whose parent is sought. Non-zero.
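The helpers added in this hunk (is_aligned(), the swap_words_*()/swap_bytes() routines, the sentinel macros, and do_swap()) follow the same run-time swap selection approach used in lib/sort.c. As a quick illustration of how select_swap_func() classifies elements, here is a minimal userspace sketch of the technique; the typedefs, element types, and main() are illustrative assumptions and not part of the patch, and the kernel version additionally honours CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;
typedef uint64_t u64;

/* Sentinels: small integers that cannot be confused with a real function pointer. */
#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0)
#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1)
#define SWAP_BYTES    ((void (*)(void *, void *, void *))2)

/* Strict variant of the check: always require the base pointer to be aligned too. */
static int is_aligned(const void *base, size_t size, unsigned char align)
{
	unsigned char lsbits = (unsigned char)size;

	lsbits |= (unsigned char)(uintptr_t)base;
	return (lsbits & (align - 1)) == 0;
}

static void *select_swap_func(const void *base, size_t size)
{
	if (is_aligned(base, size, 8))
		return SWAP_WORDS_64;
	if (is_aligned(base, size, 4))
		return SWAP_WORDS_32;
	return SWAP_BYTES;
}

int main(void)
{
	struct { u64 key; u64 val; } a[2];	/* 16-byte elements, 8-byte aligned */
	struct { u32 x, y, z; } b[2];		/* 12-byte elements, 4-byte aligned */
	char c[2][3];				/*  3-byte elements                 */
	void (*swp)(void *, void *, void *);

	/* void * -> function pointer conversion is a GCC extension, as in the kernel. */
	swp = select_swap_func(a, sizeof(a[0]));
	printf("a -> %s\n", swp == SWAP_WORDS_64 ? "swap_words_64" : "?");

	swp = select_swap_func(b, sizeof(b[0]));
	printf("b -> %s\n", swp == SWAP_WORDS_32 ? "swap_words_32" : "?");

	swp = select_swap_func(c, sizeof(c[0]));
	printf("c -> %s\n", swp == SWAP_BYTES ? "swap_bytes" : "?");
	return 0;
}

On a typical 64-bit build this prints swap_words_64, swap_words_32 and swap_bytes respectively: the 12-byte element is rejected for 64-bit copies because its size is not a multiple of 8, regardless of where the array happens to sit.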
@@ -106,11 +247,15 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size,
 {
 	const unsigned long lsbit = elem_size & -elem_size;
 	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
 	/* pre-scale counters for performance */
 	size_t a = pos * elem_size;
 	size_t b, c, d;
 	size_t n = heap->nr * elem_size;
 
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
 	/* Find the sift-down path all the way to the leaves. */
 	for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;)
 		b = func->less(data + c, data + d, args) ? c : d;
@@ -127,7 +272,7 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size,
 	c = b;
 	while (b != a) {
 		b = parent(b, lsbit, elem_size);
-		func->swp(data + b, data + c, args);
+		do_swap(data + b, data + c, elem_size, swp, args);
 	}
 }
 
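With swp resolved once at the top of the function (either the caller's callback or a sentinel from select_swap_func()), every exchange in the loop above now goes through do_swap(), which decodes the sentinel back into a concrete routine. A minimal userspace sketch of that dispatch path for 8-byte-aligned, 16-byte elements; the typedef, element type, and main() are illustrative assumptions, not part of the patch:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0)

/* The built-in 64-bit swap from the patch: count n down to zero in 8-byte steps.
 * Arithmetic on void * is a GCC extension, as in the kernel sources.
 */
static void swap_words_64(void *a, void *b, size_t n)
{
	do {
		u64 t = *(u64 *)(a + (n -= 8));
		*(u64 *)(a + n) = *(u64 *)(b + n);
		*(u64 *)(b + n) = t;
	} while (n);
}

/* Reduced do_swap(): decode the sentinel, or fall back to a real callback. */
static void do_swap(void *a, void *b, size_t size,
		    void (*swap_func)(void *lhs, void *rhs, void *args), void *priv)
{
	if (swap_func == SWAP_WORDS_64)
		swap_words_64(a, b, size);
	else
		swap_func(a, b, priv);
}

int main(void)
{
	struct pair { u64 key; u64 val; } x = { 1, 2 }, y = { 3, 4 };

	do_swap(&x, &y, sizeof(struct pair), SWAP_WORDS_64, NULL);
	printf("x = {%llu, %llu}  y = {%llu, %llu}\n",
	       (unsigned long long)x.key, (unsigned long long)x.val,
	       (unsigned long long)y.key, (unsigned long long)y.val);
	return 0;
}

Because the sentinels are tiny constants, the three comparisons in do_swap() compile to very small immediate compares, which is the point of the "small integers make for the smallest compare instructions" remark above.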
@@ -142,14 +287,18 @@ void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx
 {
 	const unsigned long lsbit = elem_size & -elem_size;
 	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
 	/* pre-scale counters for performance */
 	size_t a = idx * elem_size, b;
 
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
 	while (a) {
 		b = parent(a, lsbit, elem_size);
 		if (func->less(data + b, data + a, args))
 			break;
-		func->swp(data + a, data + b, args);
+		do_swap(data + a, data + b, elem_size, swp, args);
 		a = b;
 	}
 }
@@ -242,15 +391,19 @@ bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx,
 			   const struct min_heap_callbacks *func, void *args)
 {
 	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
 
 	if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
 		return false;
 
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
 	/* Place last element at the root (position 0) and then sift down. */
 	heap->nr--;
 	if (idx == heap->nr)
 		return true;
-	func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args);
+	do_swap(data + (idx * elem_size), data + (heap->nr * elem_size), elem_size, swp, args);
 	__min_heap_sift_up_inline(heap, elem_size, idx, func, args);
 	__min_heap_sift_down_inline(heap, idx, elem_size, func, args);
 
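Taken together, these hunks let a min-heap user leave .swp as NULL and get a size- and alignment-appropriate built-in swap. A hedged sketch of what such a caller might look like, assuming the DEFINE_MIN_HEAP()/min_heap_init()/min_heap_push()/min_heap_pop() helpers from include/linux/min_heap.h (the element type, callback, and function names below are illustrative, not taken from this commit):

/* Sketch only: a caller relying on the new default swap. API names are
 * assumed from include/linux/min_heap.h; adjust to the tree you build against.
 */
#include <linux/min_heap.h>

DEFINE_MIN_HEAP(u64, min_heap_u64);

static bool u64_less(const void *lhs, const void *rhs, void *args)
{
	return *(const u64 *)lhs < *(const u64 *)rhs;
}

static const struct min_heap_callbacks u64_cbs = {
	.less = u64_less,
	.swp  = NULL,	/* 8-byte, 8-byte-aligned elements -> built-in swap_words_64() */
};

static void example(void)
{
	u64 buf[8];
	struct min_heap_u64 heap;
	u64 v = 42;

	min_heap_init(&heap, buf, ARRAY_SIZE(buf));
	min_heap_push(&heap, &v, &u64_cbs, NULL);
	min_heap_pop(&heap, &u64_cbs, NULL);
}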