 #include <stdlib.h>
 #include <string.h>
 
-#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
-#include <x86intrin.h>
-#endif
-
 /* Standard C fallbacks */
 static void *mem_set16_c (void *dest, int c, visual_size_t n);
 static void *mem_set32_c (void *dest, int c, visual_size_t n);
 static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitch2, int width, int rows);
 
-/* x86 SIMD optimized versions */
-#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
-static void *mem_set16_simd_x86 (void *dest, int c, visual_size_t n);
-
-static void *mem_set32_simd_x86 (void *dest, int c, visual_size_t n);
-#endif /* VISUAL_ARCH_X86 || VISUAL_ARCH_X86_64 */
-
-
 /* Optimal performance functions set by visual_mem_initialize(). */
 VisMemCopyFunc visual_mem_copy = memcpy;
 VisMemCopyPitchFunc visual_mem_copy_pitch = mem_copy_pitch_c;
@@ -56,15 +44,7 @@ VisMemSet32Func visual_mem_set32 = mem_set32_c;
 
 void visual_mem_initialize ()
 {
-    /* Arranged from slow to fast, so the slower version gets overloaded
-     * every time */
-
-#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
-    if (visual_cpu_has_sse ()) {
-        visual_mem_set16 = mem_set16_simd_x86;
-        visual_mem_set32 = mem_set32_simd_x86;
-    }
-#endif /* VISUAL_ARCH_X86 || VISUAL_ARCH_X86_64 */
+    /* Nothing to do */
 }
 
 void *visual_mem_malloc (visual_size_t nbytes)
@@ -107,40 +87,63 @@ void visual_mem_free (void *ptr)
     free (ptr);
 }
-/* Memset functions, 2 byte memset */
 static void *mem_set16_c (void *dest, int c, visual_size_t n)
 {
-    uint32_t *d = dest;
-    uint16_t *dc = dest;
-    uint32_t setflag32 = (c & 0xffff) | ((c << 16) & 0xffff0000);
-    uint16_t setflag16 = c & 0xffff;
-
-    while (n >= 2) {
-        *d++ = setflag32;
-        n -= 2;
-    }
+    uint64_t *u64_ptr = dest;
+
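+    /* Replicate the 16-bit fill value across a 64-bit word */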
+    uint16_t copy = c & 0xffff;
+    uint32_t copy_x2 = copy | (copy << 16);
+    uint64_t copy_x4 = copy_x2 | ((uint64_t) copy_x2 << 32);
 
-    dc = (uint16_t *) d;
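+    /* Unrolled main loop: eight 64-bit stores fill 32 uint16_t elements per iteration */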
+    while (n >= 32) {
+        u64_ptr[0] = copy_x4;
+        u64_ptr[1] = copy_x4;
+        u64_ptr[2] = copy_x4;
+        u64_ptr[3] = copy_x4;
+        u64_ptr[4] = copy_x4;
+        u64_ptr[5] = copy_x4;
+        u64_ptr[6] = copy_x4;
+        u64_ptr[7] = copy_x4;
+        n -= 32;
+        u64_ptr += 8;
+    }
 
-    while (n--)
-        *dc++ = setflag16;
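+    /* Write any remaining elements one at a time */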
+    uint16_t *u16_ptr = (uint16_t *) u64_ptr;
+    while (n--) {
+        *u16_ptr++ = copy;
+    }
 
     return dest;
 }
 
-/* Memset functions, 4 byte memset */
 static void *mem_set32_c (void *dest, int c, visual_size_t n)
 {
-    uint32_t *d = dest;
-    uint32_t setflag32 = c;
+    uint64_t *u64_ptr = dest;
 
-    while (n--)
-        *d++ = setflag32;
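+    /* Replicate the 32-bit fill value across a 64-bit word */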
+    uint32_t copy = c;
+    uint64_t copy_x2 = ((uint64_t) copy) | ((uint64_t) copy << 32);
+
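+    /* Unrolled main loop: eight 64-bit stores fill 16 uint32_t elements per iteration */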
+    while (n >= 16) {
+        u64_ptr[0] = copy_x2;
+        u64_ptr[1] = copy_x2;
+        u64_ptr[2] = copy_x2;
+        u64_ptr[3] = copy_x2;
+        u64_ptr[4] = copy_x2;
+        u64_ptr[5] = copy_x2;
+        u64_ptr[6] = copy_x2;
+        u64_ptr[7] = copy_x2;
+        n -= 16;
+        u64_ptr += 8;
+    }
+
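+    /* Write any remaining elements one at a time */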
+    uint32_t *u32_ptr = (uint32_t *) u64_ptr;
+    while (n--) {
+        *u32_ptr++ = copy;
+    }
 
     return dest;
 }
 
-/* Memcopy with pitch functions */
 static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitch2, int row_bytes, int rows)
 {
     uint8_t *d = dest;
@@ -156,77 +159,3 @@ static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitc
 
     return dest;
 }
-
-#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
-
-static void *mem_set16_simd_x86 (void *dest, int c, visual_size_t n)
-{
-    // FIXME: Do we need 'dest' to be aligned for this to be performing at optimal speed?
-
-    const uint16_t copy = c & 0xffff;
-    const uint32_t copy_2x = (copy & 0xffff) | ((copy << 16) & 0xffff0000);
-    const __m64 copy_4x = _mm_set_pi32 (copy_2x, copy_2x);
-
-    __m64 *m64_ptr = (__m64 *) dest;
-
-    // Copy 32 copies each iteration
-    while (n >= 32) {
-        _mm_stream_pi (m64_ptr, copy_4x);
-        _mm_stream_pi (m64_ptr + 1, copy_4x);
-        _mm_stream_pi (m64_ptr + 2, copy_4x);
-        _mm_stream_pi (m64_ptr + 3, copy_4x);
-        _mm_stream_pi (m64_ptr + 4, copy_4x);
-        _mm_stream_pi (m64_ptr + 5, copy_4x);
-        _mm_stream_pi (m64_ptr + 6, copy_4x);
-        _mm_stream_pi (m64_ptr + 7, copy_4x);
-        m64_ptr += 8;
-        n -= 32;
-    }
-
-    uint32_t *uint32_ptr = (uint32_t *) m64_ptr;
-
-    while (n >= 2) {
-        *uint32_ptr++ = copy_2x;
-        n -= 2;
-    }
-
-    uint16_t *uint16_ptr = (uint16_t *) uint32_ptr;
-    const uint16_t setflag16 = c & 0xffff;
-
-    while (n--)
-        *uint16_ptr++ = setflag16;
-
-    return dest;
-}
-
-static void *mem_set32_simd_x86 (void *dest, int c, visual_size_t n)
-{
-    // FIXME: Do we need 'dest' to be aligned for this to be performing at optimal speed?
-
-    const uint32_t copy = c;
-    const __m64 copy_2x = _mm_set_pi32 (copy, copy);
-
-    __m64 *m64_ptr = (__m64 *) dest;
-
-    // Copy 16 copies each iteration
-    while (n >= 16) {
-        _mm_stream_pi (m64_ptr, copy_2x);
-        _mm_stream_pi (m64_ptr + 1, copy_2x);
-        _mm_stream_pi (m64_ptr + 2, copy_2x);
-        _mm_stream_pi (m64_ptr + 3, copy_2x);
-        _mm_stream_pi (m64_ptr + 4, copy_2x);
-        _mm_stream_pi (m64_ptr + 5, copy_2x);
-        _mm_stream_pi (m64_ptr + 6, copy_2x);
-        _mm_stream_pi (m64_ptr + 7, copy_2x);
-        m64_ptr += 8;
-        n -= 16;
-    }
-
-    uint32_t *uint32_ptr = (uint32_t *) m64_ptr;
-    while (n--)
-        *uint32_ptr++ = copy;
-
-    return dest;
-}
-
-#endif /* VISUAL_ARCH_X86 || VISUAL_ARCH_X86_64 */