 #include <stdlib.h>
 #include <string.h>
 
+#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
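+/* x86intrin.h is the GCC/Clang umbrella header for x86 intrinsics; it
+ * declares _mm_set_pi32, _mm_stream_pi and _mm_empty used below. */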
+#include <x86intrin.h>
+#endif
+
 /* Standard C fallbacks */
 static void *mem_set16_c (void *dest, int c, visual_size_t n);
 static void *mem_set32_c (void *dest, int c, visual_size_t n);
 static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitch2, int width, int rows);
 
 /* x86 SIMD optimized versions */
 #if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
-static void *mem_set16_mmx (void *dest, int c, visual_size_t n);
-static void *mem_set16_mmx2 (void *dest, int c, visual_size_t n);
+static void *mem_set16_simd_x86 (void *dest, int c, visual_size_t n);
 
-static void *mem_set32_mmx (void *dest, int c, visual_size_t n);
-static void *mem_set32_mmx2 (void *dest, int c, visual_size_t n);
+static void *mem_set32_simd_x86 (void *dest, int c, visual_size_t n);
 #endif /* VISUAL_ARCH_X86 || VISUAL_ARCH_X86_64 */
 
 
@@ -58,20 +60,10 @@ void visual_mem_initialize ()
  * every time */
 
 #if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
-
-	if (visual_cpu_has_mmx ()) {
-		visual_mem_set16 = mem_set16_mmx;
-		visual_mem_set32 = mem_set32_mmx;
-	}
-
-	/* The k6-II and k6-III don't have mmx2, but of course can use the prefetch
-	 * facility that 3dnow provides. */
-
-	if (visual_cpu_has_mmx2 ()) {
-		visual_mem_set16 = mem_set16_mmx2;
-		visual_mem_set32 = mem_set32_mmx2;
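+	/* _mm_stream_pi () compiles to MOVNTQ, which was introduced with SSE,
+	 * so a single SSE check is enough to gate both SIMD helpers. */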
+	if (visual_cpu_has_sse ()) {
+		visual_mem_set16 = mem_set16_simd_x86;
+		visual_mem_set32 = mem_set32_simd_x86;
 	}
-
 #endif /* VISUAL_ARCH_X86 || VISUAL_ARCH_X86_64 */
 }
 
@@ -120,9 +112,7 @@ static void *mem_set16_c (void *dest, int c, visual_size_t n)
 {
 	uint32_t *d = dest;
 	uint16_t *dc = dest;
-	uint32_t setflag32 =
-		(c & 0xffff) |
-		((c << 16) & 0xffff0000);
+	uint32_t setflag32 = (c & 0xffff) | ((c << 16) & 0xffff0000);
 	uint16_t setflag16 = c & 0xffff;
 
 	while (n >= 2) {
@@ -169,204 +159,72 @@ static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitch2, int width, int rows)
 
 #if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
 
-static void *mem_set16_mmx (void *dest, int c, visual_size_t n)
+static void *mem_set16_simd_x86 (void *dest, int c, visual_size_t n)
 {
-	uint32_t *d = dest;
-	uint16_t *dc = dest;
-	uint32_t setflag32 =
-		(c & 0xffff) |
-		((c << 16) & 0xffff0000);
-	uint16_t setflag16 = c & 0xffff;
+	/* FIXME: Does 'dest' need to be 8-byte aligned for optimal speed? */
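+	/* MOVNTQ itself has no alignment requirement (unlike MOVNTDQ), but the
+	 * write-combining buffers drain most efficiently on aligned,
+	 * cache-line-sized runs. */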
 
-	__asm __volatile
-		("\n\t movd (%0), %%mm0"
-		 "\n\t movd (%0), %%mm1"
-		 "\n\t psllq $32, %%mm1"
-		 "\n\t por %%mm1, %%mm0"
-		 "\n\t movq %%mm0, %%mm2"
-		 "\n\t movq %%mm0, %%mm1"
-		 "\n\t movq %%mm2, %%mm3"
-		 "\n\t movq %%mm1, %%mm4"
-		 "\n\t movq %%mm0, %%mm5"
-		 "\n\t movq %%mm2, %%mm6"
-		 "\n\t movq %%mm1, %%mm7"
-		 :: "r" (&setflag32) : "memory");
-
-	while (n >= 64) {
-		__asm __volatile
-			("\n\t movq %%mm0, (%0)"
-			 "\n\t movq %%mm1, 8(%0)"
-			 "\n\t movq %%mm2, 16(%0)"
-			 "\n\t movq %%mm3, 24(%0)"
-			 "\n\t movq %%mm4, 32(%0)"
-			 "\n\t movq %%mm5, 40(%0)"
-			 "\n\t movq %%mm6, 48(%0)"
-			 "\n\t movq %%mm7, 56(%0)"
-			 :: "r" (d) : "memory");
-
-		d += 16;
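+	/* Replicate the 16-bit value into a 32-bit word, then into all four
+	 * 16-bit lanes of a 64-bit MMX register. */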
+	const uint16_t copy = c & 0xffff;
+	const uint32_t copy_2x = copy | ((uint32_t) copy << 16);
+	const __m64 copy_4x = _mm_set_pi32 (copy_2x, copy_2x);
 
-		n -= 32;
-	}
-
-	__asm __volatile
-		("\n\t emms");
-
-	while (n >= 2) {
-		*d++ = setflag32;
-		n -= 2;
-	}
-
-	dc = (uint16_t *) d;
-
-	while (n--)
-		*dc++ = setflag16;
-
-	return dest;
-}
-
-static void *mem_set16_mmx2 (void *dest, int c, visual_size_t n)
-{
-	uint32_t *d = dest;
-	uint16_t *dc = dest;
-	uint32_t setflag32 =
-		(c & 0xffff) |
-		((c << 16) & 0xffff0000);
-	uint16_t setflag16 = c & 0xffff;
-
-	__asm __volatile
-		("\n\t movd (%0), %%mm0"
-		 "\n\t movd (%0), %%mm1"
-		 "\n\t psllq $32, %%mm1"
-		 "\n\t por %%mm1, %%mm0"
-		 "\n\t movq %%mm0, %%mm2"
-		 "\n\t movq %%mm0, %%mm1"
-		 "\n\t movq %%mm2, %%mm3"
-		 "\n\t movq %%mm1, %%mm4"
-		 "\n\t movq %%mm0, %%mm5"
-		 "\n\t movq %%mm2, %%mm6"
-		 "\n\t movq %%mm1, %%mm7"
-		 :: "r" (&setflag32) : "memory");
+	__m64 *m64_ptr = (__m64 *) dest;
 
+	/* Stream 32 16-bit values (64 bytes) per iteration, bypassing the cache. */
 	while (n >= 32) {
-		__asm __volatile
-			("\n\t movntq %%mm0, (%0)"
-			 "\n\t movntq %%mm1, 8(%0)"
-			 "\n\t movntq %%mm2, 16(%0)"
-			 "\n\t movntq %%mm3, 24(%0)"
-			 "\n\t movntq %%mm4, 32(%0)"
-			 "\n\t movntq %%mm5, 40(%0)"
-			 "\n\t movntq %%mm6, 48(%0)"
-			 "\n\t movntq %%mm7, 56(%0)"
-			 :: "r" (d) : "memory");
-
-		d += 16;
-
+		_mm_stream_pi (m64_ptr, copy_4x);
+		_mm_stream_pi (m64_ptr + 1, copy_4x);
+		_mm_stream_pi (m64_ptr + 2, copy_4x);
+		_mm_stream_pi (m64_ptr + 3, copy_4x);
+		_mm_stream_pi (m64_ptr + 4, copy_4x);
+		_mm_stream_pi (m64_ptr + 5, copy_4x);
+		_mm_stream_pi (m64_ptr + 6, copy_4x);
+		_mm_stream_pi (m64_ptr + 7, copy_4x);
+		m64_ptr += 8;
 		n -= 32;
 	}
 
-	__asm __volatile
-		("\n\t emms");
+	/* The MMX registers are still live after _mm_stream_pi (); leave MMX
+	 * state before any following floating point code runs. */
+	_mm_empty ();
+
+	uint32_t *uint32_ptr = (uint32_t *) m64_ptr;
 
 	while (n >= 2) {
-		*d++ = setflag32;
+		*uint32_ptr++ = copy_2x;
 		n -= 2;
 	}
 
-	dc = (uint16_t *) d;
-
-	while (n--)
-		*dc++ = setflag16;
-
-	return dest;
-}
-
-static void *mem_set32_mmx (void *dest, int c, visual_size_t n)
-{
-	uint32_t *d = dest;
-	uint32_t setflag32 = c;
-
-	__asm __volatile
-		("\n\t movd (%0), %%mm0"
-		 "\n\t movd (%0), %%mm1"
-		 "\n\t psllq $32, %%mm1"
-		 "\n\t por %%mm1, %%mm0"
-		 "\n\t movq %%mm0, %%mm2"
-		 "\n\t movq %%mm0, %%mm1"
-		 "\n\t movq %%mm2, %%mm3"
-		 "\n\t movq %%mm1, %%mm4"
-		 "\n\t movq %%mm0, %%mm5"
-		 "\n\t movq %%mm2, %%mm6"
-		 "\n\t movq %%mm1, %%mm7"
-		 :: "r" (&setflag32) : "memory");
-
-	while (n >= 64) {
-		__asm __volatile
-			("\n\t movq %%mm0, (%0)"
-			 "\n\t movq %%mm1, 8(%0)"
-			 "\n\t movq %%mm2, 16(%0)"
-			 "\n\t movq %%mm3, 24(%0)"
-			 "\n\t movq %%mm4, 32(%0)"
-			 "\n\t movq %%mm5, 40(%0)"
-			 "\n\t movq %%mm6, 48(%0)"
-			 "\n\t movq %%mm7, 56(%0)"
-			 :: "r" (d) : "memory");
-
-		d += 16;
-
-		n -= 16;
-	}
-
-	__asm __volatile
-		("\n\t emms");
+	uint16_t *uint16_ptr = (uint16_t *) uint32_ptr;
 
 	while (n--)
-		*d++ = setflag32;
+		*uint16_ptr++ = copy;
 
 	return dest;
 }
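+/* Note: _mm_stream_pi () issues weakly ordered non-temporal stores; callers
+ * that immediately hand the buffer to another core may also want an
+ * _mm_sfence (). */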
 
-static void *mem_set32_mmx2 (void *dest, int c, visual_size_t n)
+static void *mem_set32_simd_x86 (void *dest, int c, visual_size_t n)
 {
-	uint32_t *d = dest;
-	uint32_t setflag32 = c;
-
-	__asm __volatile
-		("\n\t movd (%0), %%mm0"
-		 "\n\t movd (%0), %%mm1"
-		 "\n\t psllq $32, %%mm1"
-		 "\n\t por %%mm1, %%mm0"
-		 "\n\t movq %%mm0, %%mm2"
-		 "\n\t movq %%mm0, %%mm1"
-		 "\n\t movq %%mm2, %%mm3"
-		 "\n\t movq %%mm1, %%mm4"
-		 "\n\t movq %%mm0, %%mm5"
-		 "\n\t movq %%mm2, %%mm6"
-		 "\n\t movq %%mm1, %%mm7"
-		 :: "r" (&setflag32) : "memory");
-
-	while (n >= 64) {
-		__asm __volatile
-			("\n\t movntq %%mm0, (%0)"
-			 "\n\t movntq %%mm1, 8(%0)"
-			 "\n\t movntq %%mm2, 16(%0)"
-			 "\n\t movntq %%mm3, 24(%0)"
-			 "\n\t movntq %%mm4, 32(%0)"
-			 "\n\t movntq %%mm5, 40(%0)"
-			 "\n\t movntq %%mm6, 48(%0)"
-			 "\n\t movntq %%mm7, 56(%0)"
-			 :: "r" (d) : "memory");
-
-		d += 16;
-
+	/* FIXME: Does 'dest' need to be 8-byte aligned for optimal speed? */
+
+	const uint32_t copy = c;
+	const __m64 copy_2x = _mm_set_pi32 (copy, copy);
+
+	__m64 *m64_ptr = (__m64 *) dest;
+
+	/* Stream 16 32-bit values (64 bytes) per iteration, bypassing the cache. */
+	while (n >= 16) {
+		_mm_stream_pi (m64_ptr, copy_2x);
+		_mm_stream_pi (m64_ptr + 1, copy_2x);
+		_mm_stream_pi (m64_ptr + 2, copy_2x);
+		_mm_stream_pi (m64_ptr + 3, copy_2x);
+		_mm_stream_pi (m64_ptr + 4, copy_2x);
+		_mm_stream_pi (m64_ptr + 5, copy_2x);
+		_mm_stream_pi (m64_ptr + 6, copy_2x);
+		_mm_stream_pi (m64_ptr + 7, copy_2x);
+		m64_ptr += 8;
 		n -= 16;
 	}
 
-	__asm __volatile
-		("\n\t emms");
-
+	/* Leave MMX state, as above. */
+	_mm_empty ();
+
+	uint32_t *uint32_ptr = (uint32_t *) m64_ptr;
 	while (n--)
-		*d++ = setflag32;
+		*uint32_ptr++ = copy;
 
 	return dest;
 }