11/* Libvisual - The audio visualisation framework.
22 *
3- * Copyright (C) 2012 Libvisual team
3+ * Copyright (C) 2012-2023 Libvisual team
44 * 2004-2006 Dennis Smit
55 *
66 * Authors: Dennis Smit <[email protected] > 2828#include <stdlib.h>
2929#include <string.h>
3030
31+ #if defined(VISUAL_ARCH_X86 ) || defined(VISUAL_ARCH_X86_64 )
32+ #include <x86intrin.h>
33+ #endif
34+
3135/* Standard C fallbacks */
3236static void * mem_set16_c (void * dest , int c , visual_size_t n );
3337static void * mem_set32_c (void * dest , int c , visual_size_t n );
3438static void * mem_copy_pitch_c (void * dest , const void * src , int pitch1 , int pitch2 , int width , int rows );
3539
40+ #if defined(VISUAL_ARCH_X86 ) || defined(VISUAL_ARCH_X86_64 )
41+ static void * mem_set16_x86_sse2 (void * dest , int c , size_t n );
42+ static void * mem_set32_x86_sse2 (void * dest , int c , size_t n );
43+ #endif
44+
3645/* Optimal performance functions set by visual_mem_initialize(). */
3746VisMemCopyFunc visual_mem_copy = memcpy ;
3847VisMemCopyPitchFunc visual_mem_copy_pitch = mem_copy_pitch_c ;
@@ -44,7 +53,14 @@ VisMemSet32Func visual_mem_set32 = mem_set32_c;
4453
/**
 * Selects the memory routines used through the visual_mem_set16 and
 * visual_mem_set32 function pointers.
 *
 * On builds where VISUAL_ARCH_X86 or VISUAL_ARCH_X86_64 is defined, and
 * visual_cpu_has_sse2() reports true, both pointers are redirected to the
 * SSE2 implementations. In every other case the pointers are left untouched.
 */
void visual_mem_initialize (void)
{
    /* Select optimized routines for selected CPU architectures. */

#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
    if (visual_cpu_has_sse2 ()) {
        visual_mem_set16 = mem_set16_x86_sse2;
        visual_mem_set32 = mem_set32_x86_sse2;
    }
#endif
}
4965
5066void * visual_mem_malloc (visual_size_t nbytes )
@@ -116,6 +132,37 @@ static void *mem_set16_c (void *dest, int c, visual_size_t n)
116132 return dest ;
117133}
118134
135+ #if defined(VISUAL_ARCH_X86 ) || defined(VISUAL_ARCH_X86_64 )
/**
 * Fills a buffer with a repeated 16-bit value using SSE2 stores.
 *
 * @param dest Start of the buffer to fill. Only unaligned stores are issued,
 *             so no 16-byte alignment is required.
 * @param c    Fill value; only the low 16 bits are used.
 * @param n    Number of 16-bit elements to write.
 *
 * @return dest, unchanged.
 */
static void *mem_set16_x86_sse2 (void *dest, int c, size_t n)
{
    const uint16_t copy = c & 0xffff;

    /* Widen before shifting: a uint16_t promotes to (signed) int, so
     * shifting a value with bit 15 set left by 16 would be undefined. */
    const uint32_t copy_2x = (uint32_t) copy | ((uint32_t) copy << 16);

    /* Eight copies of the 16-bit value packed into one 128-bit register. */
    const __m128i copy_8x = _mm_set1_epi32 ((int) copy_2x);

    __m128i *m128i_ptr = (__m128i *) dest;

    /* Bulk phase: each iteration writes 8 x 16 bytes = 64 elements. */
    while (n >= 64) {
        _mm_storeu_si128 (m128i_ptr,     copy_8x);
        _mm_storeu_si128 (m128i_ptr + 1, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 2, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 3, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 4, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 5, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 6, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 7, copy_8x);
        n -= 64;
        m128i_ptr += 8;
    }

    /* Tail phase: fewer than 64 elements remain; write them one by one. */
    uint16_t *uint16_ptr = (uint16_t *) m128i_ptr;
    while (n--) {
        *uint16_ptr++ = copy;
    }

    return dest;
}
164+ #endif /* defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64) */
165+
119166static void * mem_set32_c (void * dest , int c , visual_size_t n )
120167{
121168 uint64_t * u64_ptr = dest ;
@@ -144,6 +191,35 @@ static void *mem_set32_c (void *dest, int c, visual_size_t n)
144191 return dest ;
145192}
146193
194+ #if defined(VISUAL_ARCH_X86 ) || defined(VISUAL_ARCH_X86_64 )
/**
 * Fills a buffer with a repeated 32-bit value using SSE2 stores.
 *
 * @param dest Start of the buffer to fill. Only unaligned stores are issued,
 *             so no 16-byte alignment is required.
 * @param c    Fill value, written as a 32-bit word.
 * @param n    Number of 32-bit elements to write.
 *
 * @return dest, unchanged.
 */
static void *mem_set32_x86_sse2 (void *dest, int c, size_t n)
{
    /* One chunk is 8 vectors of 4 words each, i.e. 32 elements. */
    enum { VECTORS_PER_CHUNK = 8, WORDS_PER_CHUNK = 32 };

    const uint32_t value   = c;
    const __m128i  pattern = _mm_set_epi32 (value, value, value, value);

    __m128i *vec = (__m128i *) dest;

    /* Bulk phase: whole chunks of 32 elements. */
    for (; n >= WORDS_PER_CHUNK; n -= WORDS_PER_CHUNK, vec += VECTORS_PER_CHUNK) {
        for (int i = 0; i < VECTORS_PER_CHUNK; i++) {
            _mm_storeu_si128 (vec + i, pattern);
        }
    }

    /* Tail phase: the remaining (< 32) elements, one word at a time. */
    uint32_t *tail = (uint32_t *) vec;
    for (size_t i = 0; i < n; i++) {
        tail[i] = value;
    }

    return dest;
}
221+ #endif /* defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64) */
222+
147223static void * mem_copy_pitch_c (void * dest , const void * src , int pitch1 , int pitch2 , int row_bytes , int rows )
148224{
149225 uint8_t * d = dest ;
0 commit comments