Skip to content

Commit dd76f9c

Browse files
committed
Core (Memory): Add SSE2 intrinsics implementation of mem_set16() and mem_set32().
1 parent 98aeeaa commit dd76f9c

File tree

1 file changed

+78
-2
lines changed

1 file changed

+78
-2
lines changed

libvisual/libvisual/lv_mem.c

Lines changed: 78 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* Libvisual - The audio visualisation framework.
22
*
3-
* Copyright (C) 2012 Libvisual team
3+
* Copyright (C) 2012-2023 Libvisual team
44
* 2004-2006 Dennis Smit
55
*
66
* Authors: Dennis Smit <[email protected]>
@@ -28,11 +28,20 @@
2828
#include <stdlib.h>
2929
#include <string.h>
3030

31+
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
32+
#include <x86intrin.h>
33+
#endif
34+
3135
/* Standard C fallbacks */
3236
static void *mem_set16_c (void *dest, int c, visual_size_t n);
3337
static void *mem_set32_c (void *dest, int c, visual_size_t n);
3438
static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitch2, int width, int rows);
3539

40+
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
41+
static void *mem_set16_x86_sse2 (void *dest, int c, size_t n);
42+
static void *mem_set32_x86_sse2 (void *dest, int c, size_t n);
43+
#endif
44+
3645
/* Optimal performance functions set by visual_mem_initialize(). */
3746
VisMemCopyFunc visual_mem_copy = memcpy;
3847
VisMemCopyPitchFunc visual_mem_copy_pitch = mem_copy_pitch_c;
@@ -44,7 +53,14 @@ VisMemSet32Func visual_mem_set32 = mem_set32_c;
4453

4554
/**
 * Selects the memory routines used through the visual_mem_set16 and
 * visual_mem_set32 function pointers.
 *
 * On x86 / x86-64 builds, when visual_cpu_has_sse2() reports true, both
 * pointers are switched to the SSE2 implementations. In every other case
 * the pointers keep whatever value they already had.
 */
void visual_mem_initialize (void)
{
    /* Select optimized routines for selected CPU architectures. */

#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
    if (visual_cpu_has_sse2 ()) {
        visual_mem_set16 = mem_set16_x86_sse2;
        visual_mem_set32 = mem_set32_x86_sse2;
    }
#endif
}
4965

5066
void *visual_mem_malloc (visual_size_t nbytes)
@@ -116,6 +132,37 @@ static void *mem_set16_c (void *dest, int c, visual_size_t n)
116132
return dest;
117133
}
118134

135+
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
136+
/**
 * Fills a buffer with a repeated 16-bit value using SSE2 stores.
 *
 * The bulk of the buffer is written with unaligned 128-bit stores
 * (_mm_storeu_si128), so dest does not need to be 16-byte aligned.
 * Whatever is left over is written one 16-bit element at a time.
 *
 * @param dest Start of the buffer to fill.
 * @param c    Fill value; only the low 16 bits are used.
 * @param n    Number of 16-bit elements to write.
 *
 * @return dest
 */
static void *mem_set16_x86_sse2 (void *dest, int c, size_t n)
{
    const uint16_t copy = c & 0xffff;

    /* Widen before shifting: a uint16_t operand is promoted to signed int,
     * and shifting a value >= 0x8000 left by 16 bits would overflow it. */
    const uint32_t copy_2x = (uint32_t) copy | ((uint32_t) copy << 16);

    /* Eight copies of the 16-bit value packed into one 128-bit register. */
    const __m128i copy_8x = _mm_set1_epi32 ((int) copy_2x);

    __m128i *m128i_ptr = (__m128i *) dest;

    /* Each iteration writes 8 x 16 bytes, i.e. 64 16-bit elements. */
    while (n >= 64) {
        _mm_storeu_si128 (m128i_ptr, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 1, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 2, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 3, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 4, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 5, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 6, copy_8x);
        _mm_storeu_si128 (m128i_ptr + 7, copy_8x);
        n -= 64;
        m128i_ptr += 8;
    }

    /* Write the remaining (fewer than 64) elements individually. */
    uint16_t *uint16_ptr = (uint16_t *) m128i_ptr;
    while (n--) {
        *uint16_ptr++ = copy;
    }

    return dest;
}
164+
#endif /* defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64) */
165+
119166
static void *mem_set32_c (void *dest, int c, visual_size_t n)
120167
{
121168
uint64_t *u64_ptr = dest;
@@ -144,6 +191,35 @@ static void *mem_set32_c (void *dest, int c, visual_size_t n)
144191
return dest;
145192
}
146193

194+
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
195+
/**
 * SSE2 variant of the 32-bit memory fill.
 *
 * Writes n copies of the 32-bit value c starting at dest. Groups of 32
 * elements (128 bytes) are written with eight unaligned 128-bit stores;
 * any remainder is written element by element.
 *
 * @param dest Start of the buffer to fill.
 * @param c    32-bit fill value.
 * @param n    Number of 32-bit elements to write.
 *
 * @return dest
 */
static void *mem_set32_x86_sse2 (void *dest, int c, size_t n)
{
    const uint32_t value    = c;
    const __m128i  value_4x = _mm_set1_epi32 ((int) value);

    __m128i *block = (__m128i *) dest;

    /* Bulk part: 8 stores x 4 elements = 32 elements per pass. */
    for (; n >= 32; n -= 32, block += 8) {
        _mm_storeu_si128 (&block[0], value_4x);
        _mm_storeu_si128 (&block[1], value_4x);
        _mm_storeu_si128 (&block[2], value_4x);
        _mm_storeu_si128 (&block[3], value_4x);
        _mm_storeu_si128 (&block[4], value_4x);
        _mm_storeu_si128 (&block[5], value_4x);
        _mm_storeu_si128 (&block[6], value_4x);
        _mm_storeu_si128 (&block[7], value_4x);
    }

    /* Tail: fewer than 32 elements left. */
    uint32_t *tail = (uint32_t *) block;
    for (; n > 0; n--) {
        *tail++ = value;
    }

    return dest;
}
221+
#endif /* defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64) */
222+
147223
static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitch2, int row_bytes, int rows)
148224
{
149225
uint8_t *d = dest;

0 commit comments

Comments
 (0)