Skip to content

Commit 98aeeaa

Browse files
committed
Core (Memory): Rewrite visual_mem_set{16,32}_c() and rely on Clang/GCC auto-vectorization.
1 parent 1712381 commit 98aeeaa

File tree

1 file changed

+43
-114
lines changed

1 file changed

+43
-114
lines changed

libvisual/libvisual/lv_mem.c

Lines changed: 43 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -28,23 +28,11 @@
2828
#include <stdlib.h>
2929
#include <string.h>
3030

31-
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
32-
#include <x86intrin.h>
33-
#endif
34-
3531
/* Portable C implementations */
3632
static void *mem_set16_c (void *dest, int c, visual_size_t n);
3733
static void *mem_set32_c (void *dest, int c, visual_size_t n);
3834
static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitch2, int width, int rows);
3935

40-
/* x86 SIMD optimized versions */
41-
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
42-
static void *mem_set16_simd_x86 (void *dest, int c, visual_size_t n);
43-
44-
static void *mem_set32_simd_x86 (void *dest, int c, visual_size_t n);
45-
#endif /* VISUAL_ARCH_X86 || VISUAL_ARCH_X86_64 */
46-
47-
4836
/* Memory function pointers, bound to their default implementations at load time; visual_mem_initialize() does not override them. */
4937
VisMemCopyFunc visual_mem_copy = memcpy;
5038
VisMemCopyPitchFunc visual_mem_copy_pitch = mem_copy_pitch_c;
@@ -56,15 +44,7 @@ VisMemSet32Func visual_mem_set32 = mem_set32_c;
5644

5745
/**
 * Initializes the memory function table.
 *
 * This is a no-op: the visual_mem_* function pointers are bound to their
 * implementations by their static initializers, so there is nothing to
 * select at runtime.
 */
void visual_mem_initialize (void)
{
    /* Nothing to do */
}
6949

7050
void *visual_mem_malloc (visual_size_t nbytes)
@@ -107,40 +87,63 @@ void visual_mem_free (void *ptr)
10787
free (ptr);
10888
}
10989

110-
/* Memset functions, 2 byte memset */
11190
/**
 * Fills a buffer with a repeated 16-bit value.
 *
 * Elements are stored one at a time through a uint16_t pointer, so dest
 * only needs the alignment of uint16_t. Wider stores through a uint64_t
 * pointer would be misaligned for destinations that are not 8-byte aligned
 * and would access the buffer through an incompatible type. No shifted
 * copies of the value are built, which also avoids left-shifting a promoted
 * (signed) int into its sign bit. The loop is deliberately plain so the
 * compiler is free to vectorize it.
 *
 * @param dest Buffer to fill; must have room for n 16-bit elements.
 * @param c    Fill value; only the low 16 bits are used.
 * @param n    Number of 16-bit elements (not bytes) to write.
 *
 * @return dest
 */
static void *mem_set16_c (void *dest, int c, visual_size_t n)
{
    uint16_t *out = dest;
    const uint16_t value = (uint16_t) (c & 0xffff);

    for (visual_size_t i = 0; i < n; i++) {
        out[i] = value;
    }

    return dest;
}
130118

131-
/* Memset functions, 4 byte memset */
132119
/**
 * Fills a buffer with a repeated 32-bit value.
 *
 * Elements are stored one at a time through a uint32_t pointer, so dest
 * only needs the alignment of uint32_t. Wider stores through a uint64_t
 * pointer would be misaligned for destinations that are not 8-byte aligned
 * and would access the buffer through an incompatible type. The loop is
 * deliberately plain so the compiler is free to vectorize it.
 *
 * @param dest Buffer to fill; must have room for n 32-bit elements.
 * @param c    Fill value, converted to uint32_t.
 * @param n    Number of 32-bit elements (not bytes) to write.
 *
 * @return dest
 */
static void *mem_set32_c (void *dest, int c, visual_size_t n)
{
    uint32_t *out = dest;
    const uint32_t value = (uint32_t) c;

    for (visual_size_t i = 0; i < n; i++) {
        out[i] = value;
    }

    return dest;
}
142146

143-
/* Memcopy with pitch functions */
144147
static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitch2, int row_bytes, int rows)
145148
{
146149
uint8_t *d = dest;
@@ -156,77 +159,3 @@ static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitc
156159

157160
return dest;
158161
}
159-
160-
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
161-
162-
static void *mem_set16_simd_x86 (void *dest, int c, visual_size_t n)
163-
{
164-
// FIXME: Do we need 'dest' to be aligned for this to be performing at optimal speed?
165-
166-
const uint16_t copy = c & 0xffff;
167-
const uint32_t copy_2x = (copy & 0xffff) | ((copy << 16) & 0xffff0000);
168-
const __m64 copy_4x = _mm_set_pi32 (copy_2x, copy_2x);
169-
170-
__m64 *m64_ptr = (__m64 *) dest;
171-
172-
// Copy 32 copies each iteration
173-
while (n >= 32) {
174-
_mm_stream_pi (m64_ptr, copy_4x);
175-
_mm_stream_pi (m64_ptr + 1, copy_4x);
176-
_mm_stream_pi (m64_ptr + 2, copy_4x);
177-
_mm_stream_pi (m64_ptr + 3, copy_4x);
178-
_mm_stream_pi (m64_ptr + 4, copy_4x);
179-
_mm_stream_pi (m64_ptr + 5, copy_4x);
180-
_mm_stream_pi (m64_ptr + 6, copy_4x);
181-
_mm_stream_pi (m64_ptr + 7, copy_4x);
182-
m64_ptr += 8;
183-
n -= 32;
184-
}
185-
186-
uint32_t *uint32_ptr = (uint32_t *) m64_ptr;
187-
188-
while (n >= 2) {
189-
*uint32_ptr++ = copy_2x;
190-
n -= 2;
191-
}
192-
193-
uint16_t *uint16_ptr = (uint16_t *) uint32_ptr;
194-
const uint16_t setflag16 = c & 0xffff;
195-
196-
while (n--)
197-
*uint16_ptr++ = setflag16;
198-
199-
return dest;
200-
}
201-
202-
static void *mem_set32_simd_x86 (void *dest, int c, visual_size_t n)
203-
{
204-
// FIXME: Do we need 'dest' to be aligned for this to be performing at optimal speed?
205-
206-
const uint32_t copy = c;
207-
const __m64 copy_2x = _mm_set_pi32 (copy, copy);
208-
209-
__m64 *m64_ptr = (__m64 *) dest;
210-
211-
// Copy 16 copies each iteration
212-
while (n >= 16) {
213-
_mm_stream_pi (m64_ptr, copy_2x);
214-
_mm_stream_pi (m64_ptr + 1, copy_2x);
215-
_mm_stream_pi (m64_ptr + 2, copy_2x);
216-
_mm_stream_pi (m64_ptr + 3, copy_2x);
217-
_mm_stream_pi (m64_ptr + 4, copy_2x);
218-
_mm_stream_pi (m64_ptr + 5, copy_2x);
219-
_mm_stream_pi (m64_ptr + 6, copy_2x);
220-
_mm_stream_pi (m64_ptr + 7, copy_2x);
221-
m64_ptr += 8;
222-
n -= 16;
223-
}
224-
225-
uint32_t *uint32_ptr = (uint32_t *) m64_ptr;
226-
while (n--)
227-
*uint32_ptr++ = copy;
228-
229-
return dest;
230-
}
231-
232-
#endif /* VISUAL_ARCH_X86 || VISUAL_ARCH_X86_64 */

0 commit comments

Comments
 (0)