Skip to content

Commit 1712381

Browse files
committed
Core (Memory): Re-implement visual_mem_set{16,32} using SSE SIMD intrinsics (#15).
1 parent 3221337 commit 1712381

File tree

3 files changed

+124
-194
lines changed

3 files changed

+124
-194
lines changed

libvisual/libvisual/lv_mem.c

Lines changed: 52 additions & 194 deletions
Original file line numberDiff line numberDiff line change
@@ -28,18 +28,20 @@
2828
#include <stdlib.h>
2929
#include <string.h>
3030

31+
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
32+
#include <x86intrin.h>
33+
#endif
34+
3135
/* Standard C fallbacks */
3236
static void *mem_set16_c (void *dest, int c, visual_size_t n);
3337
static void *mem_set32_c (void *dest, int c, visual_size_t n);
3438
static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitch2, int width, int rows);
3539

3640
/* x86 SIMD optimized versions */
3741
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
38-
static void *mem_set16_mmx (void *dest, int c, visual_size_t n);
39-
static void *mem_set16_mmx2 (void *dest, int c, visual_size_t n);
42+
static void *mem_set16_simd_x86 (void *dest, int c, visual_size_t n);
4043

41-
static void *mem_set32_mmx (void *dest, int c, visual_size_t n);
42-
static void *mem_set32_mmx2 (void *dest, int c, visual_size_t n);
44+
static void *mem_set32_simd_x86 (void *dest, int c, visual_size_t n);
4345
#endif /* VISUAL_ARCH_X86 || VISUAL_ARCH_X86_64 */
4446

4547

@@ -58,20 +60,10 @@ void visual_mem_initialize ()
5860
* every time */
5961

6062
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
61-
62-
if (visual_cpu_has_mmx ()) {
63-
visual_mem_set16 = mem_set16_mmx;
64-
visual_mem_set32 = mem_set32_mmx;
65-
}
66-
67-
/* The k6-II and k6-III don't have mmx2, but of course can use the prefetch
68-
* facility that 3dnow provides. */
69-
70-
if (visual_cpu_has_mmx2 ()) {
71-
visual_mem_set16 = mem_set16_mmx2;
72-
visual_mem_set32 = mem_set32_mmx2;
63+
if (visual_cpu_has_sse ()) {
64+
visual_mem_set16 = mem_set16_simd_x86;
65+
visual_mem_set32 = mem_set32_simd_x86;
7366
}
74-
7567
#endif /* VISUAL_ARCH_X86 || VISUAL_ARCH_X86_64 */
7668
}
7769

@@ -120,9 +112,7 @@ static void *mem_set16_c (void *dest, int c, visual_size_t n)
120112
{
121113
uint32_t *d = dest;
122114
uint16_t *dc = dest;
123-
uint32_t setflag32 =
124-
(c & 0xffff) |
125-
((c << 16) & 0xffff0000);
115+
uint32_t setflag32 = (c & 0xffff) | ((c << 16) & 0xffff0000);
126116
uint16_t setflag16 = c & 0xffff;
127117

128118
while (n >= 2) {
@@ -169,204 +159,72 @@ static void *mem_copy_pitch_c (void *dest, const void *src, int pitch1, int pitc
169159

170160
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
171161

172-
static void *mem_set16_mmx (void *dest, int c, visual_size_t n)
162+
static void *mem_set16_simd_x86 (void *dest, int c, visual_size_t n)
{
	/* Fills n 16-bit words at 'dest' with the low 16 bits of 'c' and
	 * returns 'dest' (memset-style contract).  The bulk is written with
	 * MMX non-temporal stores (MOVNTQ via _mm_stream_pi), the remainder
	 * with plain scalar stores.
	 *
	 * FIXME: Do we need 'dest' to be aligned for this to be performing
	 * at optimal speed?  MOVNTQ has no hard alignment requirement, but
	 * aligned stores are presumably faster -- TODO confirm with a
	 * benchmark. */

	const uint16_t copy    = c & 0xffff;
	const uint32_t copy_2x = (copy & 0xffff) | ((copy << 16) & 0xffff0000);
	const __m64    copy_4x = _mm_set_pi32 (copy_2x, copy_2x);

	__m64 *m64_ptr = (__m64 *) dest;

	/* Write 32 uint16_t copies (8 x 64-bit streaming stores) per iteration. */
	while (n >= 32) {
		_mm_stream_pi (m64_ptr    , copy_4x);
		_mm_stream_pi (m64_ptr + 1, copy_4x);
		_mm_stream_pi (m64_ptr + 2, copy_4x);
		_mm_stream_pi (m64_ptr + 3, copy_4x);
		_mm_stream_pi (m64_ptr + 4, copy_4x);
		_mm_stream_pi (m64_ptr + 5, copy_4x);
		_mm_stream_pi (m64_ptr + 6, copy_4x);
		_mm_stream_pi (m64_ptr + 7, copy_4x);
		m64_ptr += 8;

		n -= 32;
	}

	/* _mm_stream_pi uses the MMX register file, which aliases the x87
	 * FPU stack: clear the FPU state so later floating-point code does
	 * not fault (the removed asm implementation did this with 'emms').
	 * The sfence makes the weakly-ordered streaming stores globally
	 * visible before any subsequent access to the buffer. */
	_mm_empty ();
	_mm_sfence ();

	/* Remainder (< 32 elements): pairs first, then a possible last word. */
	uint32_t *uint32_ptr = (uint32_t *) m64_ptr;

	while (n >= 2) {
		*uint32_ptr++ = copy_2x;
		n -= 2;
	}

	uint16_t *uint16_ptr = (uint16_t *) uint32_ptr;

	while (n--)
		*uint16_ptr++ = copy;

	return dest;
}
328201

329-
static void *mem_set32_mmx2 (void *dest, int c, visual_size_t n)
202+
static void *mem_set32_simd_x86 (void *dest, int c, visual_size_t n)
{
	/* Fills n 32-bit words at 'dest' with 'c' and returns 'dest'
	 * (memset-style contract).  The bulk is written with MMX
	 * non-temporal stores (MOVNTQ via _mm_stream_pi), the remainder
	 * with plain scalar stores.
	 *
	 * FIXME: Do we need 'dest' to be aligned for this to be performing
	 * at optimal speed?  MOVNTQ has no hard alignment requirement, but
	 * aligned stores are presumably faster -- TODO confirm with a
	 * benchmark. */

	const uint32_t copy    = c;
	const __m64    copy_2x = _mm_set_pi32 (copy, copy);

	__m64 *m64_ptr = (__m64 *) dest;

	/* Write 16 uint32_t copies (8 x 64-bit streaming stores) per iteration. */
	while (n >= 16) {
		_mm_stream_pi (m64_ptr    , copy_2x);
		_mm_stream_pi (m64_ptr + 1, copy_2x);
		_mm_stream_pi (m64_ptr + 2, copy_2x);
		_mm_stream_pi (m64_ptr + 3, copy_2x);
		_mm_stream_pi (m64_ptr + 4, copy_2x);
		_mm_stream_pi (m64_ptr + 5, copy_2x);
		_mm_stream_pi (m64_ptr + 6, copy_2x);
		_mm_stream_pi (m64_ptr + 7, copy_2x);
		m64_ptr += 8;

		n -= 16;
	}

	/* _mm_stream_pi uses the MMX register file, which aliases the x87
	 * FPU stack: clear the FPU state so later floating-point code does
	 * not fault (the removed asm implementation did this with 'emms').
	 * The sfence makes the weakly-ordered streaming stores globally
	 * visible before any subsequent access to the buffer. */
	_mm_empty ();
	_mm_sfence ();

	/* Remainder (< 16 elements) with scalar stores. */
	uint32_t *uint32_ptr = (uint32_t *) m64_ptr;

	while (n--)
		*uint32_ptr++ = copy;

	return dest;
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
LV_BUILD_TEST(mem_set_test
2+
SOURCES mem_set_test.cpp
3+
)
4+
15
LV_BUILD_TEST(aligned_allocation_test
26
SOURCES aligned_allocation_test.cpp
37
)
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#include "test.h"
2+
#include <cstdint>
3+
#include <libvisual/libvisual.h>
4+
#include <libvisual/lv_mem.h>
5+
#include <cstdint>
6+
#include <array>
7+
#include <algorithm>
8+
#include <vector>
9+
10+
namespace {
11+
12+
void test_mem_set16 ()
13+
{
14+
constexpr std::uint16_t test_pattern = 0x1234;
15+
16+
constexpr auto test {[test_pattern] (auto x) {
17+
return x == test_pattern;
18+
}};
19+
20+
for (auto test_size = 1; test_size <= 128; test_size++) {
21+
// Allocate extra elements in front and behind of test memory area.
22+
std::vector<std::uint16_t> vec (test_size + 2);
23+
24+
visual_mem_set16 (&vec[1], test_pattern, test_size);
25+
26+
// Test for buffer overruns
27+
LV_TEST_ASSERT (vec.front () == 0);
28+
LV_TEST_ASSERT (vec.back () == 0);
29+
30+
// Test data area
31+
LV_TEST_ASSERT (std::all_of (++begin (vec), --end (vec), test));
32+
}
33+
}
34+
35+
void test_mem_set32 ()
36+
{
37+
constexpr std::uint32_t test_pattern = 0x12345678;
38+
39+
constexpr auto test {[test_pattern] (auto x) {
40+
return x == test_pattern;
41+
}};
42+
43+
for (auto test_size = 1; test_size <= 128; test_size++) {
44+
// Allocate extra elements in front and behind of test memory area.
45+
std::vector<std::uint32_t> vec (test_size + 2);
46+
47+
visual_mem_set32 (&vec[1], test_pattern, test_size);
48+
49+
// Test for buffer overruns
50+
LV_TEST_ASSERT (vec.front () == 0);
51+
LV_TEST_ASSERT (vec.back () == 0);
52+
53+
// Test data area
54+
LV_TEST_ASSERT (std::all_of (++begin (vec), --end (vec), test));
55+
}
56+
}
57+
58+
} // anonymous namespace
59+
60+
int main (int argc, char* argv[])
61+
{
62+
LV::System::init (argc, argv);
63+
64+
test_mem_set16 ();
65+
test_mem_set32 ();
66+
67+
LV::System::destroy ();
68+
}

0 commit comments

Comments
 (0)