Skip to content

Commit bb74007

Browse files
committed
Add explicit attribute target
1 parent bb8a978 commit bb74007

File tree

5 files changed

+72
-41
lines changed

5 files changed

+72
-41
lines changed

SPOUTSDK/SpoutDirectX/SpoutDX/Tutorial04_Lib/include/SpoutCopy.h

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,27 @@
3737
#include <windows.h>
3838
#include <stdio.h> // for debug printf
3939
#include <GL/gl.h> // For OpenGL definitions
40-
#include <intrin.h> // for cpuid to test for SSE2
41-
#ifdef _M_ARM64
42-
#include <sse2neon.h> // for NEON
43-
#else
44-
#include <emmintrin.h> // for SSE2
45-
#include <tmmintrin.h> // for SSSE3
40+
41+
#if defined(_M_IX86) || defined(_M_X64)
42+
#include <intrin.h> // for cpuid to test for SSE2
43+
#include <emmintrin.h> // for SSE2
44+
#include <tmmintrin.h> // for SSSE3
45+
#if defined(__GNUC__) || (defined(__clang__) && defined(_MSC_VER))
46+
#define SPOUT_TARGET_SSE2 __attribute__((__target__("sse2")))
47+
#define SPOUT_TARGET_SSSE3 __attribute__((__target__("ssse3")))
48+
#endif
49+
#elif defined(_M_ARM64)
50+
#include <sse2neon.h> // for NEON
51+
#endif
52+
53+
#ifndef SPOUT_TARGET_SSE2
54+
#define SPOUT_TARGET_SSE2
4655
#endif
56+
57+
#ifndef SPOUT_TARGET_SSSE3
58+
#define SPOUT_TARGET_SSSE3
59+
#endif
60+
4761
#include <cmath> // For compatibility with Clang. PR#81
4862
#include <stdint.h> // for uint32_t etc
4963

@@ -141,11 +155,11 @@ class SPOUT_DLLEXP spoutCopy {
141155
unsigned int destWidth, unsigned int destHeight, bool bInvert = false) const;
142156

143157
//
144-
// SSE3 function
158+
// SSSE3 function
145159
//
146160
// RGBA to RGB/BGR with source line pitch
147161
//
148-
void rgba_to_rgb_sse3(const void* rgba_source, void* rgb_dest,
162+
void rgba_to_rgb_ssse3(const void* rgba_source, void* rgb_dest,
149163
unsigned int width, unsigned int height,
150164
unsigned int rgba_pitch, // line byte pitch
151165
bool bInvert = false, // Flip image
@@ -188,7 +202,7 @@ class SPOUT_DLLEXP spoutCopy {
188202
// Single line
189203
void rgb_to_bgrx_sse(unsigned int npixels, const void* rgb_source, void* bgrx_out) const;
190204
// Full height
191-
void rgb_to_bgra_sse3(void* rgb_source, void* rgba_dest, unsigned int width, unsigned int height) const;
205+
void rgb_to_bgra_ssse3(void* rgb_source, void* rgba_dest, unsigned int width, unsigned int height) const;
192206

193207

194208
// Copy BGR to BGRA
@@ -212,7 +226,7 @@ class SPOUT_DLLEXP spoutCopy {
212226

213227
void rgba_bgra(const void *rgba_source, void *bgra_dest, unsigned int width, unsigned int height, bool bInvert = false) const;
214228
void rgba_bgra_sse2(const void *rgba_source, void *bgra_dest, unsigned int width, unsigned int height, bool bInvert = false) const;
215-
void rgba_bgra_sse3(const void *rgba_source, void *bgra_dest, unsigned int width, unsigned int height, bool bInvert = false) const;
229+
void rgba_bgra_ssse3(const void *rgba_source, void *bgra_dest, unsigned int width, unsigned int height, bool bInvert = false) const;
216230

217231
};
218232

SPOUTSDK/SpoutDirectX/SpoutDX/Tutorial04_Lib/include/SpoutSenderNames.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@
4646
#include <string>
4747
#include <vector>
4848
#include <unordered_map>
49+
#if defined(_M_IX86) || defined(_M_X64)
4950
#include <intrin.h> // for __movsd
51+
#endif
5052
#include <stdint.h> // for uint32_t
5153
#include <assert.h>
5254
#ifdef _M_ARM64

SPOUTSDK/SpoutGL/SpoutCopy.cpp

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ void spoutCopy::ClearAlpha(unsigned char* src, unsigned int width, unsigned int
285285
//---------------------------------------------------------
286286
// Function: memcpy_sse2
287287
// SSE2 version of memcpy
288-
void spoutCopy::memcpy_sse2(void* dst, const void* src, size_t Size) const
288+
SPOUT_TARGET_SSE2 void spoutCopy::memcpy_sse2(void* dst, const void* src, size_t Size) const
289289
{
290290

291291
if (!dst || !src)
@@ -479,8 +479,8 @@ void spoutCopy::rgba2bgra(const void* rgba_source, void* bgra_dest,
479479
return;
480480

481481
if ((width % 16) == 0) { // 16 byte aligned width
482-
if (m_bSSE2 && m_bSSSE3) // SSE3 available
483-
rgba_bgra_sse3(rgba_source, bgra_dest, width, height, bInvert);
482+
if (m_bSSE2 && m_bSSSE3) // SSSE3 available
483+
rgba_bgra_ssse3(rgba_source, bgra_dest, width, height, bInvert);
484484
else if (m_bSSE2) // SSE2 available
485485
rgba_bgra_sse2(rgba_source, bgra_dest, width, height, bInvert);
486486
}
@@ -525,8 +525,8 @@ void spoutCopy::rgba2bgra(const void *rgba_source, void *bgra_dest,
525525

526526
// Copy the line
527527
if ((width % 16) == 0) { // 16 byte aligned width
528-
if (m_bSSE2 && m_bSSSE3) // SSE3 available
529-
rgba_bgra_sse3(source, dest, width, 1); // invert flag false
528+
if (m_bSSE2 && m_bSSSE3) // SSSE3 available
529+
rgba_bgra_ssse3(source, dest, width, 1); // invert flag false
530530
else if (m_bSSE2) // SSE2 available
531531
rgba_bgra_sse2(source, dest, width, 1);
532532
}
@@ -570,8 +570,8 @@ void spoutCopy::rgba2bgra(const void* rgba_source, void* bgra_dest,
570570
}
571571
// Copy the line
572572
if ((width % 16) == 0) { // 16 byte aligned width
573-
if (m_bSSE2 && m_bSSSE3) // SSE3 available
574-
rgba_bgra_sse3(source, dest, width, 1); // invert flag false
573+
if (m_bSSE2 && m_bSSSE3) // SSSE3 available
574+
rgba_bgra_ssse3(source, dest, width, 1); // invert flag false
575575
else if (m_bSSE2) // SSE2 available
576576
rgba_bgra_sse2(source, dest, width, 1);
577577
}
@@ -610,8 +610,8 @@ void spoutCopy::rgba2rgb(const void* rgba_source, void* rgb_dest,
610610
return;
611611

612612
//
613-
// SSE3 copy
614-
// No mirror option, image size 16 bit byte aligned, SSE3 intrinsics support
613+
// SSSE3 copy
614+
// No mirror option, image width a multiple of 16 pixels, SSSE3 intrinsics support
615615
//
616616
// Timing tests show more than twice as fast
617617
// (Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz)
@@ -627,8 +627,8 @@ void spoutCopy::rgba2rgb(const void* rgba_source, void* rgb_dest,
627627
//
628628
unsigned int pitch = rgba_pitch;
629629
if(pitch == 0) pitch = width*4;
630-
if (!bMirror && width >= 320 && (width % 16) == 0 && m_bSSE3) {
631-
rgba_to_rgb_sse3(rgba_source, rgb_dest, width, height, pitch, bInvert, bSwapRB);
630+
if (!bMirror && width >= 320 && (width % 16) == 0 && m_bSSSE3) {
631+
rgba_to_rgb_ssse3(rgba_source, rgb_dest, width, height, pitch, bInvert, bSwapRB);
632632
return;
633633
}
634634

@@ -929,7 +929,7 @@ void spoutCopy::rgb2bgra(const void *rgb_source, void *bgra_dest,
929929
// Function: rgb_to_bgrx_sse
930930
// Experimental pending testing
931931
// Single line function
932-
void spoutCopy::rgb_to_bgrx_sse(unsigned int npixels, const void* rgb_source, void* bgrx_dest) const
932+
SPOUT_TARGET_SSSE3 void spoutCopy::rgb_to_bgrx_sse(unsigned int npixels, const void* rgb_source, void* bgrx_dest) const
933933
{
934934
const __m128i* in_vec = static_cast<const __m128i*>(rgb_source);
935935
__m128i* out_vec = static_cast<__m128i*>(bgrx_dest);
@@ -1003,7 +1003,7 @@ void spoutCopy::rgb_to_bgrx_sse(unsigned int npixels, const void* rgb_source, vo
10031003
// Function: rgb_to_bgra_ssse3
10041004
// Experimental pending testing
10051005
// Full image height
1006-
void spoutCopy::rgb_to_bgra_sse3 (
1006+
void spoutCopy::rgb_to_bgra_ssse3 (
10071007
void* rgb_source,
10081008
void* rgba_dest,
10091009
unsigned int width,
@@ -1023,16 +1023,16 @@ void spoutCopy::rgb_to_bgra_sse3 (
10231023
rgba += width*4;
10241024
}
10251025

1026-
} // end rgb_to_bgra_sse3
1026+
} // end rgb_to_bgra_ssse3
10271027

10281028
//
10291029
// =====================================================================================
10301030

10311031

10321032
//---------------------------------------------------------
1033-
// Function: rgba_to_rgb_sse3
1033+
// Function: rgba_to_rgb_ssse3
10341034
//
1035-
void spoutCopy::rgba_to_rgb_sse3(const void* rgba_source, void* rgb_dest,
1035+
SPOUT_TARGET_SSSE3 void spoutCopy::rgba_to_rgb_ssse3(const void* rgba_source, void* rgb_dest,
10361036
unsigned int width, unsigned int height, unsigned int rgba_pitch,
10371037
bool bInvert, bool bSwapRB) const
10381038
{
@@ -1640,7 +1640,7 @@ void spoutCopy::rgba_bgra(const void* rgba_source, void* bgra_dest,
16401640
//
16411641
// All instructions SSE2.
16421642
//
1643-
void spoutCopy::rgba_bgra_sse2(const void* rgba_source, void* bgra_dest, unsigned int width, unsigned int height, bool bInvert) const
1643+
SPOUT_TARGET_SSE2 void spoutCopy::rgba_bgra_sse2(const void* rgba_source, void* bgra_dest, unsigned int width, unsigned int height, bool bInvert) const
16441644
{
16451645

16461646
if (!rgba_source)
@@ -1702,13 +1702,13 @@ void spoutCopy::rgba_bgra_sse2(const void* rgba_source, void* bgra_dest, unsigne
17021702
} // end rgba_bgra_sse2
17031703

17041704
//
1705-
// Adapted from a Gist snippet by Aurélien Vallée (NewbiZ) http://newbiz.github.io/
1705+
// Adapted from a Gist snippet by Aurélien Vallée (NewbiZ) http://newbiz.github.io/
17061706
//
17071707
// https://gist.github.com/NewbiZ/5541524
17081708
//
17091709
// Approximately 15% faster than SSE2 function
17101710
//
1711-
void spoutCopy::rgba_bgra_sse3(const void* rgba_source, void* bgra_dest, unsigned int width, unsigned int height, bool bInvert) const
1711+
SPOUT_TARGET_SSSE3 void spoutCopy::rgba_bgra_ssse3(const void* rgba_source, void* bgra_dest, unsigned int width, unsigned int height, bool bInvert) const
17121712
{
17131713
// Shuffling mask (RGBA -> BGRA) x 4, in reverse byte order
17141714
static const __m128i m = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2);
@@ -1757,11 +1757,11 @@ void spoutCopy::rgba_bgra_sse3(const void* rgba_source, void* bgra_dest, unsigne
17571757
}
17581758
}
17591759

1760-
} // end rgba_bgra_sse3
1760+
} // end rgba_bgra_ssse3
17611761

17621762

17631763
// Swap red and blue components in place
1764-
void spoutCopy::rgba_swap_ssse3(void* __restrict rgba_source, unsigned int width, unsigned int height)
1764+
SPOUT_TARGET_SSSE3 void spoutCopy::rgba_swap_ssse3(void* __restrict rgba_source, unsigned int width, unsigned int height)
17651765
{
17661766
// Shuffling mask (RGBA -> BGRA) x 4, in reverse byte order (requires SSSE3)
17671767
static const __m128i mask = _mm_set_epi8(

SPOUTSDK/SpoutGL/SpoutCopy.h

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,27 @@
3737
#include <windows.h>
3838
#include <stdio.h> // for debug printf
3939
#include <GL/gl.h> // For OpenGL definitions
40-
#include <intrin.h> // for cpuid to test for SSE2
4140

42-
#ifdef _M_ARM64
43-
#include <sse2neon.h> // for NEON
44-
#else
45-
#include <emmintrin.h> // for SSE2
46-
#include <tmmintrin.h> // for SSSE3
41+
#if defined(_M_IX86) || defined(_M_X64)
42+
#include <intrin.h> // for cpuid to test for SSE2
43+
#include <emmintrin.h> // for SSE2
44+
#include <tmmintrin.h> // for SSSE3
45+
#if defined(__GNUC__) || (defined(__clang__) && defined(_MSC_VER))
46+
#define SPOUT_TARGET_SSE2 __attribute__((__target__("sse2")))
47+
#define SPOUT_TARGET_SSSE3 __attribute__((__target__("ssse3")))
48+
#endif
49+
#elif defined(_M_ARM64)
50+
#include <sse2neon.h> // for NEON
4751
#endif
52+
53+
#ifndef SPOUT_TARGET_SSE2
54+
#define SPOUT_TARGET_SSE2
55+
#endif
56+
57+
#ifndef SPOUT_TARGET_SSSE3
58+
#define SPOUT_TARGET_SSSE3
59+
#endif
60+
4861
#include <cmath> // For compatibility with Clang. PR#81
4962
#include <stdint.h> // for uint32_t etc
5063

@@ -147,11 +160,11 @@ class SPOUT_DLLEXP spoutCopy {
147160
unsigned int destWidth, unsigned int destHeight, bool bInvert = false) const;
148161

149162
//
150-
// SSE3 function
163+
// SSSE3 function
151164
//
152165
// RGBA to RGB/BGR with source line pitch
153166
//
154-
void rgba_to_rgb_sse3(const void* rgba_source, void* rgb_dest,
167+
void rgba_to_rgb_ssse3(const void* rgba_source, void* rgb_dest,
155168
unsigned int width, unsigned int height,
156169
unsigned int rgba_pitch, // line byte pitch
157170
bool bInvert = false, // Flip image
@@ -194,7 +207,7 @@ class SPOUT_DLLEXP spoutCopy {
194207
// Single line
195208
void rgb_to_bgrx_sse(unsigned int npixels, const void* rgb_source, void* bgrx_out) const;
196209
// Full height
197-
void rgb_to_bgra_sse3(void* rgb_source, void* rgba_dest, unsigned int width, unsigned int height) const;
210+
void rgb_to_bgra_ssse3(void* rgb_source, void* rgba_dest, unsigned int width, unsigned int height) const;
198211

199212

200213
// Copy BGR to BGRA
@@ -227,7 +240,7 @@ class SPOUT_DLLEXP spoutCopy {
227240

228241
void rgba_bgra(const void *rgba_source, void *bgra_dest, unsigned int width, unsigned int height, bool bInvert = false) const;
229242
void rgba_bgra_sse2(const void *rgba_source, void *bgra_dest, unsigned int width, unsigned int height, bool bInvert = false) const;
230-
void rgba_bgra_sse3(const void *rgba_source, void *bgra_dest, unsigned int width, unsigned int height, bool bInvert = false) const;
243+
void rgba_bgra_ssse3(const void *rgba_source, void *bgra_dest, unsigned int width, unsigned int height, bool bInvert = false) const;
231244
// LJ DEBUG
232245
// void rgba_swap_ssse3(void* __restrict rgbasource, unsigned int width, unsigned int height);
233246

SPOUTSDK/SpoutGL/SpoutSenderNames.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@
4646
#include <string>
4747
#include <vector>
4848
#include <unordered_map>
49+
#if defined(_M_IX86) || defined(_M_X64)
4950
#include <intrin.h> // for __movsd
51+
#endif
5052
#include <stdint.h> // for uint32_t
5153
#include <assert.h>
5254
#ifdef _M_ARM64

0 commit comments

Comments
 (0)