#include <clc/clc.h>

-#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[2*offset])); \
-  } \
-\
-  typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    PRIM_TYPE##2 vec = *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[3*offset])); \
-    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset*3+2]); \
-  } \
-\
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&x[4*offset])); \
-  } \
-\
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&x[8*offset])); \
-  } \
-\
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&x[16*offset])); \
-  } \
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
+      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, \
+                                             const ADDR_SPACE PRIM_TYPE *x) { \
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
+                  *)(&x[2 * offset])); \
+  } \
+  \
+  typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 \
+      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, \
+                                             const ADDR_SPACE PRIM_TYPE *x) { \
+    PRIM_TYPE##2 vec = \
+        *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
+               *)(&x[3 * offset])); \
+    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]); \
+  } \
+  \
+  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
+      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, \
+                                             const ADDR_SPACE PRIM_TYPE *x) { \
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
+                  *)(&x[4 * offset])); \
+  } \
+  \
+  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
+      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, \
+                                             const ADDR_SPACE PRIM_TYPE *x) { \
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
+                  *)(&x[8 * offset])); \
+  } \
+  \
+  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
+      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16( \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
+                  *)(&x[16 * offset])); \
+  }
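The less_aligned_* typedefs are the point of this macro: OpenCL guarantees
only element alignment for vloadn, so each vector type is re-declared with
its alignment lowered to sizeof(PRIM_TYPE) before being dereferenced. The
3-wide case deliberately loads a 2-vector plus a trailing scalar rather than
casting to a 3-vector, so it never reads a fourth element that may sit past
the end of the buffer. As a concrete, mechanically expanded illustration,
VLOAD_VECTORIZE(int, __global) produces for the 2-wide case:

typedef int2 less_aligned___globalint2 __attribute__((aligned(sizeof(int))));
_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const __global int *x) {
  /* The under-aligned typedef lets the backend emit a 4-byte-aligned load
     instead of assuming int2's natural 8-byte alignment. */
  return *((const __global less_aligned___globalint2 *)(&x[2 * offset]));
}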
-#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \
+#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global)
-#define VLOAD_TYPES() \
-  VLOAD_ADDR_SPACES(char) \
-  VLOAD_ADDR_SPACES(uchar) \
-  VLOAD_ADDR_SPACES(short) \
-  VLOAD_ADDR_SPACES(ushort) \
-  VLOAD_ADDR_SPACES(int) \
-  VLOAD_ADDR_SPACES(uint) \
-  VLOAD_ADDR_SPACES(long) \
-  VLOAD_ADDR_SPACES(ulong) \
-  VLOAD_ADDR_SPACES(float) \
+#define VLOAD_TYPES() \
+  VLOAD_ADDR_SPACES(char) \
+  VLOAD_ADDR_SPACES(uchar) \
+  VLOAD_ADDR_SPACES(short) \
+  VLOAD_ADDR_SPACES(ushort) \
+  VLOAD_ADDR_SPACES(int) \
+  VLOAD_ADDR_SPACES(uint) \
+  VLOAD_ADDR_SPACES(long) \
+  VLOAD_ADDR_SPACES(ulong) \
+  VLOAD_ADDR_SPACES(float)

VLOAD_TYPES()
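Once VLOAD_TYPES() is expanded, every scalar gentype has vload2/3/4/8/16
overloads in all four address spaces. A minimal usage sketch from kernel
code (the kernel name is illustrative; vstore4 is the standard OpenCL
counterpart, defined elsewhere in libclc):

__kernel void scale(__global const float *in, __global float *out) {
  size_t i = get_global_id(0);
  float4 v = vload4(i, in); /* reads in[4*i .. 4*i+3], float-aligned only */
  vstore4(v * 2.0f, i, out);
}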
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-  VLOAD_ADDR_SPACES(double)
+VLOAD_ADDR_SPACES(double)
#endif
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-  VLOAD_ADDR_SPACES(half)
+VLOAD_ADDR_SPACES(half)
#endif
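The double and half overloads exist only when the implementation defines the
corresponding extension macro, and the pragma enables the type for the rest
of the translation unit. Client code guards the same way; a sketch (load8h
is an illustrative helper name, not part of this file):

#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
half8 load8h(size_t i, const __global half *p) { return vload8(i, p); }
#endif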
/* vload_half are legal even without cl_khr_fp16 */

@@ -71,43 +87,45 @@ float __clc_vload_half_float_helper__global(const __global half *);
float __clc_vload_half_float_helper__local(const __local half *);
float __clc_vload_half_float_helper__private(const __private half *);

-#define VEC_LOAD1(val, AS) val = __clc_vload_half_float_helper##AS(&mem[offset++]);
+#define VEC_LOAD1(val, AS) \
+  val = __clc_vload_half_float_helper##AS(&mem[offset++]);
#else
#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
#endif
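Both definitions of VEC_LOAD1 expand to a statement, not an expression: they
convert one half to float, assign it to val, and post-increment offset. The
first path pastes the address space onto the helper name declared above (the
helper bodies live elsewhere); the fallback uses the clang builtin
__builtin_load_halff. Mechanically, VEC_LOAD1(__tmp.lo, __global) becomes,
depending on which branch of the #if above is taken:

__tmp.lo = __clc_vload_half_float_helper__global(&mem[offset++]); /* helper path */
__tmp.lo = __builtin_load_halff(&mem[offset++]);                  /* builtin path */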
-#define VEC_LOAD2(val, AS) \
-  VEC_LOAD1(val.lo, AS) \
-  VEC_LOAD1(val.hi, AS)
-#define VEC_LOAD3(val, AS) \
-  VEC_LOAD1(val.s0, AS) \
-  VEC_LOAD1(val.s1, AS) \
-  VEC_LOAD1(val.s2, AS)
-#define VEC_LOAD4(val, AS) \
-  VEC_LOAD2(val.lo, AS) \
-  VEC_LOAD2(val.hi, AS)
-#define VEC_LOAD8(val, AS) \
-  VEC_LOAD4(val.lo, AS) \
-  VEC_LOAD4(val.hi, AS)
-#define VEC_LOAD16(val, AS) \
-  VEC_LOAD8(val.lo, AS) \
-  VEC_LOAD8(val.hi, AS)
+#define VEC_LOAD2(val, AS) \
+  VEC_LOAD1(val.lo, AS) \
+  VEC_LOAD1(val.hi, AS)
+#define VEC_LOAD3(val, AS) \
+  VEC_LOAD1(val.s0, AS) \
+  VEC_LOAD1(val.s1, AS) \
+  VEC_LOAD1(val.s2, AS)
+#define VEC_LOAD4(val, AS) \
+  VEC_LOAD2(val.lo, AS) \
+  VEC_LOAD2(val.hi, AS)
+#define VEC_LOAD8(val, AS) \
+  VEC_LOAD4(val.lo, AS) \
+  VEC_LOAD4(val.hi, AS)
+#define VEC_LOAD16(val, AS) \
+  VEC_LOAD8(val.lo, AS) \
+  VEC_LOAD8(val.hi, AS)
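The wider VEC_LOADn macros recurse by splitting the destination into its
.lo/.hi halves (.s0/.s1/.s2 for the 3-wide case) until they bottom out in
VEC_LOAD1; because each step post-increments offset, the elements land in
order. For example, VEC_LOAD4(__tmp, AS) on the builtin path flattens to:

__tmp.lo.lo = __builtin_load_halff(&mem[offset++]);
__tmp.lo.hi = __builtin_load_halff(&mem[offset++]);
__tmp.hi.lo = __builtin_load_halff(&mem[offset++]);
__tmp.hi.hi = __builtin_load_halff(&mem[offset++]);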
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
-  _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, const AS half *mem) { \
-    offset *= VEC_SIZE; \
-    TYPE __tmp; \
-    VEC_LOAD##VEC_SIZE( __tmp, AS) \
-    return __tmp; \
-  } \
-  _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset, const AS half *mem) { \
-    offset *= OFFSET_SIZE; \
-    TYPE __tmp; \
-    VEC_LOAD##VEC_SIZE( __tmp, AS) \
-    return __tmp; \
+#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
+  _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, \
+                                                 const AS half *mem) { \
+    offset *= VEC_SIZE; \
+    TYPE __tmp; \
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \
+  } \
+  _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset, \
+                                                  const AS half *mem) { \
+    offset *= OFFSET_SIZE; \
+    TYPE __tmp; \
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \
  }
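vload_half##SUFFIX and vloada_half##SUFFIX differ only in the offset
multiplier: VEC_SIZE is the number of elements actually read, while
OFFSET_SIZE is the array stride. Per the OpenCL spec these coincide for all
widths except 3, where vloada_half3 indexes aligned 4-element slots. The
instantiations live in vload_half.inc, which is not shown in this diff; the
3-wide case would presumably look something like:

FUNC(3, 3, 4, float3, __global) /* hypothetical instantiation */
/* vload_half3(i, p)  reads p[3*i .. 3*i+2]  */
/* vloada_half3(i, p) reads p[4*i .. 4*i+2]  */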
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
+#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
+  __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
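FUNC is the usual extra level of indirection: __FUNC pastes SUFFIX with ##,
and arguments used with ## are not macro-expanded, so calling __FUNC directly
with a macro argument (as happens when instantiated through gentype.inc)
would paste the unexpanded token. Routing through FUNC forces one round of
argument expansion first:

#define WIDTH 2
/* __FUNC(WIDTH, ...) would declare vload_halfWIDTH */
/* FUNC(WIDTH, ...)   declares     vload_half2      */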
#define __CLC_BODY "vload_half.inc"
#include <clc/math/gentype.inc>
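These last two lines drive the instantiation: gentype.inc re-includes the
file named by __CLC_BODY once per generic type, with macros such as
__CLC_GENTYPE defined for each pass, so vload_half.inc only has to write the
FUNC calls once. A simplified sketch of the mechanism (the real .inc files
are not part of this diff):

#define __CLC_GENTYPE float4
#include __CLC_BODY /* expands the FUNC(...) calls for float4 */
#undef __CLC_GENTYPE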