Skip to content

Commit ad8e6f8

Browse files
authored
Creating oapv_mset_x128 avx function to replace memset and improve speed (#112)
* Adding oapv_mset_x128 avx function replacing memset Signed-off-by: subhrajitm20 <2003subhrajit@gmail.com> * removal of original definition of oapv_mset_x128 Signed-off-by: subhrajitm20 <2003subhrajit@gmail.com> * addition of function signature under X86_SSE Signed-off-by: subhrajitm20 <2003subhrajit@gmail.com> --------- Signed-off-by: subhrajitm20 <2003subhrajit@gmail.com>
1 parent 867ed70 commit ad8e6f8

File tree

2 files changed

+71
-35
lines changed

2 files changed

+71
-35
lines changed

src/oapv_port.c

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,3 +121,32 @@ int oapv_get_num_cpu_cores(void)
121121
return num_cores;
122122
}
123123

124+
#if X86_SSE
125+
/*
 * Fill `size` bytes starting at `dst` with the low 8 bits of `value` using
 * 128-bit vector stores, and return `dst` (same contract as memset()).
 *
 * Note: despite the "_avx" suffix, only SSE2 intrinsics are used.
 * Unaligned stores are issued, so `dst` does not need to be 16-byte aligned.
 */
void *oapv_memset_x128_avx(void *dst, int value, size_t size)
{
    uint8_t      *ptr = (uint8_t *)dst;
    const __m128i value_vec = _mm_set1_epi8((char)value); // 16 copies of the fill byte
    size_t        i = 0;

    // 128 bytes (eight 16-byte stores) per iteration.
    // `size - i` cannot underflow because i never exceeds size.
    for(; size - i >= 128; i += 128) {
        uint8_t *blk = ptr + i; // advance with i; each iteration fills a new block
        _mm_storeu_si128((__m128i *)(blk + 0), value_vec);
        _mm_storeu_si128((__m128i *)(blk + 16), value_vec);
        _mm_storeu_si128((__m128i *)(blk + 32), value_vec);
        _mm_storeu_si128((__m128i *)(blk + 48), value_vec);
        _mm_storeu_si128((__m128i *)(blk + 64), value_vec);
        _mm_storeu_si128((__m128i *)(blk + 80), value_vec);
        _mm_storeu_si128((__m128i *)(blk + 96), value_vec);
        _mm_storeu_si128((__m128i *)(blk + 112), value_vec);
    }

    // Remaining full 16-byte blocks
    for(; size - i >= 16; i += 16) {
        _mm_storeu_si128((__m128i *)(ptr + i), value_vec);
    }

    // Remaining tail (fewer than 16 bytes)
    for(; i < size; ++i) {
        ptr[i] = (uint8_t)value;
    }
    return dst;
}
152+
#endif

src/oapv_port.h

Lines changed: 42 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -67,41 +67,6 @@ typedef s32 dpel;
6767
#endif
6868
#endif
6969

70-
/*****************************************************************************
71-
* memory operations
72-
*****************************************************************************/
73-
#define oapv_malloc(size) malloc((size))
74-
#define oapv_malloc_fast(size) oapv_malloc((size))
75-
76-
#define oapv_mfree(m) \
77-
{ \
78-
if(m) { \
79-
free(m); \
80-
} \
81-
}
82-
#define oapv_mfree_fast(m) \
83-
{ \
84-
if(m) { \
85-
oapv_mfree(m); \
86-
} \
87-
}
88-
89-
void *oapv_malloc_align32(int size);
90-
void oapv_mfree_align32(void *p);
91-
92-
#define oapv_mcpy(dst, src, size) memcpy((dst), (src), (size))
93-
#define oapv_mset(dst, v, size) memset((dst), (v), (size))
94-
#define oapv_mset_x64a(dst, v, size) memset((dst), (v), (size))
95-
#define oapv_mset_x128(dst, v, size) memset((dst), (v), (size))
96-
#define oapv_mcmp(dst, src, size) memcmp((dst), (src), (size))
97-
98-
static __inline void oapv_mset_16b(s16 *dst, s16 v, int cnt)
99-
{
100-
int i;
101-
for(i = 0; i < cnt; i++)
102-
dst[i] = v;
103-
}
104-
10570
/*****************************************************************************
10671
* trace and assert
10772
*****************************************************************************/
@@ -188,6 +153,48 @@ void oapv_trace_line(char *pre);
188153
#include <arm_neon.h>
189154
#endif
190155

156+
/*****************************************************************************
157+
* memory operations
158+
*****************************************************************************/
159+
#define oapv_malloc(size) malloc((size))
160+
#define oapv_malloc_fast(size) oapv_malloc((size))
161+
162+
#define oapv_mfree(m) \
163+
{ \
164+
if(m) { \
165+
free(m); \
166+
} \
167+
}
168+
#define oapv_mfree_fast(m) \
169+
{ \
170+
if(m) { \
171+
oapv_mfree(m); \
172+
} \
173+
}
174+
175+
void *oapv_malloc_align32(int size);
176+
void oapv_mfree_align32(void *p);
177+
#if X86_SSE
178+
/* Vectorized fill routine that oapv_mset_x128 expands to when X86_SSE is set.
 * memset-style signature: destination, fill value, byte count; returns dst.
 * Defined in oapv_port.c. */
void *oapv_memset_x128_avx(void* dst, int value, size_t size);
179+
#endif
180+
181+
#define oapv_mcpy(dst, src, size) memcpy((dst), (src), (size))
182+
#define oapv_mset(dst, v, size) memset((dst), (v), (size))
183+
#define oapv_mset_x64a(dst, v, size) memset((dst), (v), (size))
184+
#if X86_SSE
185+
#define oapv_mset_x128(dst, v, size) oapv_memset_x128_avx((dst), (v), (size))
186+
#else
187+
#define oapv_mset_x128(dst, v, size) memset((dst), (v), (size))
188+
#endif
189+
#define oapv_mcmp(dst, src, size) memcmp((dst), (src), (size))
190+
191+
/* Write the 16-bit value v into the first cnt elements of dst.
 * Nothing is written when cnt is zero or negative. */
static __inline void oapv_mset_16b(s16 *dst, s16 v, int cnt)
{
    s16 *cur = dst;
    int  remaining = cnt;

    while(remaining > 0) {
        *cur++ = v;
        remaining--;
    }
}
197+
191198
/* Buffer Alignment */
192199
#if defined(_WIN32) && !defined(__GNUC__)
193200
#define DECLARE_ALIGNED(var, n) __declspec(align(n)) var

0 commit comments

Comments
 (0)