Skip to content

Commit 309edb4

Browse files
committed
[device-libs] Replace __memcpy_internal_aligned with __builtin_memcpy
Microbenchmarks on gfx1030 showed that __builtin_memcpy usually performs as well as or better than __memcpy_internal_aligned when the copy size is a compile-time constant, which is the case at the __memcpy_internal_aligned call sites once the pipe read/write functions are inlined. SWDEV-150027
1 parent 9c965bd commit 309edb4

File tree

4 files changed

+21
-61
lines changed

4 files changed

+21
-61
lines changed

amd/device-libs/opencl/src/pipes/memcpyia.cl

Lines changed: 0 additions & 55 deletions
This file was deleted.

amd/device-libs/opencl/src/pipes/pipes.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ F(32,ulong4) \
2222
F(64,ulong8) \
2323
F(128,ulong16)
2424

25+
// Assume that ptr is aligned to at least align bytes. In contrast to
26+
// __builtin_assume_aligned, this allows a non-constant alignment operand.
27+
#define ASSUME_ALIGNED(ptr, align) \
28+
__builtin_assume(__builtin_is_aligned(ptr, align))
29+
2530
struct pipeimp {
2631
atomic_size_t read_idx;
2732
atomic_size_t write_idx;
@@ -30,8 +35,6 @@ struct pipeimp {
3035
uchar packets[1];
3136
};
3237

33-
extern void __memcpy_internal_aligned(void *, const void *, size_t, size_t);
34-
3538
static __attribute__((always_inline)) size_t
3639
reserve(volatile __global atomic_size_t *pi, size_t lim, size_t n)
3740
{

amd/device-libs/opencl/src/pipes/readp.cl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,10 @@ __read_pipe_2(__global struct pipeimp* p, void* ptr, uint size, uint align)
4040
return -1;
4141

4242
size_t pi = wrap(ri, p->end_idx);
43-
__memcpy_internal_aligned(ptr, p->packets + pi*size, size, align);
43+
void *pipe_ptr = p->packets + pi * size;
44+
ASSUME_ALIGNED(ptr, align);
45+
ASSUME_ALIGNED(pipe_ptr, align);
46+
__builtin_memcpy(ptr, pipe_ptr, size);
4447

4548
if (ri == wi-1) {
4649
__opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
@@ -68,7 +71,10 @@ __read_pipe_4(__global struct pipeimp* p, reserve_id_t rid, uint i, void *ptr, u
6871
{
6972
size_t rin = __builtin_astype(rid, size_t) + i; \
7073
size_t pi = wrap(rin, p->end_idx);
71-
__memcpy_internal_aligned(ptr, p->packets + pi*size, size, align);
74+
void *pipe_ptr = p->packets + pi * size;
75+
ASSUME_ALIGNED(ptr, align);
76+
ASSUME_ALIGNED(pipe_ptr, align);
77+
__builtin_memcpy(ptr, pipe_ptr, size);
7278

7379
return 0;
7480
}

amd/device-libs/opencl/src/pipes/writep.cl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,10 @@ __write_pipe_2(__global struct pipeimp* p, const void* ptr, uint size, uint alig
3636
return -1;
3737

3838
size_t pi = wrap(wi, ei);
39-
__memcpy_internal_aligned(p->packets + pi*size, ptr, size, align);
39+
void *pipe_ptr = p->packets + pi * size;
40+
ASSUME_ALIGNED(pipe_ptr, align);
41+
ASSUME_ALIGNED(ptr, align);
42+
__builtin_memcpy(pipe_ptr, ptr, size);
4043

4144
return 0;
4245
}
@@ -58,7 +61,10 @@ __write_pipe_4(__global struct pipeimp* p, reserve_id_t rid, uint i, const void
5861
{
5962
size_t rin = __builtin_astype(rid, size_t) + i; \
6063
size_t pi = wrap(rin, p->end_idx);
61-
__memcpy_internal_aligned(p->packets + pi*size, ptr, size, align);
64+
void *pipe_ptr = p->packets + pi * size;
65+
ASSUME_ALIGNED(pipe_ptr, align);
66+
ASSUME_ALIGNED(ptr, align);
67+
__builtin_memcpy(pipe_ptr, ptr, size);
6268

6369
return 0;
6470
}

0 commit comments

Comments
 (0)