|
8 | 8 |
|
9 | 9 | #include <clc/mem_fence/clc_mem_fence.h> |
10 | 10 |
|
11 | | -void __clc_amdgcn_s_waitcnt(unsigned flags); |
// Expands to a switch that maps a (possibly runtime) memory_order value onto
// a call to __builtin_amdgcn_fence written with the matching literal
// __ATOMIC_* constant. Everything after memory_order (__VA_ARGS__: the scope
// string followed by any further string arguments) is forwarded unchanged.
//
// __ATOMIC_RELAXED expands to no fence at all: a relaxed fence imposes no
// ordering, so it is handled as a no-op rather than falling into the
// unreachable default. Any other unlisted value still reaches
// __builtin_unreachable().
//
// NOTE: the expansion deliberately ends with a bare `break;` after the inner
// switch, so this macro is only usable as the body of a case label inside an
// enclosing switch (this is how BUILTIN_FENCE uses it).
#define BUILTIN_FENCE_ORDER(memory_order, ...)                                 \
  switch (memory_order) {                                                      \
  case __ATOMIC_RELAXED:                                                       \
    /* no ordering requested: nothing to emit */                               \
    break;                                                                     \
  case __ATOMIC_ACQUIRE:                                                       \
    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, __VA_ARGS__);                     \
    break;                                                                     \
  case __ATOMIC_RELEASE:                                                       \
    __builtin_amdgcn_fence(__ATOMIC_RELEASE, __VA_ARGS__);                     \
    break;                                                                     \
  case __ATOMIC_ACQ_REL:                                                       \
    __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, __VA_ARGS__);                     \
    break;                                                                     \
  case __ATOMIC_SEQ_CST:                                                       \
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, __VA_ARGS__);                     \
    break;                                                                     \
  default:                                                                     \
    __builtin_unreachable();                                                   \
  }                                                                            \
  break;
12 | 29 |
|
13 | | -// s_waitcnt takes 16bit argument with a combined number of maximum allowed |
14 | | -// pending operations: |
15 | | -// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages |
16 | | -// [7] -- undefined |
17 | | -// [6:4] -- exports, GDS, and mem write |
18 | | -// [3:0] -- vector memory operations |
// Expands to a switch over memory_scope that picks the scope string handed to
// BUILTIN_FENCE_ORDER:
//   __MEMORY_SCOPE_SINGLE -> "singlethread"
//   __MEMORY_SCOPE_WVFRNT -> "wavefront"
//   __MEMORY_SCOPE_WRKGRP -> "workgroup"
//   __MEMORY_SCOPE_DEVICE -> "agent"
//   __MEMORY_SCOPE_SYSTEM and any other value -> ""
// Optional trailing arguments are appended after the scope string; the
// `##__VA_ARGS__` form drops the preceding comma when none are given.
// Each arm relies on BUILTIN_FENCE_ORDER ending with `break;`, so there is no
// fallthrough between cases.
#define BUILTIN_FENCE(memory_scope, memory_order, ...)                         \
  switch (memory_scope) {                                                      \
  case __MEMORY_SCOPE_SINGLE:                                                  \
    BUILTIN_FENCE_ORDER(memory_order, "singlethread", ##__VA_ARGS__)           \
  case __MEMORY_SCOPE_WVFRNT:                                                  \
    BUILTIN_FENCE_ORDER(memory_order, "wavefront", ##__VA_ARGS__)              \
  case __MEMORY_SCOPE_WRKGRP:                                                  \
    BUILTIN_FENCE_ORDER(memory_order, "workgroup", ##__VA_ARGS__)              \
  case __MEMORY_SCOPE_DEVICE:                                                  \
    BUILTIN_FENCE_ORDER(memory_order, "agent", ##__VA_ARGS__)                  \
  case __MEMORY_SCOPE_SYSTEM:                                                  \
  default:                                                                     \
    BUILTIN_FENCE_ORDER(memory_order, "", ##__VA_ARGS__)                       \
  }
19 | 44 |
|
20 | | -// Newer clang supports __builtin_amdgcn_s_waitcnt |
21 | | -#if __clang_major__ >= 5 |
22 | | -#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x) |
23 | | -#else |
24 | | -#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x) |
25 | | -_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt"); |
26 | | -#endif |
27 | | - |
28 | | -_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope, |
29 | | - int memory_order) { |
30 | | - if (memory_scope & __MEMORY_SCOPE_DEVICE) { |
31 | | - // scalar loads are counted with LGKM but we don't know whether |
32 | | - // the compiler turned any loads to scalar |
33 | | - __waitcnt(0); |
34 | | - } else if (memory_scope & __MEMORY_SCOPE_WRKGRP) |
35 | | - __waitcnt(0xff); // LGKM is [12:8] |
| 45 | +_CLC_OVERLOAD _CLC_DEF void |
| 46 | +__clc_mem_fence(int memory_scope, int memory_order, |
| 47 | + __CLC_MemorySemantics memory_semantics) { |
| 48 | + if (memory_semantics == __CLC_MEMORY_LOCAL) { |
| 49 | + BUILTIN_FENCE(memory_scope, memory_order, "local") |
| 50 | + } else if (memory_semantics == __CLC_MEMORY_GLOBAL) { |
| 51 | + BUILTIN_FENCE(memory_scope, memory_order, "global") |
| 52 | + } else if (memory_semantics == (__CLC_MEMORY_LOCAL | __CLC_MEMORY_GLOBAL)) { |
| 53 | + BUILTIN_FENCE(memory_scope, memory_order, "local", "global") |
| 54 | + } else { |
| 55 | + BUILTIN_FENCE(memory_scope, memory_order) |
| 56 | + } |
36 | 57 | } |
37 | | -#undef __waitcnt |
|
0 commit comments