Skip to content

Commit a247da4

Browse files
authored
[libclc] update __clc_mem_fence: add MemorySemantic arg and use __builtin_amdgcn_fence for AMDGPU (#152275)
AMDGPU requires an additional MemorySemantic argument, which identifies the memory or address space to which the memory ordering is applied. The MemorySemantic argument is also needed to implement the SPIR-V MemoryBarrier instruction, and the implementation of __clc_mem_fence on Intel GPUs requires it as well. Using __builtin_amdgcn_fence for AMDGPU is a follow-up to #151446 (comment). llvm-diff shows no change to nvptx64--nvidiacl.bc.
1 parent 2ff7a4c commit a247da4

File tree

12 files changed

+108
-44
lines changed

12 files changed

+108
-44
lines changed

libclc/clc/include/clc/mem_fence/clc_mem_fence.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010
#define __CLC_MEM_FENCE_CLC_MEM_FENCE_H__
1111

1212
#include <clc/internal/clc.h>
13+
#include <clc/mem_fence/clc_mem_semantic.h>
1314

14-
_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(int memory_scope,
15-
int memory_order);
15+
// Target-provided memory fence.
//   memory_scope     - a __MEMORY_SCOPE_* value selecting how widely the
//                      fence is visible.
//   memory_order     - an __ATOMIC_* ordering for the fence.
//   memory_semantics - the address space(s) to which the ordering is applied
//                      (see __CLC_MemorySemantics).
_CLC_OVERLOAD _CLC_DECL void
16+
__clc_mem_fence(int memory_scope, int memory_order,
17+
__CLC_MemorySemantics memory_semantics);
1618

1719
#endif // __CLC_MEM_FENCE_CLC_MEM_FENCE_H__
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__
10+
#define __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__
11+
12+
// The memory or address space to which the memory ordering is applied.
// Every enumerator occupies its own bit (1 << n), so values can be OR-ed
// together to name several address spaces at once, for example
// __CLC_MEMORY_LOCAL | __CLC_MEMORY_GLOBAL.
13+
typedef enum __CLC_MemorySemantics {
14+
__CLC_MEMORY_PRIVATE = 1 << 0,
15+
__CLC_MEMORY_GLOBAL = 1 << 1,
16+
__CLC_MEMORY_CONSTANT = 1 << 2,
17+
__CLC_MEMORY_LOCAL = 1 << 3,
18+
__CLC_MEMORY_GENERIC = 1 << 4,
19+
} __CLC_MemorySemantics;
20+
21+
#endif // __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__

libclc/clc/include/clc/synchronization/clc_work_group_barrier.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010
#define __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__
1111

1212
#include <clc/internal/clc.h>
13+
#include <clc/mem_fence/clc_mem_semantic.h>
1314

14-
_CLC_OVERLOAD _CLC_DECL void __clc_work_group_barrier(int memory_scope,
15-
int memory_order);
15+
// Target-provided work-group barrier. Takes the same three arguments as
// __clc_mem_fence: a __MEMORY_SCOPE_* scope, an __ATOMIC_* order, and the
// address space(s) the ordering is applied to.
_CLC_OVERLOAD _CLC_DECL void
16+
__clc_work_group_barrier(int memory_scope, int memory_order,
17+
__CLC_MemorySemantics memory_semantics);
1618

1719
#endif // __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__

libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl

Lines changed: 44 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,30 +8,50 @@
88

99
#include <clc/mem_fence/clc_mem_fence.h>
1010

11-
void __clc_amdgcn_s_waitcnt(unsigned flags);
11+
// Expands to a switch on `memory_order` that calls __builtin_amdgcn_fence
// with the matching __ATOMIC_* constant written out literally, followed by
// the forwarded variadic arguments (the scope string and any address-space
// strings). Any other order value reaches __builtin_unreachable().
// The trailing `break;` after the switch is deliberate: BUILTIN_FENCE places
// this macro directly under a `case` label and relies on it to leave the
// outer switch.
// NOTE(review): the per-order switch presumably exists because the builtin
// needs a compile-time constant ordering -- confirm against the Clang docs.
#define BUILTIN_FENCE_ORDER(memory_order, ...)                                 \
12+
switch (memory_order) {                                                     \
13+
case __ATOMIC_ACQUIRE:                                                      \
14+
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, __VA_ARGS__);                    \
15+
break;                                                                    \
16+
case __ATOMIC_RELEASE:                                                      \
17+
__builtin_amdgcn_fence(__ATOMIC_RELEASE, __VA_ARGS__);                    \
18+
break;                                                                    \
19+
case __ATOMIC_ACQ_REL:                                                      \
20+
__builtin_amdgcn_fence(__ATOMIC_ACQ_REL, __VA_ARGS__);                    \
21+
break;                                                                    \
22+
case __ATOMIC_SEQ_CST:                                                      \
23+
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, __VA_ARGS__);                    \
24+
break;                                                                    \
25+
default:                                                                    \
26+
__builtin_unreachable();                                                  \
27+
}                                                                           \
28+
break;
1229

13-
// s_waitcnt takes 16bit argument with a combined number of maximum allowed
14-
// pending operations:
15-
// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages
16-
// [7] -- undefined
17-
// [6:4] -- exports, GDS, and mem write
18-
// [3:0] -- vector memory operations
30+
// Expands to a switch on `memory_scope` that picks the scope string handed
// to __builtin_amdgcn_fence:
//   __MEMORY_SCOPE_DEVICE -> "agent"
//   __MEMORY_SCOPE_WRKGRP -> "workgroup"
//   __MEMORY_SCOPE_WVFRNT -> "wavefront"
//   __MEMORY_SCOPE_SINGLE -> "singlethread"
//   __MEMORY_SCOPE_SYSTEM and every other value -> ""
// Extra macro arguments (address-space strings such as "local" or "global")
// are forwarded; `##__VA_ARGS__` drops the preceding comma when none are
// given. No `break` is written here because every BUILTIN_FENCE_ORDER
// expansion already ends with one.
// NOTE(review): cases are matched by equality, whereas
// __opencl_get_memory_scope builds its result with `|=`; verify that an
// OR-ed scope value selects the intended case.
#define BUILTIN_FENCE(memory_scope, memory_order, ...)                         \
31+
switch (memory_scope) {                                                      \
32+
case __MEMORY_SCOPE_DEVICE:                                                  \
33+
BUILTIN_FENCE_ORDER(memory_order, "agent", ##__VA_ARGS__)                  \
34+
case __MEMORY_SCOPE_WRKGRP:                                                  \
35+
BUILTIN_FENCE_ORDER(memory_order, "workgroup", ##__VA_ARGS__)              \
36+
case __MEMORY_SCOPE_WVFRNT:                                                  \
37+
BUILTIN_FENCE_ORDER(memory_order, "wavefront", ##__VA_ARGS__)              \
38+
case __MEMORY_SCOPE_SINGLE:                                                  \
39+
BUILTIN_FENCE_ORDER(memory_order, "singlethread", ##__VA_ARGS__)           \
40+
case __MEMORY_SCOPE_SYSTEM:                                                  \
41+
default:                                                                     \
42+
BUILTIN_FENCE_ORDER(memory_order, "", ##__VA_ARGS__)                       \
43+
}
1944

20-
// Newer clang supports __builtin_amdgcn_s_waitcnt
21-
#if __clang_major__ >= 5
22-
#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x)
23-
#else
24-
#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x)
25-
_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt");
26-
#endif
27-
28-
_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope,
29-
int memory_order) {
30-
if (memory_scope & __MEMORY_SCOPE_DEVICE) {
31-
// scalar loads are counted with LGKM but we don't know whether
32-
// the compiler turned any loads to scalar
33-
__waitcnt(0);
34-
} else if (memory_scope & __MEMORY_SCOPE_WRKGRP)
35-
__waitcnt(0xff); // LGKM is [12:8]
45+
// AMDGPU implementation of __clc_mem_fence.
// Picks the address-space arguments for the fence from `memory_semantics`
// and leaves scope and order selection to BUILTIN_FENCE:
//   exactly __CLC_MEMORY_LOCAL                          -> "local"
//   exactly __CLC_MEMORY_GLOBAL                         -> "global"
//   exactly (__CLC_MEMORY_LOCAL | __CLC_MEMORY_GLOBAL)  -> "local", "global"
//   anything else                                       -> no address-space
//                                                          argument
// The comparisons are equality tests, so a value carrying any additional bit
// (for example __CLC_MEMORY_PRIVATE) takes the final branch.
_CLC_OVERLOAD _CLC_DEF void
46+
__clc_mem_fence(int memory_scope, int memory_order,
47+
__CLC_MemorySemantics memory_semantics) {
48+
if (memory_semantics == __CLC_MEMORY_LOCAL) {
49+
BUILTIN_FENCE(memory_scope, memory_order, "local")
50+
} else if (memory_semantics == __CLC_MEMORY_GLOBAL) {
51+
BUILTIN_FENCE(memory_scope, memory_order, "global")
52+
} else if (memory_semantics == (__CLC_MEMORY_LOCAL | __CLC_MEMORY_GLOBAL)) {
53+
BUILTIN_FENCE(memory_scope, memory_order, "local", "global")
54+
} else {
55+
BUILTIN_FENCE(memory_scope, memory_order)
56+
}
3657
}
37-
#undef __waitcnt

libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99
#include <clc/mem_fence/clc_mem_fence.h>
1010
#include <clc/synchronization/clc_work_group_barrier.h>
1111

12-
_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope,
13-
int memory_order) {
14-
__clc_mem_fence(memory_scope, memory_order);
12+
// AMDGPU implementation of __clc_work_group_barrier.
// Forwards all three arguments to __clc_mem_fence first, then issues
// __builtin_amdgcn_s_barrier().
_CLC_OVERLOAD _CLC_DEF void
13+
__clc_work_group_barrier(int memory_scope, int memory_order,
14+
__CLC_MemorySemantics memory_semantics) {
15+
__clc_mem_fence(memory_scope, memory_order, memory_semantics);
1516
__builtin_amdgcn_s_barrier();
1617
}

libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88

99
#include <clc/mem_fence/clc_mem_fence.h>
1010

11-
_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope,
12-
int memory_order) {
11+
// PTX/NVIDIA implementation of __clc_mem_fence.
// Calls __nvvm_membar_cta() when `memory_scope` shares any set bit with
// (__MEMORY_SCOPE_DEVICE | __MEMORY_SCOPE_WRKGRP); otherwise does nothing.
// `memory_order` and `memory_semantics` are accepted to match the common
// signature but are not consulted here.
_CLC_OVERLOAD _CLC_DEF void
12+
__clc_mem_fence(int memory_scope, int memory_order,
13+
__CLC_MemorySemantics memory_semantics) {
1314
if (memory_scope & (__MEMORY_SCOPE_DEVICE | __MEMORY_SCOPE_WRKGRP))
1415
__nvvm_membar_cta();
1516
}

libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88

99
#include <clc/synchronization/clc_work_group_barrier.h>
1010

11-
_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope,
12-
int memory_order) {
11+
// PTX/NVIDIA implementation of __clc_work_group_barrier.
// All three arguments are accepted to match the common signature but are
// unused; the body is a single __syncthreads() call.
_CLC_OVERLOAD _CLC_DEF void
12+
__clc_work_group_barrier(int memory_scope, int memory_order,
13+
__CLC_MemorySemantics memory_semantics) {
1314
__syncthreads();
1415
}

libclc/opencl/include/clc/opencl/synchronization/utils.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010
#define __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__
1111

1212
#include <clc/internal/clc.h>
13+
#include <clc/mem_fence/clc_mem_semantic.h>
1314
#include <clc/opencl/synchronization/cl_mem_fence_flags.h>
1415

15-
_CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) {
16+
_CLC_INLINE int __opencl_get_memory_scope(cl_mem_fence_flags flag) {
1617
int memory_scope = 0;
1718
if (flag & CLK_GLOBAL_MEM_FENCE)
1819
memory_scope |= __MEMORY_SCOPE_DEVICE;
@@ -21,4 +22,15 @@ _CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) {
2122
return memory_scope;
2223
}
2324

25+
// Translates OpenCL fence flags into a __CLC_MemorySemantics value.
// Returns __CLC_MEMORY_LOCAL | __CLC_MEMORY_GLOBAL when both
// CLK_LOCAL_MEM_FENCE and CLK_GLOBAL_MEM_FENCE are set, otherwise the single
// value matching whichever of the two is set.
// If neither bit is set, control reaches __builtin_unreachable(), so callers
// must pass at least one of those two flags.
_CLC_INLINE __CLC_MemorySemantics
26+
__opencl_get_memory_semantics(cl_mem_fence_flags flag) {
27+
if ((flag & CLK_LOCAL_MEM_FENCE) && (flag & CLK_GLOBAL_MEM_FENCE))
28+
return __CLC_MEMORY_LOCAL | __CLC_MEMORY_GLOBAL;
29+
if (flag & CLK_LOCAL_MEM_FENCE)
30+
return __CLC_MEMORY_LOCAL;
31+
if (flag & CLK_GLOBAL_MEM_FENCE)
32+
return __CLC_MEMORY_GLOBAL;
33+
__builtin_unreachable();
34+
}
35+
2436
#endif // __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__

libclc/opencl/lib/amdgcn/mem_fence/fence.cl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@
1111
#include <clc/opencl/synchronization/utils.h>
1212

1313
// OpenCL mem_fence for AMDGPU.
// Derives the scope and the address-space semantics from `flags` through the
// __opencl_get_memory_scope / __opencl_get_memory_semantics helpers, always
// uses __ATOMIC_SEQ_CST as the order, and forwards to __clc_mem_fence.
_CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) {
14-
int memory_scope = getCLCMemoryScope(flags);
14+
int memory_scope = __opencl_get_memory_scope(flags);
1515
int memory_order = __ATOMIC_SEQ_CST;
16-
__clc_mem_fence(memory_scope, memory_order);
16+
__CLC_MemorySemantics memory_semantics = __opencl_get_memory_semantics(flags);
17+
__clc_mem_fence(memory_scope, memory_order, memory_semantics);
1718
}
1819

1920
// We don't have a separate mechanism for read and write fences

libclc/opencl/lib/amdgcn/synchronization/barrier.cl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
#include <clc/synchronization/clc_work_group_barrier.h>
1212

1313
// OpenCL barrier for AMDGPU.
// Derives the scope and the address-space semantics from `flags` through the
// __opencl_get_memory_scope / __opencl_get_memory_semantics helpers, always
// uses __ATOMIC_SEQ_CST as the order, and forwards to
// __clc_work_group_barrier.
_CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) {
14-
int memory_scope = getCLCMemoryScope(flags);
14+
int memory_scope = __opencl_get_memory_scope(flags);
1515
int memory_order = __ATOMIC_SEQ_CST;
16-
__clc_work_group_barrier(memory_scope, memory_order);
16+
__CLC_MemorySemantics memory_semantics = __opencl_get_memory_semantics(flags);
17+
__clc_work_group_barrier(memory_scope, memory_order, memory_semantics);
1718
}

0 commit comments

Comments
 (0)