99#include "asan_util.h"
1010#include "shadow_mapping.h"
1111
12+ #define OPTNONE __attribute__((optnone))
13+
1214static const __constant uchar kAsanHeapLeftRedzoneMagic = (uchar )0xfa ;
1315static const __constant uint kAsanHeapLeftRedzoneMagicx4 = 0xfafafafaU ;
1416static const __constant ulong kAsanHeapLeftRedzoneMagicx8 = 0xfafafafafafafafaUL ;
@@ -30,8 +32,8 @@ extern ulong __ockl_devmem_request(ulong addr, ulong size);
3032#define AA (P ,V ) __opencl_atomic_fetch_add(P, V, memory_order_relaxed, memory_scope_device)
3133#define AN (P ,V ) __opencl_atomic_fetch_and(P, V, memory_order_relaxed, memory_scope_device)
3234#define AO (P ,V ) __opencl_atomic_fetch_or(P, V, memory_order_relaxed, memory_scope_device)
33- #define AX (P ,V ) __opencl_atomic_fetch_xor(P, V, memory_order_relaxed, memory_scope_device)
3435#define ACE (P ,E ,V ) __opencl_atomic_compare_exchange_strong(P, E, V, memory_order_relaxed, memory_order_relaxed, memory_scope_device)
36+ #define RF () __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent", "global")
3537#define ARF () __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent", "global")
3638
3739// An allocation
@@ -110,11 +112,6 @@ typedef struct heap_s {
110112 lifo_t la [NLA ]; // Storage for available slabs
111113} heap_t ;
112114
113- // Inhibit control flow optimizations
114- __attribute__((overloadable )) static int o0 (int x ) { int y ; __asm__ volatile ("" : "=v" (y ) : "0" (x )); return y ; }
115- __attribute__((overloadable )) static uint o0 (uint x ) { uint y ; __asm__ volatile ("" : "=v" (y ) : "0" (x )); return y ; }
116- __attribute__((overloadable )) static ulong o0 (ulong x ) { ulong y ; __asm__ volatile ("" : "=v" (y ) : "0" (x )); return y ; }
117-
118115// Overloads to broadcast the value held by the first active lane
119116// The result is known to be wave-uniform
120117static __attribute__((overloadable )) uint
@@ -198,13 +195,14 @@ slab_pause(void)
198195
199196
200197// Intended to be called from only one lane of a wave
198+ OPTNONE
201199NO_SANITIZE_ADDR
202200static void
203201put_free_slab (__global heap_t * hp , __global slab_t * sp )
204202{
205203 __global lifo_t * lp = LP (hp , AA (& hp -> wid , 1UL ));
206204
207- for (ulong i = 1 ;; ++ i ) {
205+ for (;; ) {
208206 ulong top = AL (& lp -> top );
209207 AS (& sp -> next , (ulong )slabptr (top ));
210208 if (ACE (& lp -> top , & top , addcnt ((ulong )sp , top ))) {
@@ -224,7 +222,7 @@ get_free_slab(__global heap_t *hp)
224222
225223 __global lifo_t * lp = LP (hp , AA (& hp -> rid , 1UL ));
226224
227- for (ulong i = 1 ;; ++ i ) {
225+ for (;; ) {
228226 ulong top = AL (& lp -> top );
229227 __global slab_t * sp = slabptr (top );
230228 if (sp ) {
@@ -236,6 +234,7 @@ get_free_slab(__global heap_t *hp)
236234 }
237235 slab_pause ();
238236 }
237+
239238}
240239
241240NO_SANITIZE_ADDR
@@ -268,6 +267,7 @@ slab_free(__global alloc_t *ap, ulong pc)
268267 unpublish_allocation (ap , pc );
269268 __global heap_t * hp = get_heap_ptr ();
270269 __global slab_t * sp = (__global slab_t * )ap -> sp ;
270+
271271 int go = 1 ;
272272 do {
273273 if (go ) {
@@ -277,9 +277,6 @@ slab_free(__global alloc_t *ap, ulong pc)
277277 if (aid == 0 ) {
278278 uint rb = AA (& sp -> rb , sz ) + sz ;
279279 if (rb == SLAB_BYTES - SLAB_HEADER_BYTES ) {
280- AO (& sp -> ap , (ulong )F_UNREADY );
281- ulong cs = (ulong )sp ;
282- ACE (& hp -> cs , & cs , 0UL );
283280 put_free_slab (hp , sp );
284281 }
285282 }
@@ -406,7 +403,8 @@ try_new_slab(__global heap_t *hp)
406403 __global slab_t * sp = obtain_new_slab (hp );
407404 if (sp ) {
408405 AS (& sp -> next , 0UL );
409- AS (& sp -> ap , (ulong )sp | (ulong )(F_POISON_PENDING | F_POISON_NEEDED | F_UNREADY ));
406+ AS (& sp -> rb , 0U );
407+ AS (& sp -> ap , (ulong )sp + (ulong )SLAB_HEADER_BYTES + (ulong )(F_UNREADY | F_POISON_PENDING | F_POISON_NEEDED ));
410408#if defined SLAB_IDENTITY
411409 AS (& sp -> sid , AA (& hp -> num_slab_allocations , 1UL ));
412410#else
@@ -429,11 +427,12 @@ new_slab_wait(__global heap_t *hp)
429427}
430428
431429// Called by a single workitem
430+ OPTNONE
432431NO_SANITIZE_ADDR
433432static __global slab_t *
434433get_current_slab (__global heap_t * hp )
435434{
436- for (ulong i = 1 ;; ++ i ) {
435+ for (;; ) {
437436 ulong cs = AL (& hp -> cs );
438437 if (cs )
439438 return (__global slab_t * )cs ;
@@ -485,6 +484,7 @@ poison_slab(__global slab_t *sp, int aid, int na)
485484
486485 for (int i = aid ; i < SLAB_BYTES / SHADOW_GRANULARITY / sizeof (ulong ); i += na )
487486 ssp [i ] = kAsanHeapLeftRedzoneMagicx8 ;
487+ RF ();
488488
489489 if (!aid )
490490 AN (& sp -> ap , ~(ulong )F_POISON_PENDING );
@@ -541,39 +541,47 @@ slab_malloc(ulong lsz, ulong pc)
541541
542542 ulong o = (ulong )__ockl_alisa_u32 (asz );
543543
544- ulong p = 0 ;
544+ ulong ap = 0 ;
545545 if (!aid )
546- p = AA (& cs -> ap , o );
547- p = first (p );
548-
549- if (!(p & (ulong )F_MASK )) {
550- if (p + o <= (ulong )cs + SLAB_BYTES ) {
551- ret = publish_allocation (p + o - asz , (ulong )cs , pc , asz , arz , usz );
552- go = 0 ;
553- } else {
554- if (!__ockl_activelane_u32 ()) {
555- ulong e = (ulong )cs ;
556- ACE (& hp -> cs , & e , 0UL );
557- }
558- if (p + o - asz < (ulong )cs + SLAB_BYTES ) {
559- uint unused = (uint )((ulong )cs + SLAB_BYTES - (p + o - asz ));
560- uint rb = AA (& cs -> rb , unused ) + unused ;
561- if (rb == SLAB_BYTES - SLAB_HEADER_BYTES ) {
562- AO (& cs -> ap , (ulong )F_UNREADY );
563- put_free_slab (hp , cs );
564- }
565- }
566- }
567- } else {
568- ulong newp = 0 ;
546+ ap = AL (& cs -> ap );
547+ ap = first (ap );
548+
549+ if (ap & (ulong )F_MASK ) {
550+ ulong p = 0 ;
569551 if (!aid )
570- newp = AN (& cs -> ap , ~(ulong )F_POISON_NEEDED );
571- newp = first (newp );
552+ p = AN (& cs -> ap , ~(ulong )F_POISON_NEEDED );
553+ p = first (p );
572554
573- if (newp & (ulong )F_POISON_NEEDED )
555+ if (p & (ulong )F_POISON_NEEDED )
574556 poison_slab (cs , aid , active_lane_count ());
575557 else
576558 slab_pause ();
559+ } else {
560+ ulong p = 0 ;
561+ if (!aid )
562+ p = AA (& cs -> ap , o );
563+ p = first (p );
564+
565+ if (!(p & (ulong )F_MASK )) {
566+ if (p + o <= (ulong )cs + SLAB_BYTES ) {
567+ ret = publish_allocation (p + o - asz , (ulong )cs , pc , asz , arz , usz );
568+ go = 0 ;
569+ } else {
570+ if (!__ockl_activelane_u32 ()) {
571+ ulong e = (ulong )cs ;
572+ ACE (& hp -> cs , & e , 0UL );
573+ AO (& cs -> ap , (ulong )F_UNREADY );
574+ }
575+ if (p + o - asz < (ulong )cs + SLAB_BYTES ) {
576+ uint unused = (uint )((ulong )cs + SLAB_BYTES - (p + o - asz ));
577+ uint rb = AA (& cs -> rb , unused ) + unused ;
578+ if (rb == SLAB_BYTES - SLAB_HEADER_BYTES ) {
579+ put_free_slab (hp , cs );
580+ }
581+ }
582+ }
583+ } else
584+ slab_pause ();
577585 }
578586 }
579587 } while (__ockl_wfany_i32 (go ));
0 commit comments