Skip to content

Commit d694430

Browse files
JonChesterfield authored and tstellar committed
[openmp] Annotate tmp variables with omp_thread_mem_alloc
Fixes miscompile of calls into ocml. Bug 51445.

The stack variable `double __tmp` is moved to dynamically allocated shared memory by CGOpenMPRuntimeGPU. This is usually fine, but when the variable is passed to a function that is explicitly annotated address_space(5) then allocating the variable off-stack leads to a miscompile in the back end, which cannot decide to move the variable back to the stack from shared.

This could be fixed by removing the AS(5) annotation from the math library or by explicitly marking the variables as thread_mem_alloc. The cast to AS(5) is still a no-op once IR is reached.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D107971

(cherry picked from commit dbd7bad)
1 parent 47bbdbe commit d694430

File tree

2 files changed

+54
-0
lines changed

2 files changed

+54
-0
lines changed

clang/lib/Headers/__clang_hip_math.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
#endif
2020
#include <limits.h>
2121
#include <stdint.h>
22+
#ifdef __OPENMP_AMDGCN__
23+
#include <omp.h>
24+
#endif
2225
#endif // !defined(__HIPCC_RTC__)
2326

2427
#pragma push_macro("__DEVICE__")
@@ -258,6 +261,9 @@ float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
258261
__DEVICE__
259262
float frexpf(float __x, int *__nptr) {
260263
int __tmp;
264+
#ifdef __OPENMP_AMDGCN__
265+
#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
266+
#endif
261267
float __r =
262268
__ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp);
263269
*__nptr = __tmp;
@@ -343,6 +349,9 @@ long int lroundf(float __x) { return __ocml_round_f32(__x); }
343349
__DEVICE__
344350
float modff(float __x, float *__iptr) {
345351
float __tmp;
352+
#ifdef __OPENMP_AMDGCN__
353+
#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
354+
#endif
346355
float __r =
347356
__ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
348357
*__iptr = __tmp;
@@ -423,6 +432,9 @@ float remainderf(float __x, float __y) {
423432
__DEVICE__
424433
float remquof(float __x, float __y, int *__quo) {
425434
int __tmp;
435+
#ifdef __OPENMP_AMDGCN__
436+
#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
437+
#endif
426438
float __r = __ocml_remquo_f32(
427439
__x, __y, (__attribute__((address_space(5))) int *)&__tmp);
428440
*__quo = __tmp;
@@ -479,6 +491,9 @@ __RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); }
479491
__DEVICE__
480492
void sincosf(float __x, float *__sinptr, float *__cosptr) {
481493
float __tmp;
494+
#ifdef __OPENMP_AMDGCN__
495+
#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
496+
#endif
482497
*__sinptr =
483498
__ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
484499
*__cosptr = __tmp;
@@ -487,6 +502,9 @@ void sincosf(float __x, float *__sinptr, float *__cosptr) {
487502
__DEVICE__
488503
void sincospif(float __x, float *__sinptr, float *__cosptr) {
489504
float __tmp;
505+
#ifdef __OPENMP_AMDGCN__
506+
#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
507+
#endif
490508
*__sinptr = __ocml_sincospi_f32(
491509
__x, (__attribute__((address_space(5))) float *)&__tmp);
492510
*__cosptr = __tmp;
@@ -799,6 +817,9 @@ double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
799817
__DEVICE__
800818
double frexp(double __x, int *__nptr) {
801819
int __tmp;
820+
#ifdef __OPENMP_AMDGCN__
821+
#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
822+
#endif
802823
double __r =
803824
__ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp);
804825
*__nptr = __tmp;
@@ -883,6 +904,9 @@ long int lround(double __x) { return __ocml_round_f64(__x); }
883904
__DEVICE__
884905
double modf(double __x, double *__iptr) {
885906
double __tmp;
907+
#ifdef __OPENMP_AMDGCN__
908+
#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
909+
#endif
886910
double __r =
887911
__ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);
888912
*__iptr = __tmp;
@@ -971,6 +995,9 @@ double remainder(double __x, double __y) {
971995
__DEVICE__
972996
double remquo(double __x, double __y, int *__quo) {
973997
int __tmp;
998+
#ifdef __OPENMP_AMDGCN__
999+
#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
1000+
#endif
9741001
double __r = __ocml_remquo_f64(
9751002
__x, __y, (__attribute__((address_space(5))) int *)&__tmp);
9761003
*__quo = __tmp;
@@ -1029,6 +1056,9 @@ double sin(double __x) { return __ocml_sin_f64(__x); }
10291056
__DEVICE__
10301057
void sincos(double __x, double *__sinptr, double *__cosptr) {
10311058
double __tmp;
1059+
#ifdef __OPENMP_AMDGCN__
1060+
#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
1061+
#endif
10321062
*__sinptr = __ocml_sincos_f64(
10331063
__x, (__attribute__((address_space(5))) double *)&__tmp);
10341064
*__cosptr = __tmp;
@@ -1037,6 +1067,9 @@ void sincos(double __x, double *__sinptr, double *__cosptr) {
10371067
__DEVICE__
10381068
void sincospi(double __x, double *__sinptr, double *__cosptr) {
10391069
double __tmp;
1070+
#ifdef __OPENMP_AMDGCN__
1071+
#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
1072+
#endif
10401073
*__sinptr = __ocml_sincospi_f64(
10411074
__x, (__attribute__((address_space(5))) double *)&__tmp);
10421075
*__cosptr = __tmp;
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#ifndef __OMP_H
2+
#define __OMP_H
3+
4+
#if _OPENMP
5+
// Follows the pattern in interface.h
6+
// Clang sema checks this type carefully, needs to closely match that from omp.h
7+
// Allocator handle enumeration: a null handle (0), eight named allocators
// numbered 1 through 8, and a final enumerator set to ~(0U).
// omp_thread_mem_alloc (8) is the allocator named by the
// `#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)` directives
// added to __clang_hip_math.h.
typedef enum omp_allocator_handle_t {
8+
omp_null_allocator = 0,
9+
omp_default_mem_alloc = 1,
10+
omp_large_cap_mem_alloc = 2,
11+
omp_const_mem_alloc = 3,
12+
omp_high_bw_mem_alloc = 4,
13+
omp_low_lat_mem_alloc = 5,
14+
omp_cgroup_mem_alloc = 6,
15+
omp_pteam_mem_alloc = 7,
16+
omp_thread_mem_alloc = 8,
17+
// NOTE(review): presumably an upper bound on handle values, going by the
// name; confirm against the definition in the real omp.h.
KMP_ALLOCATOR_MAX_HANDLE = ~(0U)
18+
} omp_allocator_handle_t;
19+
#endif
20+
21+
#endif

0 commit comments

Comments
 (0)