Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions src/acc/opencl/acc_opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ void c_dbcsr_acc_opencl_configure(void) {
# endif
# if defined(ACC_OPENCL_XHINTS)
const char* const env_xhints = (ACC_OPENCL_XHINTS);
const int xhints_default = 1 + 2 + 4 + 8;
const int xhints_default = 1 + 2 + 4 + 8 + 16;
# else
const char* const env_xhints = NULL;
const int xhints_default = 0;
Expand Down Expand Up @@ -295,7 +295,7 @@ void c_dbcsr_acc_opencl_configure(void) {
if ((1 & c_dbcsr_acc_opencl_config.wa) && NULL == getenv("ZE_FLAT_DEVICE_HIERARCHY")) {
ACC_OPENCL_EXPECT(0 == LIBXSMM_PUTENV(apply[0]));
}
# if (0 == ACC_OPENCL_USM)
# if (1 >= ACC_OPENCL_USM)
if ((2 & c_dbcsr_acc_opencl_config.wa) && NULL == getenv("EnableRecoverablePageFaults")) {
ACC_OPENCL_EXPECT(0 == LIBXSMM_PUTENV(apply[1]));
}
Expand Down Expand Up @@ -605,11 +605,14 @@ int c_dbcsr_acc_init(void) {
# endif
0 > c_dbcsr_acc_opencl_config.profile)
{
const char* const env_qsize = getenv("ACC_OPENCL_PROFILE_QSIZE");
const int psize = (NULL == env_qsize ? 0 : atoi(env_qsize));
const int qsize = (0 >= psize ? 1024 : LIBXSMM_MIN(psize, 65536));
const int profile = LIBXSMM_MAX(LIBXSMM_ABS(c_dbcsr_acc_opencl_config.profile), 2);
const c_dbcsr_acc_opencl_hist_update_fn update[] = {c_dbcsr_acc_opencl_hist_avg, c_dbcsr_acc_opencl_hist_add};
c_dbcsr_acc_opencl_hist_create(&c_dbcsr_acc_opencl_config.hist_h2d, profile + 1, profile * 4, 2, update);
c_dbcsr_acc_opencl_hist_create(&c_dbcsr_acc_opencl_config.hist_d2h, profile + 1, profile * 4, 2, update);
c_dbcsr_acc_opencl_hist_create(&c_dbcsr_acc_opencl_config.hist_d2d, profile + 1, profile * 4, 2, update);
c_dbcsr_acc_opencl_hist_create(&c_dbcsr_acc_opencl_config.hist_h2d, profile + 1, qsize, 2, update);
c_dbcsr_acc_opencl_hist_create(&c_dbcsr_acc_opencl_config.hist_d2h, profile + 1, qsize, 2, update);
c_dbcsr_acc_opencl_hist_create(&c_dbcsr_acc_opencl_config.hist_d2d, profile + 1, qsize, 2, update);
}
else {
assert(NULL == c_dbcsr_acc_opencl_config.hist_h2d);
Expand Down Expand Up @@ -679,7 +682,8 @@ LIBXSMM_ATTRIBUTE_CTOR void c_dbcsr_acc_opencl_init(void) {
LIBXSMM_ATTRIBUTE_DTOR void c_dbcsr_acc_opencl_finalize(void) {
assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_MAXNDEVS);
if (0 != c_dbcsr_acc_opencl_config.ndevices) {
int precision[] = {0, 1}, i;
const int precision[] = {0, 1};
int i;
LIBXSMM_STDIO_ACQUIRE();
c_dbcsr_acc_opencl_hist_print(stderr, c_dbcsr_acc_opencl_config.hist_h2d, "\nPROF ACC/OpenCL: H2D", precision, NULL /*adjust*/);
c_dbcsr_acc_opencl_hist_print(stderr, c_dbcsr_acc_opencl_config.hist_d2h, "\nPROF ACC/OpenCL: D2H", precision, NULL /*adjust*/);
Expand Down Expand Up @@ -1151,7 +1155,7 @@ int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_i
cl_platform_id platform = NULL;
cl_bitfield bitfield = 0;
if (0 != (1 & c_dbcsr_acc_opencl_config.xhints) && 2 <= *devinfo->std_level && 0 != devinfo->intel &&
0 == c_dbcsr_acc_opencl_config.profile && 0 == devinfo->unified &&
/*0 == c_dbcsr_acc_opencl_config.profile &&*/ (0 == devinfo->unified || NULL != (ACC_OPENCL_XHINTS)) &&
EXIT_SUCCESS == clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL) &&
EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 2 /*platform vendor*/) &&
EXIT_SUCCESS == clGetDeviceInfo(active_id, 0x4191 /*CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL*/, sizeof(cl_bitfield),
Expand Down
8 changes: 7 additions & 1 deletion src/acc/opencl/acc_opencl.h
Original file line number Diff line number Diff line change
Expand Up @@ -296,11 +296,17 @@ typedef struct c_dbcsr_acc_opencl_device_t {
cl_int (*clMemFreeINTEL)(cl_context, void*);
} c_dbcsr_acc_opencl_device_t;

/**
 * Kind of memory transfer an event belongs to (used for transfer profiling).
 * The memcpy routines pack this value into the two most significant bits of
 * a 64-bit callback argument (kind << 62, with the transfer size occupying
 * the lower bits), so every enumerator must stay within the range 0..3.
 */
typedef enum c_dbcsr_acc_event_kind_t {
  c_dbcsr_acc_event_kind_none = 0, /* no transfer kind attached */
  c_dbcsr_acc_event_kind_h2d = 1, /* host-to-device copy */
  c_dbcsr_acc_event_kind_d2h = 2, /* device-to-host copy */
  c_dbcsr_acc_event_kind_d2d = 3 /* device-to-device copy */
} c_dbcsr_acc_event_kind_t;

/**
 * Information about host/device-memory pointer: associates an OpenCL memory
 * object with a raw pointer value.
 */
typedef struct c_dbcsr_acc_opencl_info_memptr_t {
/* OpenCL memory object; the memcpy routines hand it to the clEnqueue*Buffer calls.
 * NOTE(review): the original author marks the position as significant ("first item!");
 * the reason is not visible here, so do not reorder the members. */
cl_mem memory; /* first item! */
/* Raw pointer belonging to this entry.
 * NOTE(review): presumably the pointer value given to callers for `memory` - confirm. */
void* memptr;
/* Disabled member: the only visible uses are themselves commented out (storing a host timer tick). */
/*void *data;*/
} c_dbcsr_acc_opencl_info_memptr_t;

/** Enumeration of FP-atomic kinds. */
Expand Down
2 changes: 1 addition & 1 deletion src/acc/opencl/acc_opencl_event.c
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ int c_dbcsr_acc_event_synchronize(void* event) { /* waits on the host-side */
if (NULL != clevent) {
if (0 == (32 & c_dbcsr_acc_opencl_config.wa)) {
cl_int status = CL_COMPLETE + 1;
if (32 & c_dbcsr_acc_opencl_config.xhints) {
if (64 & c_dbcsr_acc_opencl_config.xhints) {
result = clGetEventInfo(clevent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL);
assert(EXIT_SUCCESS == result || CL_COMPLETE != status);
}
Expand Down
70 changes: 35 additions & 35 deletions src/acc/opencl/acc_opencl_mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream)
int memflags = CL_MEM_ALLOC_HOST_PTR;
nbytes += alignment + size_meminfo - 1;
# if defined(ACC_OPENCL_XHINTS)
if (0 != (4 & c_dbcsr_acc_opencl_config.xhints) && (0 != devinfo->nv || NULL != (ACC_OPENCL_XHINTS))) {
if (0 != (8 & c_dbcsr_acc_opencl_config.xhints) && (0 != devinfo->nv || NULL != (ACC_OPENCL_XHINTS))) {
host_ptr = ACC_OPENCL_MEM_ALLOC(nbytes, alignment);
if (NULL != host_ptr) memflags = CL_MEM_USE_HOST_PTR;
}
Expand All @@ -316,7 +316,7 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream)
if (NULL == host_ptr) {
mapped = clEnqueueMapBuffer(str->queue, memory, CL_TRUE /*always block*/,
# if defined(ACC_OPENCL_XHINTS) && (defined(CL_VERSION_1_2) || defined(CL_MAP_WRITE_INVALIDATE_REGION))
(16 & c_dbcsr_acc_opencl_config.xhints) ? CL_MAP_WRITE_INVALIDATE_REGION :
(32 & c_dbcsr_acc_opencl_config.xhints) ? CL_MAP_WRITE_INVALIDATE_REGION :
# endif
(CL_MAP_READ | CL_MAP_WRITE),
0 /*offset*/, nbytes, 0, NULL, NULL, &result);
Expand Down Expand Up @@ -396,33 +396,35 @@ void CL_CALLBACK c_dbcsr_acc_memcpy_notify(cl_event /*event*/, cl_int /*event_st
void CL_CALLBACK c_dbcsr_acc_memcpy_notify(cl_event event, cl_int event_status, void* data) {
int result = EXIT_SUCCESS;
const double durdev = c_dbcsr_acc_opencl_duration(event, &result);
c_dbcsr_acc_opencl_info_memptr_t info;
cl_command_type type;
size_t size = 0, offset = 0;
cl_command_type type = CL_COMMAND_SVM_MEMCPY;
LIBXSMM_UNUSED(event_status);
assert(CL_COMPLETE == event_status && NULL != data);
if (EXIT_SUCCESS == result && EXIT_SUCCESS == clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(type), &type, NULL) &&
EXIT_SUCCESS == c_dbcsr_acc_opencl_info_devptr_lock(&info, NULL, data, 1 /*elsize*/, NULL /*amount*/, &offset) &&
EXIT_SUCCESS == clGetMemObjectInfo(info.memory, CL_MEM_SIZE, sizeof(size_t), &size, NULL) && offset <= size)
{
/*const double durhst = libxsmm_timer_duration((libxsmm_timer_tickint)info.data, libxsmm_timer_tick());
const double durtot = durdev - LIBXSMM_MIN(durdev, durhst);*/
const size_t amount = size - offset;
const double vals[] = {(double)amount, durdev};
const int mb = (int)((amount + (1 << 19)) >> 20);
assert(CL_COMPLETE == event_status && NULL != data && 8 == sizeof(data));
if (EXIT_SUCCESS == result && EXIT_SUCCESS == clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(type), &type, NULL)) {
const size_t size = 0x3FFFFFFFFFFFFFFF & (size_t)data;
const int kind = (int)(((size_t)data) >> 62);
const double vals[] = {(double)size, durdev};
const int mb = (int)((size + (1 << 19)) >> 20);
if (CL_COMMAND_WRITE_BUFFER != type && CL_COMMAND_READ_BUFFER != type && CL_COMMAND_COPY_BUFFER != type) {
switch (kind) {
case c_dbcsr_acc_event_kind_h2d: type = CL_COMMAND_WRITE_BUFFER; break;
case c_dbcsr_acc_event_kind_d2h: type = CL_COMMAND_READ_BUFFER; break;
case c_dbcsr_acc_event_kind_d2d: type = CL_COMMAND_COPY_BUFFER; break;
default: assert(c_dbcsr_acc_event_kind_none == kind); /* should not happen */
}
}
switch (type) {
case CL_COMMAND_WRITE_BUFFER: {
assert(NULL != c_dbcsr_acc_opencl_config.hist_h2d);
assert(NULL != c_dbcsr_acc_opencl_config.hist_h2d && c_dbcsr_acc_event_kind_h2d == kind);
c_dbcsr_acc_opencl_hist_set(c_dbcsr_acc_opencl_config.lock_memory, c_dbcsr_acc_opencl_config.hist_h2d, vals);
if (0 > c_dbcsr_acc_opencl_config.profile) fprintf(stderr, "PROF ACC/OpenCL: H2D mb=%i us=%.0f\n", mb, durdev * 1E6);
} break;
case CL_COMMAND_READ_BUFFER: {
assert(NULL != c_dbcsr_acc_opencl_config.hist_d2h);
assert(NULL != c_dbcsr_acc_opencl_config.hist_d2h && c_dbcsr_acc_event_kind_d2h == kind);
c_dbcsr_acc_opencl_hist_set(c_dbcsr_acc_opencl_config.lock_memory, c_dbcsr_acc_opencl_config.hist_d2h, vals);
if (0 > c_dbcsr_acc_opencl_config.profile) fprintf(stderr, "PROF ACC/OpenCL: D2H mb=%i us=%.0f\n", mb, durdev * 1E6);
} break;
case CL_COMMAND_COPY_BUFFER: {
assert(NULL != c_dbcsr_acc_opencl_config.hist_d2d);
assert(NULL != c_dbcsr_acc_opencl_config.hist_d2d && c_dbcsr_acc_event_kind_d2d == kind);
c_dbcsr_acc_opencl_hist_set(c_dbcsr_acc_opencl_config.lock_memory, c_dbcsr_acc_opencl_config.hist_d2d, vals);
if (0 > c_dbcsr_acc_opencl_config.profile) fprintf(stderr, "PROF ACC/OpenCL: D2D mb=%i us=%.0f\n", mb, durdev * 1E6);
} break;
Expand Down Expand Up @@ -489,7 +491,7 @@ int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes) {
{
# if defined(ACC_OPENCL_XHINTS)
const int devuid = devinfo->uid, devuids = (0x4905 == devuid || 0x020a == devuid || (0x0bd0 <= devuid && 0x0bdb >= devuid));
const int try_flag = ((0 != (8 & c_dbcsr_acc_opencl_config.xhints) && 0 != devinfo->intel && 0 == devinfo->unified &&
const int try_flag = ((0 != (16 & c_dbcsr_acc_opencl_config.xhints) && 0 != devinfo->intel && 0 == devinfo->unified &&
(devuids || NULL != (ACC_OPENCL_XHINTS)))
? (1u << 22)
: 0);
Expand Down Expand Up @@ -667,7 +669,8 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v
assert(NULL != str);
# if (1 >= ACC_OPENCL_USM)
if (NULL != devinfo->clEnqueueMemcpyINTEL) {
result = devinfo->clEnqueueMemcpyINTEL(str->queue, finish, dev_mem, host_mem, nbytes, 0, NULL, NULL);
result = devinfo->clEnqueueMemcpyINTEL(
str->queue, finish, dev_mem, host_mem, nbytes, 0, NULL, NULL == c_dbcsr_acc_opencl_config.hist_h2d ? NULL : &event);
}
else
# endif
Expand All @@ -690,18 +693,18 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v
if (NULL != info) {
result = clEnqueueWriteBuffer(str->queue, info->memory, finish, offset, nbytes, host_mem, 0, NULL,
NULL == c_dbcsr_acc_opencl_config.hist_h2d ? NULL : &event);
/*if (NULL != event && EXIT_SUCCESS == result) info->data = (void*)libxsmm_timer_tick();*/
}
else result = EXIT_FAILURE;
}
ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_memory);
if (NULL != event) { /* c_dbcsr_acc_memcpy_notify must be outside of locked region */
if (EXIT_SUCCESS == result) {
void* const data = (void*)(nbytes | ((size_t)c_dbcsr_acc_event_kind_h2d) << 62);
assert(NULL != c_dbcsr_acc_opencl_config.hist_h2d);
if (!finish) { /* asynchronous */
result = clSetEventCallback(event, CL_COMPLETE, c_dbcsr_acc_memcpy_notify, dev_mem);
result = clSetEventCallback(event, CL_COMPLETE, c_dbcsr_acc_memcpy_notify, data);
}
else c_dbcsr_acc_memcpy_notify(event, CL_COMPLETE, dev_mem); /* synchronous */
else c_dbcsr_acc_memcpy_notify(event, CL_COMPLETE, data); /* synchronous */
}
else ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseEvent(event));
}
Expand Down Expand Up @@ -818,16 +821,16 @@ int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, v
else {
result = c_dbcsr_acc_opencl_memcpy_d2h(
info->memory, host_mem, offset, nbytes, str->queue, finish, NULL == c_dbcsr_acc_opencl_config.hist_d2h ? NULL : &event);
/*if (NULL != event && EXIT_SUCCESS == result) info->data = (void*)libxsmm_timer_tick();*/
}
ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_memory);
if (NULL != event) { /* c_dbcsr_acc_memcpy_notify must be outside of locked region */
if (EXIT_SUCCESS == result) {
assert(NULL != c_dbcsr_acc_opencl_config.hist_d2h /*&& NULL == c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL*/);
void* const data = (void*)(nbytes | ((size_t)c_dbcsr_acc_event_kind_d2h) << 62);
assert(NULL != c_dbcsr_acc_opencl_config.hist_d2h);
if (!finish) { /* asynchronous */
result = clSetEventCallback(event, CL_COMPLETE, c_dbcsr_acc_memcpy_notify, nconst.ptr);
result = clSetEventCallback(event, CL_COMPLETE, c_dbcsr_acc_memcpy_notify, data);
}
else c_dbcsr_acc_memcpy_notify(event, CL_COMPLETE, nconst.ptr); /* synchronous */
else c_dbcsr_acc_memcpy_notify(event, CL_COMPLETE, data); /* synchronous */
}
else ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseEvent(event));
}
Expand Down Expand Up @@ -867,7 +870,8 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt
assert(NULL != str && NULL != devinfo->context);
# if (1 >= ACC_OPENCL_USM)
if (NULL != devinfo->clEnqueueMemcpyINTEL) {
result = devinfo->clEnqueueMemcpyINTEL(str->queue, CL_FALSE /*blocking*/, devmem_dst, devmem_src, nbytes, 0, NULL, pevent);
result = devinfo->clEnqueueMemcpyINTEL(str->queue, CL_FALSE /*blocking*/, devmem_dst, devmem_src, nbytes, 0, NULL,
NULL == c_dbcsr_acc_opencl_config.hist_d2d ? pevent : &event);
}
else
# endif
Expand All @@ -892,26 +896,22 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt
if (NULL != info_src && NULL != info_dst) {
result = clEnqueueCopyBuffer(str->queue, info_src->memory, info_dst->memory, offset_src, offset_dst, nbytes, 0, NULL,
NULL == c_dbcsr_acc_opencl_config.hist_d2d ? pevent : &event);
/*if (NULL != event && EXIT_SUCCESS == result && NULL != c_dbcsr_acc_opencl_config.hist_d2d) {
info_src->data = (void*)libxsmm_timer_tick();
}*/
}
else result = EXIT_FAILURE;
}
ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_memory);
if (NULL != event) { /* c_dbcsr_acc_memcpy_notify must be outside of locked region */
if (EXIT_SUCCESS == result) {
void* const data = (void*)(nbytes | ((size_t)c_dbcsr_acc_event_kind_d2d) << 62);
if (NULL == pevent) { /* asynchronous */
assert(NULL == devinfo->clEnqueueMemcpyINTEL);
assert(NULL != c_dbcsr_acc_opencl_config.hist_d2d);
result = clSetEventCallback(event, CL_COMPLETE, c_dbcsr_acc_memcpy_notify, nconst.ptr);
result = clSetEventCallback(event, CL_COMPLETE, c_dbcsr_acc_memcpy_notify, data);
}
else { /* synchronous */
result = clWaitForEvents(1, &event);
if (EXIT_SUCCESS == result) {
if (NULL != c_dbcsr_acc_opencl_config.hist_d2d) {
assert(NULL == devinfo->clEnqueueMemcpyINTEL);
c_dbcsr_acc_memcpy_notify(event, CL_COMPLETE, nconst.ptr);
c_dbcsr_acc_memcpy_notify(event, CL_COMPLETE, data);
}
else result = clReleaseEvent(event);
}
Expand Down
19 changes: 11 additions & 8 deletions src/acc/opencl/acc_opencl_stream.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,21 +122,29 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) {
# endif
{
const cl_device_id device_id = c_dbcsr_acc_opencl_config.devices[c_dbcsr_acc_opencl_config.device_id];
if (NULL != c_dbcsr_acc_opencl_config.hist_h2d || NULL != c_dbcsr_acc_opencl_config.hist_d2h ||
NULL != c_dbcsr_acc_opencl_config.hist_d2d)
{
properties[1] |= CL_QUEUE_PROFILING_ENABLE;
}
# if defined(ACC_OPENCL_XHINTS)
if ((2 & c_dbcsr_acc_opencl_config.xhints) && 0 != devinfo->intel) { /* enable queue families */
if ((2 & c_dbcsr_acc_opencl_config.xhints) && 0 != devinfo->intel) {
properties[1] |= (((ACC_OPENCL_STREAM_PROPERTIES_TYPE)1) << 31); /* CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL */
}
if ((4 & c_dbcsr_acc_opencl_config.xhints) && 0 != devinfo->intel) {
struct {
cl_command_queue_properties properties;
cl_bitfield capabilities;
cl_uint count;
char name[64 /*CL_QUEUE_FAMILY_MAX_NAME_SIZE_INTEL*/];
} intel_qfprops[16];
const int j = (0 /*terminator*/ == properties[2] ? 2 : 4);
size_t nbytes = 0, i;
if (EXIT_SUCCESS == clGetDeviceInfo(device_id, 0x418B /*CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL*/, sizeof(intel_qfprops),
intel_qfprops, &nbytes))
{
{ /* enable queue families */
for (i = 0; (i * sizeof(*intel_qfprops)) < nbytes; ++i) {
if (0 /*CL_QUEUE_DEFAULT_CAPABILITIES_INTEL*/ == intel_qfprops[i].capabilities && 1 < intel_qfprops[i].count) {
const int j = (0 /*terminator*/ == properties[2] ? 2 : 4);
properties[j + 0] = 0x418C; /* CL_QUEUE_FAMILY_INTEL */
properties[j + 1] = (int)i;
properties[j + 2] = 0x418D; /* CL_QUEUE_INDEX_INTEL */
Expand All @@ -148,11 +156,6 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) {
}
}
# endif
if (NULL != c_dbcsr_acc_opencl_config.hist_h2d || NULL != c_dbcsr_acc_opencl_config.hist_d2h ||
NULL != c_dbcsr_acc_opencl_config.hist_d2d)
{
properties[1] = CL_QUEUE_PROFILING_ENABLE;
}
queue = ACC_OPENCL_CREATE_COMMAND_QUEUE(devinfo->context, device_id, properties, &result);
}
if (EXIT_SUCCESS == result) { /* register stream */
Expand Down
Loading