Skip to content

Commit 8b188b1

Browse files
authored
Merge pull request #65 from oneapi-src/dev_lc0_interop_fix
[Lc0][SYCL][Nvidia] Updated interop calls and build system.
2 parents f6b03df + 8a90fad commit 8b188b1

File tree

3 files changed

+127
-48
lines changed

3 files changed

+127
-48
lines changed

lc0/meson.build

Lines changed: 87 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -264,20 +264,47 @@ if get_option('USE_SYCL')
264264
mlink_args = ['-fsycl']
265265
has_backends = true
266266
message('Building SYCL')
267-
add_project_arguments('-O3', language : 'cpp')
268-
add_project_arguments('-fsycl', language : 'cpp')
269-
add_project_arguments('-ffast-math', language : 'cpp')
270-
add_project_arguments('-fsycl-unnamed-lambda', language : 'cpp')
271-
add_project_arguments('-Wall', language : 'cpp')
272-
add_project_arguments('-Wextra', language : 'cpp')
273267

274268
files += 'src/neural/sycl/layers.cc.dp.cpp'
275269
files += 'src/neural/sycl/network_sycl.cc.dp.cpp'
276270
files += 'src/neural/sycl/common_kernels.dp.cpp'
277271

272+
273+
DEF_INTEL_GENERAL_CXX_FLAGS = ['-O3','-fsycl','-ffast-math','-fsycl-unnamed-lambda','-Wall', '-Wextra']
274+
DEF_INTEL_WL_CXX_FLAGS = ['-DDEFAULT_MINIBATCH_SIZE=248', '-DMKL_ILP64']
275+
DEF_AMD_GENERAL_CXX_FLAGS = ['-O3','-fsycl','-ffast-math','-fsycl-unnamed-lambda','-Wall', '-Wextra']
276+
DEF_AMD_WL_CXX_FLAGS = ['-DUSE_HIPBLAS', '-DINLINE', '-D__HIP_PLATFORM_AMD__']
277+
DEF_NVIDIA_GENERAL_CXX_FLAGS = ['-O3','-fsycl','-ffast-math','-fsycl-unnamed-lambda','-Wall', '-Wextra']
278+
DEF_NVIDIA_WL_CXX_FLAGS=['-DUSE_CUBLAS', '-DINLINE', '-DNVIDIABE']
279+
280+
281+
if(get_option('CMAKE_CXX_FLAGS') != [] and get_option('OVERRIDE_GENERAL_CXX_FLAGS') != [])
282+
message('Both CMAKE_CXX_FLAGS and OVERRIDE_GENERAL_CXX_FLAGS cannot be passed in together')
283+
elif(get_option('CMAKE_CXX_FLAGS')== [] and get_option('OVERRIDE_GENERAL_CXX_FLAGS') == [])
284+
message('Using DEFAULT compilation flags')
285+
INTEL_GPU_CXX_FLAGS = DEF_INTEL_GENERAL_CXX_FLAGS + DEF_INTEL_WL_CXX_FLAGS
286+
NVIDIA_GPU_CXX_FLAGS = DEF_NVIDIA_GENERAL_CXX_FLAGS + DEF_NVIDIA_WL_CXX_FLAGS
287+
AMD_GPU_CXX_FLAGS = DEF_AMD_GENERAL_CXX_FLAGS + DEF_AMD_WL_CXX_FLAGS
288+
elif(get_option('OVERRIDE_GENERAL_CXX_FLAGS') !=[])
289+
message('OVERRIDING GENERAL compilation flags')
290+
INTEL_GPU_CXX_FLAGS = get_option('OVERRIDE_GENERAL_CXX_FLAGS') + DEF_INTEL_WL_CXX_FLAGS
291+
NVIDIA_GPU_CXX_FLAGS = get_option('OVERRIDE_GENERAL_CXX_FLAGS') + DEF_NVIDIA_WL_CXX_FLAGS
292+
AMD_GPU_CXX_FLAGS = get_option('OVERRIDE_GENERAL_CXX_FLAGS') + DEF_AMD_WL_CXX_FLAGS
293+
elif(get_option('CMAKE_CXX_FLAGS') != [])
294+
message('OVERRIDING GENERAL and WORKLOAD SPECIFIC compilation flags')
295+
INTEL_GPU_CXX_FLAGS = get_option('CMAKE_CXX_FLAGS')
296+
NVIDIA_GPU_CXX_FLAGS = get_option('CMAKE_CXX_FLAGS')
297+
AMD_GPU_CXX_FLAGS = get_option('CMAKE_CXX_FLAGS')
298+
endif
299+
300+
INTEL_GPU_CXX_FLAGS += [get_option('GPU_AOT')]
301+
NVIDIA_GPU_CXX_FLAGS += ['-fsycl-targets=nvidia_gpu_sm_' + get_option('USE_SM')]
302+
AMD_GPU_CXX_FLAGS += ['-fsycl-targets=amd_gpu_gfx' + get_option('USE_SM')]
303+
304+
278305
if(get_option('USE_L0_BACKEND') == true)
279306
message('Building SYCL for the L0 backend')
280-
add_project_arguments('-DMKL_ILP64', language : 'cpp')
307+
add_project_arguments(INTEL_GPU_CXX_FLAGS, language : 'cpp')
281308
deps += cc.find_library('sycl', required: true)
282309
deps += cc.find_library('mkl_sycl', required: true)
283310
deps += cc.find_library('mkl_intel_ilp64', required: true)
@@ -286,39 +313,48 @@ if get_option('USE_SYCL')
286313
deps += cc.find_library('OpenCL', required: true)
287314
deps += cc.find_library('dl', required: true)
288315
deps += cc.find_library('m', required: true)
289-
add_project_arguments('-DDEFAULT_MINIBATCH_SIZE=248', language : 'cpp')
290-
add_project_arguments(get_option('GPU_AOT'), language : 'cpp')
291-
mlink_args += get_option('GPU_AOT')
316+
mlink_args += INTEL_GPU_CXX_FLAGS
292317
elif (get_option('USE_AMD_BACKEND') == true)
293318
message('Building SYCL for AMD backend')
294-
sm_level = 'amd_gpu_' + get_option('USE_SM')
295-
add_project_arguments('-fsycl-targets=' + sm_level , language : 'cpp')
296-
add_project_arguments('-DUSE_HIPBLAS', language : 'cpp')
297-
add_project_arguments('-D__HIP_PLATFORM_AMD__', language : 'cpp')
298-
add_project_arguments('-DINLINE', language : 'cpp')
319+
add_project_arguments(AMD_GPU_CXX_FLAGS, language : 'cpp')
299320
hip_blas = cc.find_library('hipblas', required: true)
300321
hip_dart = cc.find_library('amdhip64', required: true)
301322
deps += [hip_blas, hip_dart]
302323
deps += cc.find_library('sycl', required: true)
303-
mlink_args+= ['-fsycl', '-fsycl-targets=' + sm_level]
324+
mlink_args+= AMD_GPU_CXX_FLAGS
304325
else
305-
sm_level = 'nvidia_gpu_sm_' + get_option('USE_SM')
306326
message('Building SYCL for the NVIDIA backend')
307-
add_project_arguments('-fsycl-targets=' + sm_level, language : 'cpp')
308-
add_project_arguments('-DUSE_CUBLAS', language : 'cpp')
309-
add_project_arguments('-DINLINE', language : 'cpp')
310-
add_project_arguments('-DNVIDIABE', language : 'cpp')
327+
add_project_arguments(NVIDIA_GPU_CXX_FLAGS, language : 'cpp')
311328
cu_blas = cc.find_library('cublas', required: true)
312329
cu_dart = cc.find_library('cudart', required: true)
313-
deps += [cu_blas, cu_dart]
330+
cu_da = cc.find_library('cuda', required: true)
331+
deps += [cu_blas, cu_dart, cu_da]
314332
deps += cc.find_library('sycl', required: true)
315333
deps += cc.find_library('pthread', required: true)
316-
mlink_args+= ['-fsycl', '-fsycl-targets=' + sm_level]
334+
mlink_args+= NVIDIA_GPU_CXX_FLAGS
317335
endif
318-
319-
#message('Using link arguments ' + mlink_args)
320-
executable('lc0_sycl', 'src/main.cc', files, include_directories: includes, dependencies: deps, install: true, link_args : mlink_args)
336+
337+
executable('lc0_sycl', 'src/main.cc', files, include_directories: includes, dependencies: deps, install: true, link_args : mlink_args)
338+
321339
elif get_option('USE_CUDA')
340+
341+
DEF_WL_CXX_FLAGS = ['-Xcompiler', '-fPIC']
342+
DEF_GENERAL_CXX_FLAGS = ['-O2']
343+
DEF_COMBINED_CXX_FLAGS = DEF_WL_CXX_FLAGS + DEF_GENERAL_CXX_FLAGS
344+
345+
if(get_option('CMAKE_CXX_FLAGS') != [] and get_option('OVERRIDE_GENERAL_CXX_FLAGS') != [])
346+
message('Both CMAKE_CXX_FLAGS and OVERRIDE_GENERAL_CXX_FLAGS cannot be passed in together')
347+
elif(get_option('CMAKE_CXX_FLAGS')== [] and get_option('OVERRIDE_GENERAL_CXX_FLAGS') == [])
348+
message('Using DEFAULT compilation flags')
349+
CMAKE_CXX_FLAGS = DEF_COMBINED_CXX_FLAGS
350+
elif(get_option('OVERRIDE_GENERAL_CXX_FLAGS') !=[])
351+
message('OVERRIDING GENERAL compilation flags')
352+
CMAKE_CXX_FLAGS = get_option('OVERRIDE_GENERAL_CXX_FLAGS') + DEF_WL_CXX_FLAGS
353+
elif(get_option('CMAKE_CXX_FLAGS') != [])
354+
message('OVERRIDING GENERAL and WORKLOAD SPECIFIC compilation flags')
355+
endif
356+
357+
322358
cudnn_libdirs = get_option('cudnn_libdirs')
323359
cu_blas = cc.find_library('cublas', dirs: cudnn_libdirs, required: false)
324360
cu_dnn = cc.find_library('cudnn', dirs: cudnn_libdirs, required: false)
@@ -356,7 +392,8 @@ elif get_option('USE_CUDA')
356392
cuda_arguments += ['-Xcompiler', '-MD']
357393
endif
358394
else
359-
cuda_arguments += ['--std=c++14', '-Xcompiler', '-fPIC']
395+
cuda_arguments += CMAKE_CXX_FLAGS
396+
#cuda_arguments += ['--std=c++14', '-Xcompiler', '-fPIC']
360397
endif
361398
if get_option('nvcc_ccbin') != ''
362399
cuda_arguments += ['-ccbin=' + get_option('nvcc_ccbin')]
@@ -374,6 +411,7 @@ elif get_option('USE_CUDA')
374411
else
375412
outputname = '@[email protected]'
376413
endif
414+
nvcc_extra_args += get_option('CUDA_NVCC_FLAGS')
377415
files += cuda_files
378416
files += custom_target('cuda fp32 code',
379417
input : 'src/neural/cuda/common_kernels.cu',
@@ -383,18 +421,19 @@ elif get_option('USE_CUDA')
383421
)
384422

385423
# Handling of fp16 cuda code.
386-
nvcc_arch = '-arch=compute_' + get_option('USE_SM')
424+
#nvcc_arch = '-arch=compute_' + get_option('USE_SM')
387425
nvcc_sm_list = ['sm_' + get_option('USE_SM')]
388426
# Ignore the given CC for fp16 when it is not in the supported list.
389427
if cuda_cc == '' or not nvcc_sm_list.contains('sm_' + cuda_cc)
390-
nvcc_extra_args = [nvcc_arch]
428+
nvcc_extra_args = []
391429
nvcc_help = run_command(nvcc, '-h').stdout()
392430
foreach x : nvcc_sm_list
393431
if nvcc_help.contains(x)
394-
nvcc_extra_args += '-code=' + x
432+
nvcc_extra_args += '-arch=' + x
395433
endif
396434
endforeach
397435
endif
436+
nvcc_extra_args += get_option('CUDA_NVCC_FLAGS')
398437
files += custom_target('cuda fp16 code',
399438
input : 'src/neural/cuda/fp16_kernels.cu',
400439
output : outputname,
@@ -411,8 +450,23 @@ elif get_option('USE_AMD')
411450
files += 'src/neural/amd/network_amd.cpp'
412451
files += 'src/neural/amd/common_kernels.cpp'
413452

414-
add_project_arguments('-D__HIP_PLATFORM_AMD__', language : 'cpp')
415-
add_project_arguments('-O3', language : 'cpp')
453+
DEF_WL_CXX_FLAGS = ['-D__HIP_PLATFORM_AMD__']
454+
DEF_GENERAL_CXX_FLAGS = ['-O3']
455+
DEF_COMBINED_CXX_FLAGS = DEF_WL_CXX_FLAGS + DEF_GENERAL_CXX_FLAGS
456+
457+
if(get_option('CMAKE_CXX_FLAGS') != [] and get_option('OVERRIDE_GENERAL_CXX_FLAGS') != [])
458+
message('Both CMAKE_CXX_FLAGS and OVERRIDE_GENERAL_CXX_FLAGS cannot be passed in together')
459+
elif(get_option('CMAKE_CXX_FLAGS')== [] and get_option('OVERRIDE_GENERAL_CXX_FLAGS') == [])
460+
message('Using DEFAULT compilation flags')
461+
CMAKE_CXX_FLAGS = DEF_COMBINED_CXX_FLAGS
462+
elif(get_option('OVERRIDE_GENERAL_CXX_FLAGS') != [])
463+
message('OVERRIDING GENERAL compilation flags')
464+
CMAKE_CXX_FLAGS = get_option('OVERRIDE_GENERAL_CXX_FLAGS') + DEF_WL_CXX_FLAGS
465+
elif(get_option('CMAKE_CXX_FLAGS') != [])
466+
message('OVERRIDING GENERAL and WORKLOAD SPECIFIC compilation flags')
467+
endif
468+
469+
add_project_arguments(CMAKE_CXX_FLAGS, language : 'cpp')
416470

417471
hip_blas_lib = cc.find_library('hipblas', required: true)
418472
hip_blas_runtime = cc.find_library('hipblas', required: true)
@@ -427,5 +481,6 @@ else
427481
endif
428482

429483

484+
430485

431486

lc0/meson_options.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,3 +217,18 @@ option('onnx_include',
217217
type: 'string',
218218
value: '',
219219
description: 'Paths to ONNX runtime includes')
220+
221+
option('CMAKE_CXX_FLAGS',
222+
type: 'array',
223+
value: [],
224+
description: 'Override C++ compiler options used by nvcc, clang, and icx.')
225+
226+
option('OVERRIDE_GENERAL_CXX_FLAGS',
227+
type: 'array',
228+
value: [],
229+
description: 'Override C++ compiler general options used by nvcc, clang, and icx.')
230+
231+
option('CUDA_NVCC_FLAGS',
232+
type: 'array',
233+
value: [],
234+
description: 'Override general nvcc flags.')

lc0/src/neural/sycl/layers.cc.dp.cpp

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,8 @@ void SELayer<float>::Eval(int N, float* output, const float* input,
275275

276276
cgh.host_task([=](sycl::interop_handle ih) {
277277

278-
auto cudaStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_cuda>(sycl_queue_);
278+
cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
279+
auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
279280
cublasSetStream(handle, cudaStreamHandle);
280281

281282
ReportCUBLASErrors(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, numFc1Out_,
@@ -332,7 +333,8 @@ void SELayer<float>::Eval(int N, float* output, const float* input,
332333

333334
cgh.host_task([=](sycl::interop_handle ih) {
334335

335-
auto cudaStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_cuda>(sycl_queue_);
336+
cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
337+
auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
336338
cublasSetStream(handle, cudaStreamHandle);
337339

338340
ReportCUBLASErrors(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, 2 * C, N,
@@ -595,7 +597,8 @@ void FCLayer<float>::Eval(int N, float* output_tensor,
595597

596598
cgh.host_task([=](sycl::interop_handle ih) {
597599

598-
auto cudaStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_cuda>(sycl_queue_);
600+
cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
601+
auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
599602
cublasSetStream(handle, cudaStreamHandle);
600603

601604

@@ -965,8 +968,9 @@ template <> void BaseLayer<float>::cublasRowMajorMatrixMul(const float* A, const
965968
sycl_queue_.submit([&](sycl::handler &cgh) {
966969
//auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
967970
cgh.host_task([=](sycl::interop_handle ih) {
968-
auto cudaStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_cuda>(sycl_queue_);
969-
cublasSetStream(handle, cudaStreamHandle);
971+
cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
972+
auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
973+
cublasSetStream(handle, cudaStreamHandle);
970974

971975
ReportCUBLASErrors(cublasGemmStridedBatchedEx(
972976
handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &floatOne, B, CUDA_R_32F, N,
@@ -1022,8 +1026,9 @@ template <> void BaseLayer<float>::cublasRowMajorMatrixMul(const float* A, const
10221026
sycl_queue_.submit([&](sycl::handler &cgh) {
10231027
//auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
10241028
cgh.host_task([=](sycl::interop_handle ih) {
1025-
auto cudaStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_cuda>(sycl_queue_);
1026-
cublasSetStream(handle, cudaStreamHandle);
1029+
cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
1030+
auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
1031+
cublasSetStream(handle, cudaStreamHandle);
10271032

10281033
// Much slower on RTX 2060.. why? Maybe a cublas bug :-/
10291034
ReportCUBLASErrors(cublasSgemmStridedBatched( handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &floatOne, B, N, N * K, A, K,
@@ -1268,11 +1273,12 @@ void Conv1Layer<float>::cublasSpecialMatrixMul(const float* A, const float* B,
12681273
//auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
12691274
cgh.host_task([=](sycl::interop_handle ih) {
12701275

1271-
auto cudaStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_cuda>(sycl_queue_);
1272-
cublasSetStream(handle, cudaStreamHandle);
1276+
cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
1277+
auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
1278+
cublasSetStream(handle, cudaStreamHandle);
12731279

12741280

1275-
ReportCUBLASErrors(cublasGemmStridedBatchedEx(
1281+
ReportCUBLASErrors(cublasGemmStridedBatchedEx(
12761282
handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &floatOne, B, CUDA_R_32F, N,
12771283
N * K, A, CUDA_R_32F, K, 0, &floatZero, Out, CUDA_R_32F, N, N * M,
12781284
batchSize, CUDA_R_32F, CUBLAS_GEMM_DEFAULT));
@@ -1330,8 +1336,9 @@ void Conv1Layer<float>::cublasSpecialMatrixMul(const float* A, const float* B,
13301336
//auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
13311337
cgh.host_task([=](sycl::interop_handle ih) {
13321338

1333-
auto cudaStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_cuda>(sycl_queue_);
1334-
cublasSetStream(handle, cudaStreamHandle);
1339+
cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
1340+
auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
1341+
cublasSetStream(handle, cudaStreamHandle);
13351342

13361343
// Much slower on RTX 2060.. why? Maybe a cublas bug :-/
13371344
ReportCUBLASErrors(cublasSgemmStridedBatched(
@@ -1854,8 +1861,9 @@ static void cublasXgemm(transpose_type transa,
18541861

18551862
cgh.host_task([=](sycl::interop_handle ih) {
18561863

1857-
auto cudaStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_cuda>(sycl_queue);
1858-
cublasSetStream(handle, cudaStreamHandle);
1864+
cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
1865+
auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
1866+
cublasSetStream(handle, cudaStreamHandle);
18591867

18601868
ReportCUBLASErrors(cublasSgemm(handle, transa, transb, m, n, k, &alpha,
18611869
(const float*)A, lda, (const float*)B, ldb,
@@ -1941,8 +1949,9 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran
19411949

19421950
cgh.host_task([=](sycl::interop_handle ih) {
19431951

1944-
auto cudaStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_cuda>(sycl_queue);
1945-
cublasSetStream(handle, cudaStreamHandle);
1952+
cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
1953+
auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
1954+
cublasSetStream(handle, cudaStreamHandle);
19461955

19471956
ReportCUBLASErrors(cublasGemmStridedBatchedEx(
19481957
handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA, B,

0 commit comments

Comments (0)