Skip to content

Commit ce75720

Browse files
Merge branch 'main' into maxime/imm-cmd-list-support
2 parents 104b5a9 + c311fe8 commit ce75720

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

71 files changed

+1531
-380
lines changed

.github/workflows/e2e_nightly.yml

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ jobs:
1111
strategy:
1212
matrix:
1313
adapter: [
14-
{name: CUDA}
14+
{name: CUDA, str_name: cuda, prefix: "ext_oneapi_", config: "--cuda --hip", unit: "gpu"},
15+
{name: OPENCL, str_name: opencl, prefix: "", config: "", unit: "cpu"}
1516
]
1617
build_type: [Release]
1718
compiler: [{c: clang, cxx: clang++}]
@@ -59,12 +60,18 @@ jobs:
5960
run: LD_LIBRARY_PATH=${{github.workspace}}/dpcpp_compiler/lib
6061
cmake --build ${{github.workspace}}/ur-repo/build -j $(nproc)
6162

62-
- name: Set env vars & pre setup
63+
- name: Set prefer UR
64+
run: echo "SYCL_PREFER_UR=1" >> $GITHUB_ENV
65+
66+
- name: Set CUDA env vars
67+
if: matrix.adapter.name == 'CUDA'
6368
run: |
64-
echo "SYCL_PREFER_UR=1" >> $GITHUB_ENV
6569
echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV
6670
echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
67-
source /opt/intel/oneapi/setvars.sh
71+
72+
- name: Run pre setup
73+
run: |
74+
source /opt/intel/oneapi/setvars.sh --force
6875
sycl-ls
6976
7077
- name: Configure SYCL
@@ -73,7 +80,7 @@ jobs:
7380
-t ${{matrix.build_type}}
7481
-o ${{github.workspace}}/sycl_build
7582
--cmake-gen "Unix Makefiles"
76-
--ci-defaults --cuda --hip
83+
--ci-defaults ${{matrix.adapter.config}}
7784
--cmake-opt="-DLLVM_INSTALL_UTILS=ON"
7885
--cmake-opt="-DSYCL_PI_TESTS=OFF"
7986
--cmake-opt=-DCMAKE_C_COMPILER_LAUNCHER=ccache
@@ -91,7 +98,7 @@ jobs:
9198
- name: Swap UR loader and adapters
9299
run: |
93100
cp ${{github.workspace}}/ur-repo/build/lib/libur_loader.so* ${{github.workspace}}/sycl_build/lib/
94-
cp ${{github.workspace}}/ur-repo/build/lib/libur_adapter_cuda.so* ${{github.workspace}}/sycl_build/lib/
101+
cp ${{github.workspace}}/ur-repo/build/lib/libur_adapter_${{matrix.adapter.str_name}}.so* ${{github.workspace}}/sycl_build/lib/
95102
96103
- name: Set additional env. vars
97104
run: |
@@ -110,7 +117,7 @@ jobs:
110117
-GNinja
111118
-B ${{github.workspace}}/build-e2e/
112119
-S ${{github.workspace}}/sycl-repo/sycl/test-e2e/
113-
-DSYCL_TEST_E2E_TARGETS="ext_oneapi_cuda:gpu"
120+
-DSYCL_TEST_E2E_TARGETS="${{matrix.adapter.prefix}}${{matrix.adapter.str_name}}:${{matrix.adapter.unit}}"
114121
-DCMAKE_CXX_COMPILER="$(which clang++)"
115122
-DLLVM_LIT="${{github.workspace}}/sycl-repo/llvm/utils/lit/lit.py"
116123

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace
4747
set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable")
4848
set(UR_SYCL_LIBRARY_DIR "" CACHE PATH
4949
"Path of the SYCL runtime library directory")
50+
option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF)
51+
52+
include(Assertions)
5053

5154
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
5255
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)

README.md

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,6 @@
66
[![Coverity](https://scan.coverity.com/projects/28213/badge.svg)](https://scan.coverity.com/projects/oneapi-src-unified-runtime)
77
[![codecov.io](https://codecov.io/github/oneapi-src/unified-runtime/coverage.svg?branch=main)](https://codecov.io/github/oneapi-src/unified-runtime?branch=master)
88

9-
## Adapters
10-
Adapter implementations for Unified Runtime currently reside in the [SYCL repository](https://github.com/intel/llvm/tree/sycl/sycl/plugins/unified_runtime/ur). This branch contains scripts to automatically
11-
fetch and build them directly in the UR tree. The adapters are disabled by default,
12-
see cmake options for details.
13-
149
<!-- TODO: add general description and purpose of the project -->
1510

1611
## Table of contents

cmake/Assertions.cmake

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# From the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2+
# See https://llvm.org/LICENSE.txt for license information.
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4+
5+
# This is lifted from llvm's LLVM_ENABLE_ASSERTIONS implementation
6+
# https://github.com/llvm/llvm-project/blob/6be0e979896f7dd610abf263f845c532f1be3762/llvm/cmake/modules/HandleLLVMOptions.cmake#L89
7+
if(UR_ENABLE_ASSERTIONS)
8+
# MSVC doesn't like _DEBUG on release builds
9+
if( NOT MSVC )
10+
add_compile_definitions(_DEBUG)
11+
endif()
12+
# On non-Debug builds cmake automatically defines NDEBUG, so we
13+
# explicitly undefine it:
14+
if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
15+
add_compile_options($<$<OR:$<COMPILE_LANGUAGE:C>,$<COMPILE_LANGUAGE:CXX>>:-UNDEBUG>)
16+
if (MSVC)
17+
# Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines.
18+
foreach (flags_var_to_scrub
19+
CMAKE_CXX_FLAGS_RELEASE
20+
CMAKE_CXX_FLAGS_RELWITHDEBINFO
21+
CMAKE_CXX_FLAGS_MINSIZEREL
22+
CMAKE_C_FLAGS_RELEASE
23+
CMAKE_C_FLAGS_RELWITHDEBINFO
24+
CMAKE_C_FLAGS_MINSIZEREL)
25+
string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " "
26+
"${flags_var_to_scrub}" "${${flags_var_to_scrub}}")
27+
endforeach()
28+
endif()
29+
endif()
30+
endif()

include/ur_api.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4037,6 +4037,9 @@ urProgramCreateWithIL(
40374037
///
40384038
/// @details
40394039
/// - The application may call this function from simultaneous threads.
4040+
/// - Following a successful call to this entry point, `phProgram` will
4041+
/// contain a binary of type ::UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT or
4042+
/// ::UR_PROGRAM_BINARY_TYPE_LIBRARY for `hDevice`.
40404043
///
40414044
/// @remarks
40424045
/// _Analogues_

scripts/core/program.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ analogue:
127127
- "**clCreateProgramWithBinary**"
128128
details:
129129
- "The application may call this function from simultaneous threads."
130+
- "Following a successful call to this entry point, `phProgram` will contain a binary of type $X_PROGRAM_BINARY_TYPE_COMPILED_OBJECT or $X_PROGRAM_BINARY_TYPE_LIBRARY for `hDevice`."
130131
params:
131132
- type: $x_context_handle_t
132133
name: hContext

source/adapters/cuda/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ add_ur_adapter(${TARGET_NAME}
2727
${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp
2828
${CMAKE_CURRENT_SOURCE_DIR}/memory.hpp
2929
${CMAKE_CURRENT_SOURCE_DIR}/memory.cpp
30+
${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.hpp
31+
${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.cpp
3032
${CMAKE_CURRENT_SOURCE_DIR}/platform.hpp
3133
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
3234
${CMAKE_CURRENT_SOURCE_DIR}/program.hpp
@@ -38,6 +40,7 @@ add_ur_adapter(${TARGET_NAME}
3840
${CMAKE_CURRENT_SOURCE_DIR}/tracing.cpp
3941
${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp
4042
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
43+
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
4144
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
4245
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.hpp
4346
)

source/adapters/cuda/command_buffer.cpp

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,91 @@ static void setCopyParams(const void *SrcPtr, const CUmemorytype_enum SrcType,
9999
Params.Depth = 1;
100100
}
101101

102+
// Helper function for enqueuing memory fills
103+
static ur_result_t enqueueCommandBufferFillHelper(
104+
ur_exp_command_buffer_handle_t CommandBuffer, void *DstDevice,
105+
const CUmemorytype_enum DstType, const void *Pattern, size_t PatternSize,
106+
size_t Size, uint32_t NumSyncPointsInWaitList,
107+
const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
108+
ur_exp_command_buffer_sync_point_t *SyncPoint) {
109+
ur_result_t Result = UR_RESULT_SUCCESS;
110+
std::vector<CUgraphNode> DepsList;
111+
UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList,
112+
SyncPointWaitList, DepsList),
113+
Result);
114+
115+
try {
116+
const size_t N = Size / PatternSize;
117+
auto Value = *static_cast<const uint32_t *>(Pattern);
118+
auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE
119+
? *static_cast<CUdeviceptr *>(DstDevice)
120+
: (CUdeviceptr)DstDevice;
121+
122+
if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) {
123+
// Create a new node
124+
CUgraphNode GraphNode;
125+
CUDA_MEMSET_NODE_PARAMS NodeParams = {};
126+
NodeParams.dst = DstPtr;
127+
NodeParams.elementSize = PatternSize;
128+
NodeParams.height = N;
129+
NodeParams.pitch = PatternSize;
130+
NodeParams.value = Value;
131+
NodeParams.width = 1;
132+
133+
UR_CHECK_ERROR(cuGraphAddMemsetNode(
134+
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
135+
DepsList.size(), &NodeParams, CommandBuffer->Device->getContext()));
136+
137+
// Get sync point and register the cuNode with it.
138+
*SyncPoint =
139+
CommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));
140+
141+
} else {
142+
// CUDA has no memset functions that allow setting values more than 4
143+
// bytes. UR API lets you pass an arbitrary "pattern" to the buffer
144+
// fill, which can be more than 4 bytes. We must break up the pattern
145+
// into 4 byte values, and set the buffer using multiple strided calls.
146+
// This means that one cuGraphAddMemsetNode call is made for every 4 bytes
147+
// in the pattern.
148+
149+
size_t NumberOfSteps = PatternSize / sizeof(uint32_t);
150+
151+
// we walk up the pattern in 4-byte steps, and call cuMemset for each
152+
// 4-byte chunk of the pattern.
153+
for (auto Step = 0u; Step < NumberOfSteps; ++Step) {
154+
// take 4 bytes of the pattern
155+
auto Value = *(static_cast<const uint32_t *>(Pattern) + Step);
156+
157+
// offset the pointer to the part of the buffer we want to write to
158+
auto OffsetPtr = DstPtr + (Step * sizeof(uint32_t));
159+
160+
// Create a new node
161+
CUgraphNode GraphNode;
162+
// Update NodeParam
163+
CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {};
164+
NodeParamsStep.dst = (CUdeviceptr)OffsetPtr;
165+
NodeParamsStep.elementSize = 4;
166+
NodeParamsStep.height = N;
167+
NodeParamsStep.pitch = PatternSize;
168+
NodeParamsStep.value = Value;
169+
NodeParamsStep.width = 1;
170+
171+
UR_CHECK_ERROR(cuGraphAddMemsetNode(
172+
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
173+
DepsList.size(), &NodeParamsStep,
174+
CommandBuffer->Device->getContext()));
175+
176+
// Get sync point and register the cuNode with it.
177+
*SyncPoint = CommandBuffer->AddSyncPoint(
178+
std::make_shared<CUgraphNode>(GraphNode));
179+
}
180+
}
181+
} catch (ur_result_t Err) {
182+
Result = Err;
183+
}
184+
return Result;
185+
}
186+
102187
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
103188
ur_context_handle_t hContext, ur_device_handle_t hDevice,
104189
const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
@@ -525,6 +610,119 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
525610
return Result;
526611
}
527612

613+
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp(
614+
ur_exp_command_buffer_handle_t hCommandBuffer, const void * /* Mem */,
615+
size_t /*Size*/, ur_usm_migration_flags_t /*Flags*/,
616+
uint32_t numSyncPointsInWaitList,
617+
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
618+
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
619+
// Prefetch cmd is not supported by Cuda Graph.
620+
// We implement it as an empty node to enforce dependencies.
621+
ur_result_t Result = UR_RESULT_SUCCESS;
622+
CUgraphNode GraphNode;
623+
624+
std::vector<CUgraphNode> DepsList;
625+
UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
626+
pSyncPointWaitList, DepsList),
627+
Result);
628+
629+
try {
630+
// Add an empty node to preserve dependencies.
631+
UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
632+
DepsList.data(), DepsList.size()));
633+
634+
// Get sync point and register the cuNode with it.
635+
*pSyncPoint =
636+
hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));
637+
638+
setErrorMessage("Prefetch hint ignored and replaced with empty node as "
639+
"prefetch is not supported by CUDA Graph backend",
640+
UR_RESULT_SUCCESS);
641+
Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC;
642+
} catch (ur_result_t Err) {
643+
Result = Err;
644+
}
645+
return Result;
646+
}
647+
648+
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp(
649+
ur_exp_command_buffer_handle_t hCommandBuffer, const void * /* Mem */,
650+
size_t /*Size*/, ur_usm_advice_flags_t /*Advice*/,
651+
uint32_t numSyncPointsInWaitList,
652+
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
653+
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
654+
// Mem-Advise cmd is not supported by Cuda Graph.
655+
// We implement it as an empty node to enforce dependencies.
656+
ur_result_t Result = UR_RESULT_SUCCESS;
657+
CUgraphNode GraphNode;
658+
659+
std::vector<CUgraphNode> DepsList;
660+
UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
661+
pSyncPointWaitList, DepsList),
662+
Result);
663+
664+
try {
665+
// Add an empty node to preserve dependencies.
666+
UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
667+
DepsList.data(), DepsList.size()));
668+
669+
// Get sync point and register the cuNode with it.
670+
*pSyncPoint =
671+
hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));
672+
673+
setErrorMessage("Memory advice ignored and replaced with empty node as "
674+
"memory advice is not supported by CUDA Graph backend",
675+
UR_RESULT_SUCCESS);
676+
Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC;
677+
} catch (ur_result_t Err) {
678+
Result = Err;
679+
}
680+
681+
return Result;
682+
}
683+
684+
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
685+
ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer,
686+
const void *pPattern, size_t patternSize, size_t offset, size_t size,
687+
uint32_t numSyncPointsInWaitList,
688+
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
689+
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
690+
auto ArgsAreMultiplesOfPatternSize =
691+
(offset % patternSize == 0) || (size % patternSize == 0);
692+
693+
auto PatternIsValid = (pPattern != nullptr);
694+
695+
auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) &&
696+
(patternSize > 0); // is a positive power of two
697+
UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid &&
698+
PatternSizeIsValid,
699+
UR_RESULT_ERROR_INVALID_SIZE);
700+
701+
auto DstDevice = std::get<BufferMem>(hBuffer->Mem).get() + offset;
702+
703+
return enqueueCommandBufferFillHelper(
704+
hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize,
705+
size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
706+
}
707+
708+
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp(
709+
ur_exp_command_buffer_handle_t hCommandBuffer, void *pPtr,
710+
const void *pPattern, size_t patternSize, size_t size,
711+
uint32_t numSyncPointsInWaitList,
712+
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
713+
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
714+
715+
auto PatternIsValid = (pPattern != nullptr);
716+
717+
auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) &&
718+
(patternSize > 0); // is a positive power of two
719+
720+
UR_ASSERT(PatternIsValid && PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE);
721+
return enqueueCommandBufferFillHelper(
722+
hCommandBuffer, pPtr, CU_MEMORYTYPE_UNIFIED, pPattern, patternSize, size,
723+
numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
724+
}
725+
528726
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
529727
ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue,
530728
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,

0 commit comments

Comments
 (0)