Skip to content

Commit 69a65bd

Browse files
committed
cmake changes, fix sycl nd_range and remove copy redundancy
1 parent 75f6820 commit 69a65bd

File tree

6 files changed

+92
-46
lines changed

6 files changed

+92
-46
lines changed

dl-cifar/CUDA/CMakeLists.txt

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ set(CMAKE_CXX_EXTENSIONS OFF)
3333
option(USE_SM "Specifies which streaming multiprocessor architecture to use" )
3434
option(DEVICE_TIMER "Build using Device Timer" OFF)
3535

36+
set(DEF_WL_CXX_FLAGS " ")
37+
set(DEF_GENERAL_CXX_FLAGS " -O2 ")
38+
set(DEF_COMBINED_CXX_FLAGS "${DEF_GENERAL_CXX_FLAGS} ${DEF_WL_CXX_FLAGS}")
39+
3640
set(SOURCES
3741

3842
basic-dl/lnorm_layer.cu
@@ -69,11 +73,21 @@ include_directories(${CMAKE_SOURCE_DIR}
6973
)
7074
message(STATUS "CMAKE_SOURCE_DIR: ${CMAKE_SOURCE_DIR}")
7175

72-
if("${CMAKE_CXX_FLAGS}" STREQUAL "")
73-
message(STATUS "Using DEFAULT CXX compilation flags for the application")
74-
string(APPEND CMAKE_CXX_FLAGS " -O2 -g")
75-
else()
76-
message(STATUS "OVERRIDING CXX compilation flags")
76+
# -DCMAKE_CXX_FLAGS=" -blah -blah " overrides the default flags (BOTH general and WL specific)
77+
# -DOVERRIDE_GENERAL_CXX_FLAGS=" -blah -blah " overrides the general flags only (and not the workload specific flags)
78+
# passing in both CMAKE_CXX_FLAGS and OVERRIDE_GENERAL_CXX_FLAGS is not allowed, in order to prevent ambiguity
79+
80+
if(NOT "${CMAKE_CXX_FLAGS}" STREQUAL "" AND NOT "${OVERRIDE_GENERAL_CXX_FLAGS}" STREQUAL "")
81+
message(FATAL_ERROR "Both CMAKE_CXX_FLAGS and OVERRIDE_GENERAL_CXX_FLAGS cannot be passed in together")
82+
elseif("${CMAKE_CXX_FLAGS}" STREQUAL "" AND "${OVERRIDE_GENERAL_CXX_FLAGS}" STREQUAL "")
83+
message(STATUS "Using DEFAULT compilation flags")
84+
set(CMAKE_CXX_FLAGS "${DEF_COMBINED_CXX_FLAGS}")
85+
elseif(NOT "${OVERRIDE_GENERAL_CXX_FLAGS}" STREQUAL "")
86+
message(STATUS "OVERRIDING GENERAL compilation flags")
87+
set(CMAKE_CXX_FLAGS "${OVERRIDE_GENERAL_CXX_FLAGS}")
88+
string(APPEND CMAKE_CXX_FLAGS ${DEF_WL_CXX_FLAGS})
89+
elseif(NOT "${CMAKE_CXX_FLAGS}" STREQUAL "")
90+
message(STATUS "OVERRIDING GENERAL and WORKLOAD SPECIFIC compilation flags")
7791
endif()
7892

7993
add_compile_options(-DRUN_ON_GPU)

dl-cifar/CUDA/handle.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,8 @@ class LangHandle {
9494
memCpyD2H(devPtr, hostPtr, size, needToSynchronize);
9595
} else if(memcpyType == D2D) {
9696
memCpyD2D(devPtr, hostPtr, size, needToSynchronize);
97-
}
98-
assertDevApiInvar(cudaMemcpy(devPtr, hostPtr, size, cudaMemcpyHostToDevice));
99-
if(needToSynchronize) {
100-
assertDevApiInvar(cudaDeviceSynchronize());
97+
} else {
98+
throw std::runtime_error("Unknown or unsupported MemcpyType");
10199
}
102100
}
103101

dl-cifar/HIP/CMakeLists.txt

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,11 @@ set(CMAKE_CXX_EXTENSIONS OFF)
3131
option(DEVICE_TIMER "Build using Device Timer" OFF)
3232

3333

34-
#set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} " -Wall -O3 -Wextra -D__HIP_PLATFORM_AMD__")
35-
set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} " -Wall -O3 -Wextra")
34+
35+
set(DEF_WL_CXX_FLAGS " -D__HIP_PLATFORM_AMD__ ")
36+
set(DEF_GENERAL_CXX_FLAGS " -Wall -O3 -Wextra ")
37+
set(DEF_COMBINED_CXX_FLAGS "${DEF_GENERAL_CXX_FLAGS} ${DEF_WL_CXX_FLAGS}")
38+
3639

3740
if(NOT DEFINED HIP_PATH)
3841
if(NOT DEFINED ENV{HIP_PATH})
@@ -97,11 +100,21 @@ include_directories(${CMAKE_SOURCE_DIR}
97100
)
98101

99102

100-
if("${CMAKE_CXX_FLAGS}" STREQUAL "")
101-
message(STATUS "Using DEFAULT CXX compilation flags for the application")
102-
string(APPEND CMAKE_CXX_FLAGS " -O2 -g")
103-
else()
104-
message(STATUS "OVERRIDING CXX compilation flags")
103+
# -DCMAKE_CXX_FLAGS=" -blah -blah " overrides the default flags (BOTH general and WL specific)
104+
# -DOVERRIDE_GENERAL_CXX_FLAGS=" -blah -blah " overrides the general flags only (and not the workload specific flags)
105+
# passing in both CMAKE_CXX_FLAGS and OVERRIDE_GENERAL_CXX_FLAGS is not allowed, in order to prevent ambiguity
106+
107+
if(NOT "${CMAKE_CXX_FLAGS}" STREQUAL "" AND NOT "${OVERRIDE_GENERAL_CXX_FLAGS}" STREQUAL "")
108+
message(FATAL_ERROR "Both CMAKE_CXX_FLAGS and OVERRIDE_GENERAL_CXX_FLAGS cannot be passed in together")
109+
elseif("${CMAKE_CXX_FLAGS}" STREQUAL "" AND "${OVERRIDE_GENERAL_CXX_FLAGS}" STREQUAL "")
110+
message(STATUS "Using DEFAULT compilation flags")
111+
set(CMAKE_CXX_FLAGS "${DEF_COMBINED_CXX_FLAGS}")
112+
elseif(NOT "${OVERRIDE_GENERAL_CXX_FLAGS}" STREQUAL "")
113+
message(STATUS "OVERRIDING GENERAL compilation flags")
114+
set(CMAKE_CXX_FLAGS "${OVERRIDE_GENERAL_CXX_FLAGS}")
115+
string(APPEND CMAKE_CXX_FLAGS ${DEF_WL_CXX_FLAGS})
116+
elseif(NOT "${CMAKE_CXX_FLAGS}" STREQUAL "")
117+
message(STATUS "OVERRIDING GENERAL and WORKLOAD SPECIFIC compilation flags")
105118
endif()
106119

107120
add_compile_options(-DRUN_ON_GPU)

dl-cifar/HIP/handle.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,11 +94,10 @@ class LangHandle {
9494
memCpyD2H(devPtr, hostPtr, size, needToSynchronize);
9595
} else if(memcpyType == D2D) {
9696
memCpyD2D(devPtr, hostPtr, size, needToSynchronize);
97-
}
98-
assertDevApiInvar(hipMemcpy(devPtr, hostPtr, size, hipMemcpyHostToDevice));
99-
if(needToSynchronize) {
100-
assertDevApiInvar(hipDeviceSynchronize());
101-
}
97+
} else {
98+
throw std::runtime_error("Unknown or unsupported MemcpyType");
99+
}
100+
102101
}
103102

104103
void memCpyH2D(float* devPtr, const float* hostPtr, size_t size, bool needToSynchronize) {

dl-cifar/SYCL/CMakeLists.txt

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,41 @@ option(USE_NVIDIA_BACKEND "Build for NVIDIA backend" OFF)
3434
option(USE_AMDHIP_BACKEND "Build for AMD HIP backend" OFF)
3535
option(DEVICE_TIMER "Build using Device Timer" OFF)
3636

37-
set(INTEL_GPU_CXX_FLAGS " -O3 -fsycl -ffast-math -DMKL_ILP64")
38-
set(NVIDIA_GPU_CXX_FLAGS " -O3 -fsycl -ffast-math -DUSE_CUBLAS")
39-
set(AMD_GPU_CXX_FLAGS " -O3 -fsycl -ffast-math -DUSE_ROCBLAS -D__HIP_PLATFORM_AMD__")
37+
set(DEF_INTEL_WL_CXX_FLAGS " -DMKL_ILP64 ")
38+
set(DEF_NVIDIA_WL_CXX_FLAGS " -DUSE_CUBLAS ")
39+
set(DEF_AMD_WL_CXX_FLAGS " -DUSE_ROCBLAS -D__HIP_PLATFORM_AMD__ ")
40+
41+
set(DEF_INTEL_GENERAL_CXX_FLAGS " -O3 -fsycl -ffast-math ")
42+
set(DEF_NVIDIA_GENERAL_CXX_FLAGS " -O3 -fsycl -ffast-math ")
43+
set(DEF_AMD_GENERAL_CXX_FLAGS " -O3 -fsycl -ffast-math ")
44+
45+
46+
# -DCMAKE_CXX_FLAGS=" -blah -blah " overrides the default flags (BOTH general and WL specific)
47+
# -DOVERRIDE_GENERAL_CXX_FLAGS=" -blah -blah " overrides the general flags only (and not the workload specific flags)
48+
# passing in both CMAKE_CXX_FLAGS and OVERRIDE_GENERAL_CXX_FLAGS is not allowed, in order to prevent ambiguity
49+
50+
#set(USE_DEFAULT_FLAGS OFF)
51+
if(NOT "${CMAKE_CXX_FLAGS}" STREQUAL "" AND NOT "${OVERRIDE_GENERAL_CXX_FLAGS}" STREQUAL "")
52+
message(FATAL_ERROR "Both CMAKE_CXX_FLAGS and OVERRIDE_GENERAL_CXX_FLAGS cannot be passed in together")
53+
elseif("${CMAKE_CXX_FLAGS}" STREQUAL "" AND "${OVERRIDE_GENERAL_CXX_FLAGS}" STREQUAL "")
54+
message(STATUS "Using DEFAULT compilation flags")
55+
set(INTEL_GPU_CXX_FLAGS "${DEF_INTEL_GENERAL_CXX_FLAGS} ${DEF_INTEL_WL_CXX_FLAGS}")
56+
set(NVIDIA_GPU_CXX_FLAGS "${DEF_NVIDIA_GENERAL_CXX_FLAGS} ${DEF_NVIDIA_WL_CXX_FLAGS}")
57+
set(AMD_GPU_CXX_FLAGS "${DEF_AMD_GENERAL_CXX_FLAGS} ${DEF_AMD_WL_CXX_FLAGS}")
58+
elseif(NOT "${OVERRIDE_GENERAL_CXX_FLAGS}" STREQUAL "")
59+
message(STATUS "OVERRIDING GENERAL compilation flags")
60+
set(INTEL_GPU_CXX_FLAGS "${OVERRIDE_GENERAL_CXX_FLAGS} ${DEF_INTEL_WL_CXX_FLAGS}")
61+
set(NVIDIA_GPU_CXX_FLAGS "${OVERRIDE_GENERAL_CXX_FLAGS} ${DEF_NVIDIA_WL_CXX_FLAGS}")
62+
set(AMD_GPU_CXX_FLAGS "${OVERRIDE_GENERAL_CXX_FLAGS} ${DEF_AMD_WL_CXX_FLAGS}")
63+
elseif(NOT "${CMAKE_CXX_FLAGS}" STREQUAL "")
64+
message(STATUS "OVERRIDING GENERAL and WORKLOAD SPECIFIC compilation flags")
65+
set(INTEL_GPU_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
66+
set(NVIDIA_GPU_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
67+
set(AMD_GPU_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
68+
endif()
69+
70+
71+
4072

4173
if(NOT DEFINED DNNLROOT AND DEFINED ENV{DNNLROOT})
4274
set(DNNLROOT "$ENV{DNNLROOT}" CACHE STRING "")
@@ -119,13 +151,7 @@ message(STATUS "DNNLROOT set to: ${DNNLROOT}")
119151
# -DCMAKE_CXX_FLAGS=" -blah -blah " overrides the default flags
120152

121153

122-
set(USE_DEFAULT_FLAGS ON)
123-
if("${CMAKE_CXX_FLAGS}" STREQUAL "")
124-
message(STATUS "Using DEFAULT compilation flags")
125-
else()
126-
message(STATUS "OVERRIDING DEFAULT compilation flags")
127-
set(USE_DEFAULT_FLAGS OFF)
128-
endif()
154+
129155

130156
add_compile_options(-DRUN_ON_GPU)
131157
#add_compile_options(-DKERNEL_USE_PROFILE "0")
@@ -148,9 +174,7 @@ endif()
148174
# JIT compilation
149175
if(GPU_AOT)
150176
message(STATUS "Enabling INTEL backend")
151-
if(USE_DEFAULT_FLAGS)
152-
set(CMAKE_CXX_FLAGS "${INTEL_GPU_CXX_FLAGS}") # Default flags for Intel backend
153-
endif()
177+
set(CMAKE_CXX_FLAGS "${INTEL_GPU_CXX_FLAGS}")
154178
if( (${GPU_AOT} STREQUAL "pvc") OR (${GPU_AOT} STREQUAL "PVC") )
155179
message(STATUS "Enabling Intel GPU AOT compilation for ${GPU_AOT}")
156180
string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=spir64_gen -Xs \"-device 0x0bd5 -revision_id 3\" ")
@@ -160,19 +184,17 @@ if(GPU_AOT)
160184
endif()
161185
elseif(USE_NVIDIA_BACKEND)
162186
message(STATUS "Enabling NVIDIA backend")
163-
if(USE_DEFAULT_FLAGS)
164-
set(CMAKE_CXX_FLAGS "${NVIDIA_GPU_CXX_FLAGS}") # Default flags for NV backend
165-
endif()
187+
set(CMAKE_CXX_FLAGS "${NVIDIA_GPU_CXX_FLAGS}")
166188
string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=nvidia_gpu_sm_${USE_SM}")
167189
elseif(USE_AMD_BACKEND)
168190
message(STATUS "Enabling AMD backend")
169191
message(STATUS "Enabling AMD HIP backend for ${USE_AMD_ARCH} AMD architecture")
170-
if(USE_DEFAULT_FLAGS)
171-
set(CMAKE_CXX_FLAGS "${AMD_GPU_CXX_FLAGS}") # Default flags for AMD backend (gfx908 for MI100)
172-
endif()
173-
#string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=amd_gpu_gfx${USE_AMD_ARCH}")
174-
#string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${USE_AMDHIP_BACKEND} ")
192+
set(CMAKE_CXX_FLAGS "${AMD_GPU_CXX_FLAGS}")
175193
string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${USE_AMD_ARCH} ")
194+
else()
195+
# JIT case
196+
message(STATUS "Enabling INTEL backend")
197+
set(CMAKE_CXX_FLAGS "${INTEL_GPU_CXX_FLAGS}")
176198
endif()
177199

178200
# Output the compiler flags that were constructed for visual inspection

dl-cifar/SYCL/basic-dl/lnorm_layer.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@ void LNormLayer::doFw() {
5757
float *d_output = d_output_;
5858

5959
langHandle_->getSyclQueue()->submit([&](sycl::handler& h) {
60-
h.parallel_for(sycl::range{static_cast<size_t>(gridSize), static_cast<size_t>(blockSize)}, [=](sycl::id<2> idx) {
60+
h.parallel_for(sycl::nd_range<1>{sycl::range<1>(static_cast<size_t>(gridSize*blockSize)), sycl::range<1>(static_cast<size_t>(blockSize))}, [=](sycl::nd_item<1> idx) {
6161

62-
int offset = (idx[0]*128+idx[1]) * embSize;
62+
int offset = (idx.get_global_linear_id()) * embSize;
6363

6464
float epsilon = 0.0000001;
6565
if(offset < batchSize * noOfEmbs * embSize) {
@@ -100,9 +100,9 @@ void LNormLayer::doBw() {
100100
float *d_d_input = d_d_input_;
101101

102102
langHandle_->getSyclQueue()->submit([&](sycl::handler& h) {
103-
h.parallel_for(sycl::range{static_cast<size_t>(gridSize), static_cast<size_t>(blockSize)}, [=](sycl::id<2> idx) {
103+
h.parallel_for(sycl::nd_range<1>{sycl::range<1>(static_cast<size_t>(gridSize*blockSize)), sycl::range<1>(static_cast<size_t>(blockSize))}, [=](sycl::nd_item<1> idx) {
104104

105-
int offset = (idx[0]*128+idx[1]) * embSize;
105+
int offset = (idx.get_global_linear_id()) * embSize;
106106

107107
float epsilon = 0.0000001;
108108
if(offset < batchSize * noOfEmbs * embSize) {

0 commit comments

Comments
 (0)