Skip to content

Commit 7cbbf9c

Browse files
author
Timmy
committed
Merge pull request #145 from TimmyLiu/develop
enable offline compile dtrsm kernels
2 parents 18404c0 + 1ffeb0f commit 7cbbf9c

33 files changed

+597
-2
lines changed

src/library/CMakeLists.txt

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ option( PRECOMPILE_GEMM_TRANS_CN "AutoGemm: pre-compile CN transpose cases" OFF)
9090
option( PRECOMPILE_GEMM_TRANS_CT "AutoGemm: pre-compile CT transpose cases" OFF)
9191
option( PRECOMPILE_GEMM_TRANS_CC "AutoGemm: pre-compile CC transpose cases" OFF)
9292

93+
9394
# opencl compiler version
9495
#set( PRECOMPILE_GEMM_OPENCL_VERSION "2.0" CACHE STRING "OpenCL compiler version supported by device driver." )
9596
#set_property( CACHE PRECOMPILE_GEMM_OPENCL_VERSION PROPERTY STRINGS 2.0 1.2 1.1 )
@@ -291,7 +292,25 @@ source_group(AutoGemm\\src FILES ${AUTOGEMM_SRC} ${AUTOGEMM_PRECOMPILED_KERNELS}
291292
# AutoGemm End
292293
################################################################################
293294

295+
################################################################################
296+
# BEGIN Pre Compile General (static) Kernels
297+
################################################################################
298+
# options for pre-compiling trsm kernels
299+
option( PRECOMPILE_TRSM_STRSM "pre-compile available strsm kernels" OFF )
300+
option( PRECOMPILE_TRSM_DTRSM "pre-compile available dtrsm kernels" OFF )
301+
if(PRECOMPILE_TRSM_DTRSM)
302+
add_definitions(-DCLBLAS_OFFLINE_COMPILE_DTRSM)
303+
message(STATUS "precompile DTRSM kernels.")
304+
endif()
305+
if(PRECOMPILE_TRSM_STRSM)
306+
add_definitions(-DCLBLAS_OFFLINE_COMPILE_STRSM)
307+
message(STATUS "precompile STRSM kernels. (not yet implemented)")
308+
endif()
309+
294310

311+
################################################################################
312+
# END Pre Compile General (static) Kernels
313+
################################################################################
295314

296315
set(SRC_BLAS
297316
blas/init.c
@@ -670,6 +689,40 @@ ExternalProject_Add( tplgen
670689
INSTALL_COMMAND ""
671690
)
672691

692+
################OCLBinaryGenerator
693+
if (PRECOMPILE_TRSM_DTRSM OR PRECOMPILE_TRSM_STRSM)
694+
695+
696+
ExternalProject_Add( OCLBinaryGenerator
697+
URL "${CMAKE_SOURCE_DIR}/library/tools/OCLBinaryGenerator"
698+
CMAKE_ARGS -DOPENCL_LIBRARIES=${OPENCL_LIBRARIES} -DOPENCL_INCLUDE_DIRS=${OPENCL_INCLUDE_DIRS}
699+
INSTALL_COMMAND ""
700+
)
701+
ExternalProject_Get_Property( OCLBinaryGenerator binary_dir )
702+
message(STATUS "OCLBinaryGenerator binary_dir =${binary_dir}")
703+
set( OCLBinaryGeneratorBinaryDir "${binary_dir}/staging" )
704+
705+
# OCLBinaryGenerator requires at least three inputs
706+
# 1, path to the kernel file
707+
# 2, file name
708+
# 3, output directory
709+
# 4, [optional] compiler flags
710+
# 5, [optional] targeted hardware. If this is not supplied, OCLBinaryGenerator will generate a binary for the first device on the system
711+
set( OCL_COMPILER_FLAGS " ")
712+
if( OPENCL_VERSION STREQUAL "2.0")
713+
set( OCL_COMPILER_FLAGS "-cl-std=CL2.0")
714+
endif()
715+
716+
add_custom_target( OCLBinaryGenerator_GEN )
717+
add_custom_command(TARGET OCLBinaryGenerator_GEN
718+
PRE_BUILD
719+
COMMAND ${CMAKE_COMMAND} -DOCLBinaryGeneratorBinaryDir=${OCLBinaryGeneratorBinaryDir} -DSOURCE_DIR=${CMAKE_SOURCE_DIR} -DBINARY_DIR=${CMAKE_BINARY_DIR} -DOCL_COMPILER_FLAGS=${OCL_COMPILER_FLAGS}
720+
-P "${CMAKE_SOURCE_DIR}/library/OCLBinaryGenerator.cmake"
721+
)
722+
add_dependencies( OCLBinaryGenerator_GEN OCLBinaryGenerator )
723+
724+
endif()
725+
673726
# if offline compilation is not chosen, bingen should not be built
674727
if(OPENCL_OFFLINE_BUILD_TAHITI_KERNEL OR OPENCL_OFFLINE_BUILD_HAWAII_KERNEL OR OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL)
675728
ExternalProject_Add( bingen
@@ -801,6 +854,10 @@ add_library(clBLAS
801854
)
802855
add_dependencies(clBLAS GENERATE_CLT)
803856

857+
if (PRECOMPILE_TRSM_DTRSM OR PRECOMPILE_TRSM_STRSM)
858+
add_dependencies(clBLAS OCLBinaryGenerator_GEN)
859+
endif()
860+
804861
# AutoGemm needs compiler flag to utilize pre-compiled kernels
805862
if ( ${PRECOMPILE_GEMM_ACTIVE} )
806863
set_target_properties(clBLAS PROPERTIES COMPILE_FLAGS -DAUTOGEMM_USE_PRE_COMPILED_KERNELS)

src/library/OCLBinaryGenerator.cmake

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
2+
message(STATUS "inside OCLBinaryGenerator.cmake")
3+
message(STATUS "OCLBinary.cmake SOURCE_DIR=${SOURCE_DIR}")
4+
message(STATUS "OCLBinary.cmake BINARY_DIR=${BINARY_DIR}")
5+
6+
execute_process(
7+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_lower_128_16 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
8+
)
9+
execute_process(
10+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_upper_128_16 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
11+
)
12+
execute_process(
13+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_upper_192_12 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
14+
)
15+
execute_process(
16+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
17+
)
18+
execute_process(
19+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
20+
)
21+
execute_process(
22+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
23+
)
24+
execute_process(
25+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
26+
)
27+
execute_process(
28+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
29+
)
30+
execute_process(
31+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
32+
)
33+
execute_process(
34+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
35+
)
36+
execute_process(
37+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
38+
)
39+
execute_process(
40+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
41+
)
42+
execute_process(
43+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
44+
)
45+
execute_process(
46+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
47+
)
48+
execute_process(
49+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
50+
)
51+
execute_process(
52+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
53+
)
54+
execute_process(
55+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
56+
)
57+
execute_process(
58+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
59+
)
60+
execute_process(
61+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART3_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
62+
)
63+
execute_process(
64+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART3_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
65+
)
66+
execute_process(
67+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_12_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
68+
)
69+
execute_process(
70+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_24_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
71+
)
72+
execute_process(
73+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_24_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
74+
)
75+
execute_process(
76+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_48_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
77+
)
78+
execute_process(
79+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_48_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
80+
)
81+
execute_process(
82+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_96_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
83+
)
84+
execute_process(
85+
COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_96_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS}
86+
)

src/library/blas/AutoGemm/AutoGemmTools/AutoGemmPreCompileKernels.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,7 @@ int main( int argc, char *argv[] ) {
682682
clockStart = (unsigned long long)s.tv_sec * 1000000 + (unsigned long long)s.tv_usec;
683683
#endif
684684
const int specialKernelCount = user_kernel_count;
685+
685686
totalKernelsToCompile = gemmPreCompileNum;
686687
totalKernelsToCompile *= 4;
687688
totalKernelsToCompile += specialKernelCount;
@@ -823,7 +824,7 @@ int main( int argc, char *argv[] ) {
823824
beta = 1.0;
824825
char *appendString = appendStringArray[i];
825826

826-
827+
827828
compileKernelAndWriteToFile<float>(
828829
context,
829830
clblasColumnMajor,
@@ -838,7 +839,7 @@ int main( int argc, char *argv[] ) {
838839
tileKernelSource,
839840
binaryBuildOptions,
840841
appendString);
841-
842+
842843
}
843844

844845
// for each kernel to be pre-compiled

src/library/blas/trtri/TrtriKernelSourceIncludes.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#ifndef TRTRI_SOURCE_INCLUDES_CPP
77
#define TRTRI_SOURCE_INCLUDES_CPP
88

9+
#ifndef CLBLAS_OFFLINE_COMPILE_DTRSM
910
/*mod 192 dtrsm*/
1011
#include "diag_dtrtri_upper_192_12.cpp"
1112
#include "triple_dgemm_update_192_12_R.cpp"
@@ -40,4 +41,41 @@
4041
#include "triple_dgemm_update_128_ABOVE64_PART2_L.cpp"
4142
#include "triple_dgemm_update_128_ABOVE64_PART3_L.cpp"
4243

44+
#else
45+
/*mod 192 dtrsm*/
46+
#include "diag_dtrtri_upper_192_12_bin.cpp"
47+
#include "triple_dgemm_update_192_12_R_bin.cpp"
48+
#include "triple_dgemm_update_192_24_PART1_R_bin.cpp"
49+
#include "triple_dgemm_update_192_24_PART2_R_bin.cpp"
50+
#include "triple_dgemm_update_192_48_PART1_R_bin.cpp"
51+
#include "triple_dgemm_update_192_48_PART2_R_bin.cpp"
52+
#include "triple_dgemm_update_192_96_PART1_R_bin.cpp"
53+
#include "triple_dgemm_update_192_96_PART2_R_bin.cpp"
54+
55+
/*mod 128 dtrsm*/
56+
/*upper*/
57+
#include "diag_dtrtri_upper_128_16_bin.cpp"
58+
#include "triple_dgemm_update_128_16_R_bin.cpp"
59+
#include "triple_dgemm_update_128_32_PART1_R_bin.cpp"
60+
#include "triple_dgemm_update_128_32_PART2_R_bin.cpp"
61+
#include "triple_dgemm_update_128_64_PART1_R_bin.cpp"
62+
#include "triple_dgemm_update_128_64_PART2_R_bin.cpp"
63+
#include "triple_dgemm_update_128_ABOVE64_PART1_R_bin.cpp"
64+
#include "triple_dgemm_update_128_ABOVE64_PART2_R_bin.cpp"
65+
#include "triple_dgemm_update_128_ABOVE64_PART3_R_bin.cpp"
66+
67+
/*lower*/
68+
#include "diag_dtrtri_lower_128_16_bin.cpp"
69+
#include "triple_dgemm_update_128_16_PART1_L_bin.cpp"
70+
#include "triple_dgemm_update_128_16_PART2_L_bin.cpp"
71+
#include "triple_dgemm_update_128_32_PART1_L_bin.cpp"
72+
#include "triple_dgemm_update_128_32_PART2_L_bin.cpp"
73+
#include "triple_dgemm_update_128_64_PART1_L_bin.cpp"
74+
#include "triple_dgemm_update_128_64_PART2_L_bin.cpp"
75+
#include "triple_dgemm_update_128_ABOVE64_PART1_L_bin.cpp"
76+
#include "triple_dgemm_update_128_ABOVE64_PART2_L_bin.cpp"
77+
#include "triple_dgemm_update_128_ABOVE64_PART3_L_bin.cpp"
78+
79+
#endif //CLBLAS_OFFLINE_COMPILE_DTRSM
80+
4381
#endif

src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
1212
#endif
1313

14+
1415
unsigned char *diag_dtrtri_lower_128_16_bin = 0;
1516
size_t diag_dtrtri_lower_128_16_binSize = 0;
1617

18+
1719
const char * const diag_dtrtri_lower_128_16_src = STRINGIFY(
1820
#define BLOCK_SIZE 16 \n
1921
#define NB 128 \n
@@ -165,5 +167,6 @@ for (i = BLOCK_SIZE - 2; i >= 0; i--) {\n
165167
for (i = 0; i < BLOCK_SIZE; i++)\n
166168
*(d_dinvA + i*NB + tx) = Bs[i*BLOCK_SIZE + tx]; \n
167169
}\n
170+
// end of kernel
168171
);
169172
#endif

src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,5 +146,6 @@ uint na)\n
146146
}\n
147147

148148
}\n
149+
// end of kernel
149150
);
150151
#endif

src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,5 +144,6 @@ for (i = 0; i < BLOCK_SIZE; i++)\n
144144

145145

146146
}\n
147+
// end of kernel
147148
);
148149
#endif

src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,5 +156,6 @@ Ain = Ain + offAin; \n
156156
//__syncthreads();
157157
barrier(CLK_LOCAL_MEM_FENCE); \n
158158
}\n
159+
// end of kernel
159160
);
160161
#endif

src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,5 +138,6 @@ Ain = Ain + offAin; \n
138138
//__syncthreads();
139139
barrier(CLK_LOCAL_MEM_FENCE); \n
140140
}\n
141+
// end of kernel
141142
);
142143
#endif

src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,5 +234,6 @@ barrier(CLK_LOCAL_MEM_FENCE); \n
234234
}\n
235235
}\n
236236
}\n
237+
// end of kernel
237238
);
238239
#endif

0 commit comments

Comments
 (0)