Skip to content

Commit 45ba4b3

Browse files
author
Timmy
committed
merge master branch to develop branch. please only make pull requests to develop branch
2 parents 902ccec + 0fc3d3f commit 45ba4b3

File tree

11 files changed

+165
-104
lines changed

11 files changed

+165
-104
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,6 @@
2323

2424
# vim temp files
2525
.*.swp
26+
27+
src/build/
28+

.travis.yml

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -120,15 +120,15 @@ before_script:
120120
script:
121121
- make package
122122

123-
deploy:
124-
provider: releases
125-
prerelease: true
126-
draft: true
127-
skip_cleanup: true
128-
api_key:
129-
secure: MBkxtcfSk+4UvGRO+WRhmS86vIVzAs0LIF2sAtr/S+Ed+OdUAuhZypUsDXGWtK3mL55v9c8BZXefFfHfJqElcNmyHKwCptbCR/JiM8YBtjoy2/RW1NcJUZp+QuRlk23xPADj7QkPjv7dfrQUMitkLUXAD+uTmMe2l8gmlbhMrQqPBKhb+31FNv6Lmo6oa6GjbiGi7qjsrJc7uQjhppLam+M7BZbBALGbIqMIrb2BMDMMhBoDbb4zSKrSg3+krd3kKiCClJlK7xjIlyFXZ527ETQ+PMtIeQb0eJ3aQwa4caBRCm5BDzt8GnJ48S88EkynbQioCEE87ebcyOM7M+wfslW/Fm1Y86X5odIljkOmTNKoDvgLxc9vUCBtMyVHNIgZcToPdsrMsGxcHV+JtU3yVQVm6dnA5P/zG5bA+aBjsd7p7BdOE4fdhvZV5XRAk/wmiyWalF7hKJxHIiWAKknL+tpPDDUF+fHmDDsdf7yRDJBegNcKfw4+m19MIvLn9fbiNVCtwCAL1T4yWkIEpi4MRMDPtftmkZPbi6UwluOJUTeCeHe4en99Yu2haemNPqXs6rR0LlXGk31GQwzlrNfb+94F5tT2a4Ka4PsruA2NMW/IYCYEE5Gu7PihVDR031Fn9cdCU9kefUgyB07rJD6q/W+ljsU0osyg7VxyfMg8rkw=
130-
file: ${CLBLAS_ROOT}/clBLAS-build/*.tar.gz
131-
file_glob: true
132-
on:
133-
all_branches: true
134-
tags: true
123+
#deploy:
124+
# provider: releases
125+
# prerelease: true
126+
# draft: true
127+
# skip_cleanup: true
128+
# api_key:
129+
# secure: MBkxtcfSk+4UvGRO+WRhmS86vIVzAs0LIF2sAtr/S+Ed+OdUAuhZypUsDXGWtK3mL55v9c8BZXefFfHfJqElcNmyHKwCptbCR/JiM8YBtjoy2/RW1NcJUZp+QuRlk23xPADj7QkPjv7dfrQUMitkLUXAD+uTmMe2l8gmlbhMrQqPBKhb+31FNv6Lmo6oa6GjbiGi7qjsrJc7uQjhppLam+M7BZbBALGbIqMIrb2BMDMMhBoDbb4zSKrSg3+krd3kKiCClJlK7xjIlyFXZ527ETQ+PMtIeQb0eJ3aQwa4caBRCm5BDzt8GnJ48S88EkynbQioCEE87ebcyOM7M+wfslW/Fm1Y86X5odIljkOmTNKoDvgLxc9vUCBtMyVHNIgZcToPdsrMsGxcHV+JtU3yVQVm6dnA5P/zG5bA+aBjsd7p7BdOE4fdhvZV5XRAk/wmiyWalF7hKJxHIiWAKknL+tpPDDUF+fHmDDsdf7yRDJBegNcKfw4+m19MIvLn9fbiNVCtwCAL1T4yWkIEpi4MRMDPtftmkZPbi6UwluOJUTeCeHe4en99Yu2haemNPqXs6rR0LlXGk31GQwzlrNfb+94F5tT2a4Ka4PsruA2NMW/IYCYEE5Gu7PihVDR031Fn9cdCU9kefUgyB07rJD6q/W+ljsU0osyg7VxyfMg8rkw=
130+
# file: ${CLBLAS_ROOT}/clBLAS-build/*.tar.gz
131+
# file_glob: true
132+
# on:
133+
# all_branches: true
134+
# tags: true

README.md

Lines changed: 62 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,12 @@ The simple example below shows how to use clBLAS to compute an OpenCL accelerate
6868
#include <sys/types.h>
6969
#include <stdio.h>
7070

71-
/* Include the clBLAS header. It includes the appropriate OpenCL headers
72-
*/
71+
/* Include the clBLAS header. It includes the appropriate OpenCL headers */
7372
#include <clBLAS.h>
7473

7574
/* This example uses predefined matrices and their characteristics for
7675
* simplicity.
77-
*/
76+
*/
7877

7978
#define M 4
8079
#define N 3
@@ -83,19 +82,19 @@ The simple example below shows how to use clBLAS to compute an OpenCL accelerate
8382
static const cl_float alpha = 10;
8483

8584
static const cl_float A[M*K] = {
86-
11, 12, 13, 14, 15,
87-
21, 22, 23, 24, 25,
88-
31, 32, 33, 34, 35,
89-
41, 42, 43, 44, 45,
85+
11, 12, 13, 14, 15,
86+
21, 22, 23, 24, 25,
87+
31, 32, 33, 34, 35,
88+
41, 42, 43, 44, 45,
9089
};
9190
static const size_t lda = K; /* i.e. lda = K */
9291

9392
static const cl_float B[K*N] = {
94-
11, 12, 13,
95-
21, 22, 23,
96-
31, 32, 33,
97-
41, 42, 43,
98-
51, 52, 53,
93+
11, 12, 13,
94+
21, 22, 23,
95+
31, 32, 33,
96+
41, 42, 43,
97+
51, 52, 53,
9998
};
10099
static const size_t ldb = N; /* i.e. ldb = N */
101100

@@ -113,41 +112,41 @@ The simple example below shows how to use clBLAS to compute an OpenCL accelerate
113112

114113
int main( void )
115114
{
116-
cl_int err;
117-
cl_platform_id platform = 0;
118-
cl_device_id device = 0;
119-
cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
120-
cl_context ctx = 0;
121-
cl_command_queue queue = 0;
122-
cl_mem bufA, bufB, bufC;
123-
cl_event event = NULL;
124-
int ret = 0;
125-
126-
/* Setup OpenCL environment. */
127-
err = clGetPlatformIDs( 1, &platform, NULL );
128-
err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
129-
130-
props[1] = (cl_context_properties)platform;
131-
ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
132-
queue = clCreateCommandQueue( ctx, device, 0, &err );
133-
134-
/* Setup clBLAS */
135-
err = clblasSetup( );
136-
137-
/* Prepare OpenCL memory objects and place matrices inside them. */
138-
bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
139-
NULL, &err );
140-
bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
141-
NULL, &err );
142-
bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
143-
NULL, &err );
144-
145-
err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
146-
M * K * sizeof( *A ), A, 0, NULL, NULL );
147-
err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
148-
K * N * sizeof( *B ), B, 0, NULL, NULL );
149-
err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
150-
M * N * sizeof( *C ), C, 0, NULL, NULL );
115+
cl_int err;
116+
cl_platform_id platform = 0;
117+
cl_device_id device = 0;
118+
cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
119+
cl_context ctx = 0;
120+
cl_command_queue queue = 0;
121+
cl_mem bufA, bufB, bufC;
122+
cl_event event = NULL;
123+
int ret = 0;
124+
125+
/* Setup OpenCL environment. */
126+
err = clGetPlatformIDs( 1, &platform, NULL );
127+
err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
128+
129+
props[1] = (cl_context_properties)platform;
130+
ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
131+
queue = clCreateCommandQueue( ctx, device, 0, &err );
132+
133+
/* Setup clBLAS */
134+
err = clblasSetup( );
135+
136+
/* Prepare OpenCL memory objects and place matrices inside them. */
137+
bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
138+
NULL, &err );
139+
bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
140+
NULL, &err );
141+
bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
142+
NULL, &err );
143+
144+
err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
145+
M * K * sizeof( *A ), A, 0, NULL, NULL );
146+
err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
147+
K * N * sizeof( *B ), B, 0, NULL, NULL );
148+
err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
149+
M * N * sizeof( *C ), C, 0, NULL, NULL );
151150

152151
/* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
153152
err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
@@ -157,27 +156,27 @@ The simple example below shows how to use clBLAS to compute an OpenCL accelerate
157156
bufC, 0, ldc,
158157
1, &queue, 0, NULL, &event );
159158

160-
/* Wait for calculations to be finished. */
161-
err = clWaitForEvents( 1, &event );
159+
/* Wait for calculations to be finished. */
160+
err = clWaitForEvents( 1, &event );
162161

163-
/* Fetch results of calculations from GPU memory. */
164-
err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
165-
M * N * sizeof(*result),
166-
result, 0, NULL, NULL );
162+
/* Fetch results of calculations from GPU memory. */
163+
err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
164+
M * N * sizeof(*result),
165+
result, 0, NULL, NULL );
167166

168-
/* Release OpenCL memory objects. */
169-
clReleaseMemObject( bufC );
170-
clReleaseMemObject( bufB );
171-
clReleaseMemObject( bufA );
167+
/* Release OpenCL memory objects. */
168+
clReleaseMemObject( bufC );
169+
clReleaseMemObject( bufB );
170+
clReleaseMemObject( bufA );
172171

173-
/* Finalize work with clBLAS */
174-
clblasTeardown( );
172+
/* Finalize work with clBLAS */
173+
clblasTeardown( );
175174

176-
/* Release OpenCL working objects. */
177-
clReleaseCommandQueue( queue );
178-
clReleaseContext( ctx );
175+
/* Release OpenCL working objects. */
176+
clReleaseCommandQueue( queue );
177+
clReleaseContext( ctx );
179178

180-
return ret;
179+
return ret;
181180
}
182181
```
183182

appveyor.yml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -91,15 +91,15 @@ on_finish:
9191
- echo on_finish step
9292

9393
# Appveyor will push the artifacts it has saved to GitHub 'releases' tab
94-
deploy:
95-
provider: GitHub
96-
auth_token:
97-
secure: dRXIWJKpU7h2RsHX7RqmyYCtCw+Q9O3X5MArloY6p34GZC1w7bp+jQYTZqbdO7bw
98-
artifact: binary_zip
99-
draft: true
100-
prerelease: true
101-
on:
102-
appveyor_repo_tag: true
94+
# deploy:
95+
# provider: GitHub
96+
# auth_token:
97+
# secure: dRXIWJKpU7h2RsHX7RqmyYCtCw+Q9O3X5MArloY6p34GZC1w7bp+jQYTZqbdO7bw
98+
# artifact: binary_zip
99+
# draft: true
100+
# prerelease: true
101+
# on:
102+
# appveyor_repo_tag: true
103103

104104
# Uncomment the following to pause the VM and wait for RDP connection to debug
105105
# - ps: $blockRdp = $true; iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1'))

src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ if( BUILD_TEST )
199199
else()
200200
find_package( Netlib COMPONENTS BLAS REQUIRED )
201201
endif()
202-
else( )
202+
else( )
203203
# Find ACML BLAS implementation
204204
# platform dependent ACML subdirectory
205205
if (WIN32)

src/library/CMakeLists.txt

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -839,8 +839,7 @@ else()
839839
MESSAGE( STATUS "clBLAS will NOT depend on ${AUTOGEMM_PRECOMPILED_KERNELS}" )
840840
endif()
841841

842-
843-
add_library(clBLAS
842+
set(CLBLAS_ALL_SOURCES
844843
${CLBLAS_SOURCES}
845844
${GLOBAL_HEADERS}
846845
${SRC_BLAS_HEADERS}
@@ -851,9 +850,23 @@ add_library(clBLAS
851850
${AUTOGEMM_PRECOMPILED_KERNELS_CONDITIONAL}
852851
#${USERGEMM_SRC}
853852
#${USERGEMM_HEADERS}
854-
)
853+
)
854+
add_library(clBLAS ${CLBLAS_ALL_SOURCES})
855855
add_dependencies(clBLAS GENERATE_CLT)
856856

857+
function (add_target_definitions target)
858+
get_target_property(defs ${target} COMPILE_DEFINITIONS)
859+
if (defs MATCHES "NOTFOUND")
860+
set(defs "")
861+
endif ()
862+
foreach (def ${defs} ${ARGN})
863+
list(APPEND deflist ${def})
864+
endforeach ()
865+
set_target_properties(${target} PROPERTIES COMPILE_DEFINITIONS "${deflist}")
866+
endfunction ()
867+
868+
add_target_definitions(clBLAS BUILDING_CLBLAS)
869+
857870
if (PRECOMPILE_TRSM_DTRSM OR PRECOMPILE_TRSM_STRSM)
858871
add_dependencies(clBLAS OCLBinaryGenerator_GEN)
859872
endif()
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#pragma once
2+
3+
#ifdef __cplusplus
4+
extern "C" {
5+
#endif
6+
void initAutoGemmClKernels(void);
7+
#ifdef __cplusplus
8+
}
9+
#endif
10+

src/library/blas/AutoGemm/Includes.py

Lines changed: 43 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,14 @@ def __init__(self):
179179
self.incStr += "#include <CL/cl.h>\n"
180180
self.incStr += "#endif\n"
181181
self.incStr += "\n"
182+
self.incStr += "#ifdef __cplusplus\n"
183+
self.incStr += "extern \"C\" {\n"
184+
self.incStr += "#endif\n"
185+
self.incStr += " void initAutoGemmClKernels(void);\n";
186+
self.incStr += "#ifdef __cplusplus\n"
187+
self.incStr += "}\n";
188+
self.incStr += "#endif\n"
189+
self.incStr += "\n";
182190

183191
self.cppName = Common.getIncludePath() + "AutoGemmClKernels.cpp"
184192
self.cppFile = open(self.cppName, "w")
@@ -190,29 +198,50 @@ def __init__(self):
190198
self.cppStr += "#endif\n"
191199
self.cppStr += "\n"
192200

201+
202+
self.initFunction = "";
203+
self.initFunction += "extern \"C\" {\n";
204+
self.initFunction += " void initAutoGemmClKernels(void);\n";
205+
self.initFunction += "}\n";
206+
self.initFunction += "\n";
207+
self.initFunction += "void initAutoGemmClKernels(void) {\n";
208+
self.defines = "";
209+
193210
def addKernel(self, kernel):
194-
kernelName = kernel.getName()
195-
self.incStr += "extern cl_kernel %s_clKernel;\n" % kernelName
196-
self.cppStr += "cl_kernel %s_clKernel = NULL;\n" % kernelName
197-
kernelName = kernel.getRowName()
198-
self.incStr += "extern cl_kernel %s_clKernel;\n" % kernelName
199-
self.cppStr += "cl_kernel %s_clKernel = NULL;\n" % kernelName
200-
kernelName = kernel.getColName()
201-
self.incStr += "extern cl_kernel %s_clKernel;\n" % kernelName
202-
self.cppStr += "cl_kernel %s_clKernel = NULL;\n" % kernelName
203-
kernelName = kernel.getCornerName()
204-
self.incStr += "extern cl_kernel %s_clKernel;\n" % kernelName
205-
self.cppStr += "cl_kernel %s_clKernel = NULL;\n" % kernelName
211+
kernelNames = [
212+
kernel.getName(),
213+
kernel.getRowName(),
214+
kernel.getColName(),
215+
kernel.getCornerName()
216+
]
217+
for kernelName in kernelNames:
218+
self.incStr += "extern cl_kernel %s_clKernel;\n" % kernelName
219+
220+
self.defines += "cl_kernel %s_clKernel = NULL;\n" % kernelName
221+
222+
self.initFunction += " if(%s_clKernel != NULL) {\n" % kernelName
223+
self.initFunction += " clReleaseKernel(%s_clKernel);\n" % kernelName
224+
self.initFunction += " %s_clKernel = NULL;\n" % kernelName
225+
self.initFunction += " }\n"
206226

207227
self.incFile.write( self.incStr )
208228
self.incStr = ""
209-
self.cppFile.write( self.cppStr )
210-
self.cppStr = ""
229+
# self.cppFile.write( self.cppStr )
230+
# self.cppStr = ""
211231

212232
def writeToFile(self):
213233
self.incFile.write( self.incStr )
214234
self.incFile.write( "\n#endif\n" )
215235
self.incFile.close()
236+
237+
self.initFunction += "}\n";
238+
self.cppStr += self.defines + "\n";
239+
self.defines = "";
240+
self.cppStr += self.initFunction + "\n";
241+
self.initFunction = "";
242+
243+
# self.cppStr += "\n";
244+
# self.cppStr += "initAutoGemmClKernels();\n";
216245
self.cppFile.write( self.cppStr )
217246
self.cppFile.close()
218247

src/library/blas/AutoGemm/KernelOpenCL.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def makeOpenCLKernelString(kernel):
218218
kStr += endLine
219219
kStr += (
220220
" /* allocate registers */" + endLine +
221-
" DATA_TYPE_STR rC[MICRO_TILE_NUM_ROWS][MICRO_TILE_NUM_COLS] = {0};" + endLine +
221+
" DATA_TYPE_STR rC[MICRO_TILE_NUM_ROWS][MICRO_TILE_NUM_COLS] = { {0} };" + endLine +
222222
" DATA_TYPE_STR rA[MICRO_TILE_NUM_ROWS];" + endLine +
223223
" DATA_TYPE_STR rB[MICRO_TILE_NUM_COLS];" + endLine )
224224

src/library/blas/gens/clTemplates/zgemm_gcn.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ __kernel void KERNEL_NAME(DATA_TYPE_CHAR,TRANSPOSE_A,TRANSPOSE_B,MACRO_TILE_NUM_
210210
C += offsetC;
211211

212212
// registers
213-
DATA_TYPE_STR rC[MICRO_TILE_NUM_ROWS][MICRO_TILE_NUM_COLS] = {0};
213+
DATA_TYPE_STR rC[MICRO_TILE_NUM_ROWS][MICRO_TILE_NUM_COLS] = { {0} };
214214
DATA_TYPE_STR rA[MICRO_TILE_NUM_ROWS];
215215
DATA_TYPE_STR rB[MICRO_TILE_NUM_COLS];
216216

0 commit comments

Comments
 (0)