From 9b322071a282a79b73b5c656b45b6c4d7cae8343 Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Thu, 9 Apr 2020 11:35:17 -0700 Subject: [PATCH 01/13] Fix cuSPARSE import bug --- skcuda/cusparse.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index db2c7888..b9cf4b8d 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -6,6 +6,8 @@ Note: this module does not explicitly depend on PyCUDA. """ +from __future__ import absolute_import + import atexit import ctypes.util import platform @@ -15,7 +17,7 @@ import numpy as np -import cuda +from . import cuda # Load library: _version_list = [10.1, 10.0, 9.2, 9.1, 9.0, 8.0, 7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0] From 9863bc65a9e4a75ceb78cb0120b3ba2d47211ace Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Thu, 9 Apr 2020 16:58:44 -0700 Subject: [PATCH 02/13] Convert cuparse handle to c_void_p --- skcuda/cusparse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index b9cf4b8d..45975c73 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -174,13 +174,13 @@ def cusparseCreate(): """ - handle = ctypes.c_int() + handle = ctypes.c_void_p() status = _libcusparse.cusparseCreate(ctypes.byref(handle)) cusparseCheckStatus(status) return handle.value _libcusparse.cusparseDestroy.restype = int -_libcusparse.cusparseDestroy.argtypes = [ctypes.c_int] +_libcusparse.cusparseDestroy.argtypes = [ctypes.c_void_p] def cusparseDestroy(handle): """ Release CUSPARSE resources. 
From 104b0942ad71d919780f2506d10d35e0f0217c7c Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Thu, 9 Apr 2020 17:26:18 -0700 Subject: [PATCH 03/13] Implement cusparseSgtsv2StridedBatch_bufferSizeExt --- skcuda/cusparse.py | 68 +++++++++++++++++++++++++++++++++++++++++- tests/test_cusparse.py | 62 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 tests/test_cusparse.py diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index 45975c73..30ccdd19 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -8,7 +8,6 @@ from __future__ import absolute_import -import atexit import ctypes.util import platform from string import Template @@ -88,6 +87,7 @@ class cusparseStatusMatrixTypeNotSupported(cusparseError): """The matrix type is not supported by this function""" pass +# TODO: Check if this is complete list of exceptions, and that numbers are correct. cusparseExceptions = { 1: cusparseStatusNotInitialized, 2: cusparseStatusAllocFailed, @@ -388,3 +388,69 @@ def cusparseSdense2csr(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, csrColIndA): # Unfinished pass + +_libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.restype = int +_libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.argtypes =\ + [ctypes.c_void_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_void_p + ] +def cusparseSgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x, batchCount, batchStride): + """ + Calculate size of work buffer used by cusparseSgtsv2StridedBatch. + + Parameters + ---------- + handle : ctypes.c_void_p + cuSPARSE context + m : int + Size of the linear system (must be >= 3) + dl : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the lower + diagonal of the tri-diagonal linear system. 
The lower diagonal dl(i) + that corresponds to the ith linear system starts at location + dl+batchStride*i in memory. Also, the first element of each lower + diagonal must be zero. + d : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the main + diagonal of the tri-diagonal linear system. The main diagonal d(i) + that corresponds to the ith linear system starts at location + d+batchStride*i in memory. + du : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the upper + diagonal of the tri-diagonal linear system. The upper diagonal du(i) + that corresponds to the ith linear system starts at location + du+batchStride*i in memory. Also, the last element of each upper + diagonal must be zero. + x : ctypes.c_void_p + Pointer to ${precision} ${real} dense array that contains the + right-hand-side of the tri-diagonal linear system. The + right-hand-side x(i) that corresponds to the ith linear system + starts at location x+batchStride*i in memory. + batchCount : int + Number of systems to solve. + batchStride : int + Stride (number of elements) that separates the vectors of every + system (must be at least m). + + Returns + ------- + bufferSizeInBytes : int + number of bytes of the buffer used in the gtsv2StridedBatch. 
+ + References + ---------- + `cusparsegtsv2StridedBatch_bufferSizeExt `_ + """ + bufferSizeInBytes = ctypes.c_int() + status = _libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt( + handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, + ctypes.byref(bufferSizeInBytes)) + cusparseCheckStatus(status) + return bufferSizeInBytes.value \ No newline at end of file diff --git a/tests/test_cusparse.py b/tests/test_cusparse.py new file mode 100644 index 00000000..fa52ce1b --- /dev/null +++ b/tests/test_cusparse.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +""" +Unit tests for skcuda.cusparse +""" + +from unittest import main, makeSuite, TestCase, TestSuite + +import pycuda.gpuarray as gpuarray +from pycuda.tools import clear_context_caches, make_default_context +import numpy as np + +import skcuda.cusparse as cusparse + +class test_cusparse(TestCase): + @classmethod + def setUpClass(cls): + cls.ctx = make_default_context() + cls.cusparse_handle = cusparse.cusparseCreate() + + @classmethod + def tearDownClass(cls): + cusparse.cusparseDestroy(cls.cusparse_handle) + cls.ctx.pop() + clear_context_caches() + + def setUp(self): + np.random.seed(23) # For reproducible tests. 
+ + # Sgtsv2StridedBatch_bufferSizeExt + def test_cusparseSgtsv2StridedBatch_bufferSizeExt(self): + m = 5 + batchCount = 5 + batchStride = m + + dl = np.zeros(m*batchCount).astype(np.float32) + d = np.zeros(m*batchCount).astype(np.float32) + du = np.zeros(m*batchCount).astype(np.float32) + x = np.zeros(m*batchCount).astype(np.float32) + + for ii in range(batchCount): + dl[ii*batchStride+1:ii*batchStride+batchStride] = np.random.rand(m-1) + d[ii*batchStride:ii*batchStride+batchStride] = np.random.rand(m) + du[ii*batchStride:ii*batchStride+batchStride-1] = np.random.rand(m-1) + x[ii*batchStride:ii*batchStride+batchStride] = np.random.rand(m) + + dl_gpu = gpuarray.to_gpu(dl) + d_gpu = gpuarray.to_gpu(d) + du_gpu = gpuarray.to_gpu(du) + x_gpu = gpuarray.to_gpu(x) + + bufferSizeInBytes = cusparse.cusparseSgtsv2StridedBatch_bufferSizeExt( + self.cusparse_handle, m, dl_gpu.gpudata, d_gpu.gpudata, + du_gpu.gpudata, x_gpu.gpudata, batchCount, batchStride) + +def suite(): + s = TestSuite() + s.addTest(test_cusparse('test_cusparseSgtsv2StridedBatch_bufferSizeExt')) + return s + +if __name__ == '__main__': + main(defaultTest = 'suite') From 9e0af08a2674473d83868bde6c175d72998a1ce1 Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Fri, 10 Apr 2020 14:17:32 -0700 Subject: [PATCH 04/13] Support cusparseSgtsv2StridedBatch + test --- skcuda/cusparse.py | 80 +++++++++++++++++++++++++++++++++++++++++- tests/test_cusparse.py | 48 +++++++++++++++++-------- 2 files changed, 113 insertions(+), 15 deletions(-) diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index 30ccdd19..7437e2d8 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -453,4 +453,82 @@ def cusparseSgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x, batchCount handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, ctypes.byref(bufferSizeInBytes)) cusparseCheckStatus(status) - return bufferSizeInBytes.value \ No newline at end of file + return bufferSizeInBytes.value + 
+_libcusparse.cusparseSgtsv2StridedBatch.restype = int +_libcusparse.cusparseSgtsv2StridedBatch.argtypes =\ + [ctypes.c_void_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_void_p + ] +def cusparseSgtsv2StridedBatch(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer): + """ + Compute the solution of multiple tridiagonal linear systems. + + Solves multiple tridiagonal linear systems, for i=0,…,batchCount: + A(i) ∗ y(i) = x(i) + The coefficient matrix A of each of these tri-diagonal linear system is + defined with three vectors corresponding to its lower (dl), main (d), and + upper (du) matrix diagonals; the right-hand sides are stored in the dense + matrix X. Notice that solution Y overwrites right-hand-side matrix X on exit. + The different matrices are assumed to be of the same size and are stored with + a fixed batchStride in memory. + + The routine does not perform any pivoting and uses a combination of the + Cyclic Reduction (CR) and the Parallel Cyclic Reduction (PCR) algorithms to + find the solution. It achieves better performance when m is a power of 2. + + Parameters + ---------- + handle : ctypes.c_void_p + cuSPARSE context + m : int + Size of the linear system (must be >= 3) + dl : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the lower + diagonal of the tri-diagonal linear system. The lower diagonal dl(i) + that corresponds to the ith linear system starts at location + dl+batchStride*i in memory. Also, the first element of each lower + diagonal must be zero. + d : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the main + diagonal of the tri-diagonal linear system. The main diagonal d(i) + that corresponds to the ith linear system starts at location + d+batchStride*i in memory. 
+ du : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the upper + diagonal of the tri-diagonal linear system. The upper diagonal du(i) + that corresponds to the ith linear system starts at location + du+batchStride*i in memory. Also, the last element of each upper + diagonal must be zero. + x : ctypes.c_void_p + Pointer to ${precision} ${real} dense array that contains the + right-hand-side of the tri-diagonal linear system. The + right-hand-side x(i) that corresponds to the ith linear system + starts at location x+batchStride*i in memory. + batchCount : int + Number of systems to solve. + batchStride : int + Stride (number of elements) that separates the vectors of every + system (must be at least m). + pBuffer: ctypes.c_void_p + Buffer allocated by the user, the size is return by gtsv2StridedBatch_bufferSizeExt + + Returns + ------- + bufferSizeInBytes : int + number of bytes of the buffer used in the gtsv2StridedBatch. + + References + ---------- + `cusparsegtsv2StridedBatch `_ + """ + status = _libcusparse.cusparseSgtsv2StridedBatch( + handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, int(pBuffer)) + cusparseCheckStatus(status) \ No newline at end of file diff --git a/tests/test_cusparse.py b/tests/test_cusparse.py index fa52ce1b..813e4969 100644 --- a/tests/test_cusparse.py +++ b/tests/test_cusparse.py @@ -4,14 +4,27 @@ Unit tests for skcuda.cusparse """ -from unittest import main, makeSuite, TestCase, TestSuite +from unittest import main, TestCase, TestSuite +import pycuda import pycuda.gpuarray as gpuarray from pycuda.tools import clear_context_caches, make_default_context import numpy as np import skcuda.cusparse as cusparse +def check_batch_tridiagonal(dl,d,du,x, y, m,batchCount,batchStride): + """ + Check all solutions from batched tridiagonal routine + """ + for ii in range(batchCount): + A_sys = np.diagflat(dl[ii*batchStride+1:ii*batchStride+m], -1) +\ + np.diagflat(d[ii*batchStride:ii*batchStride+m], 0) + \ 
+ np.diagflat(du[ii*batchStride:ii*batchStride+m-1], 1) + x_sys = x[ii*batchStride:ii*batchStride+m] + y_sys = y[ii*batchStride:ii*batchStride+m] + assert(np.allclose(np.dot(A_sys,y_sys), x_sys, atol=1e-3)) + class test_cusparse(TestCase): @classmethod def setUpClass(cls): @@ -28,21 +41,21 @@ def setUp(self): np.random.seed(23) # For reproducible tests. # Sgtsv2StridedBatch_bufferSizeExt - def test_cusparseSgtsv2StridedBatch_bufferSizeExt(self): - m = 5 - batchCount = 5 - batchStride = m + def test_cusparseSgtsv2StridedBatch(self): + m = 6 + batchCount = 9 + batchStride = 9 - dl = np.zeros(m*batchCount).astype(np.float32) - d = np.zeros(m*batchCount).astype(np.float32) - du = np.zeros(m*batchCount).astype(np.float32) - x = np.zeros(m*batchCount).astype(np.float32) + dl = np.zeros(batchStride*batchCount).astype(np.float32) + d = np.zeros(batchStride*batchCount).astype(np.float32) + du = np.zeros(batchStride*batchCount).astype(np.float32) + x = np.zeros(batchStride*batchCount).astype(np.float32) for ii in range(batchCount): - dl[ii*batchStride+1:ii*batchStride+batchStride] = np.random.rand(m-1) - d[ii*batchStride:ii*batchStride+batchStride] = np.random.rand(m) - du[ii*batchStride:ii*batchStride+batchStride-1] = np.random.rand(m-1) - x[ii*batchStride:ii*batchStride+batchStride] = np.random.rand(m) + dl[ii*batchStride+1:ii*batchStride+m] = np.random.rand(m-1) + d[ii*batchStride:ii*batchStride+m] = np.random.rand(m) + du[ii*batchStride:ii*batchStride+m-1] = np.random.rand(m-1) + x[ii*batchStride:ii*batchStride+m] = np.random.rand(m) dl_gpu = gpuarray.to_gpu(dl) d_gpu = gpuarray.to_gpu(d) @@ -52,10 +65,17 @@ def test_cusparseSgtsv2StridedBatch_bufferSizeExt(self): bufferSizeInBytes = cusparse.cusparseSgtsv2StridedBatch_bufferSizeExt( self.cusparse_handle, m, dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata, batchCount, batchStride) + pBuffer = pycuda.driver.mem_alloc(bufferSizeInBytes) + + cusparse.cusparseSgtsv2StridedBatch(self.cusparse_handle, m, + 
dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata, + batchCount, batchStride, pBuffer) + + check_batch_tridiagonal(dl,d,du,x, x_gpu.get(), m,batchCount,batchStride) def suite(): s = TestSuite() - s.addTest(test_cusparse('test_cusparseSgtsv2StridedBatch_bufferSizeExt')) + s.addTest(test_cusparse('test_cusparseSgtsv2StridedBatch')) return s if __name__ == '__main__': From c483e1a8e2a063b53c05bdb9389a499c4cbb32cf Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Fri, 10 Apr 2020 14:19:08 -0700 Subject: [PATCH 05/13] Update cusparse linux_version_list --- skcuda/cusparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index 7437e2d8..1a58a5db 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -19,7 +19,7 @@ from . import cuda # Load library: -_version_list = [10.1, 10.0, 9.2, 9.1, 9.0, 8.0, 7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0] +_linux_version_list = [10.2, 10.1, 10.0, 9.2, 9.1, 9.0, 8.0, 7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0] _win32_version_list = [10, 10, 100, 92, 91, 90, 80, 75, 70, 65, 60, 55, 50, 40] if 'linux' in sys.platform: _libcusparse_libname_list = ['libcusparse.so'] + \ From ca3a1eff0a2d267645041895c786d4a1d82dbd6b Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Sat, 11 Apr 2020 13:17:56 -0700 Subject: [PATCH 06/13] Support cusparseSgtsvInterleavedBatch + test --- skcuda/cusparse.py | 145 +++++++++++++++++++++++++++++++++++++++-- tests/test_cusparse.py | 82 ++++++++++++++++++----- 2 files changed, 206 insertions(+), 21 deletions(-) diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index 1a58a5db..5f28c4c5 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -520,15 +520,148 @@ def cusparseSgtsv2StridedBatch(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer: ctypes.c_void_p Buffer allocated by the user, the size is return by gtsv2StridedBatch_bufferSizeExt - Returns - ------- - bufferSizeInBytes : int - number of bytes of the buffer used in the 
gtsv2StridedBatch. - References ---------- `cusparsegtsv2StridedBatch `_ """ status = _libcusparse.cusparseSgtsv2StridedBatch( handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, int(pBuffer)) - cusparseCheckStatus(status) \ No newline at end of file + cusparseCheckStatus(status) + +_libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.restype = int +_libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.argtypes =\ + [ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_void_p + ] +def cusparseSgtsvInterleavedBatch_bufferSizeExt(handle, algo, m, dl, d, du, x, batchCount): + """ + Calculate size of work buffer used by cusparseSgtsvInterleavedBatch. + + Parameters + ---------- + handle : ctypes.c_void_p + cuSPARSE context + algo : int + algo = 0: cuThomas (unstable algorithm); algo = 1: LU with pivoting + (stable algorithm); algo = 2: QR (stable algorithm) + m : int + Size of the linear system + dl : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the lower + diagonal of the tri-diagonal linear system. The first element of each + lower diagonal must be zero. + d : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the main + diagonal of the tri-diagonal linear system. + du : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the upper + diagonal of the tri-diagonal linear system. The last element of each + upper diagonal must be zero. + x : ctypes.c_void_p + Pointer to ${precision} ${real} dense array that contains the + right-hand-side of the tri-diagonal linear system. + batchCount : int + Number of systems to solve. 
+ pBuffer: ctypes.c_void_p + Buffer allocated by the user, the size is return by gtsvInterleavedBatch_bufferSizeExt + + References + ---------- + `cusparsegtsvInterleavedBatch `_ + """ + pBufferSizeInBytes = ctypes.c_int() + status = _libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt( + handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, + ctypes.byref(pBufferSizeInBytes)) + cusparseCheckStatus(status) + return pBufferSizeInBytes.value + +_libcusparse.cusparseSgtsvInterleavedBatch.restype = int +_libcusparse.cusparseSgtsvInterleavedBatch.argtypes =\ + [ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_void_p + ] +def cusparseSgtsvInterleavedBatch(handle, algo, m, dl, d, du, x, batchCount, pBuffer): + """ + Compute the solution of multiple tridiagonal linear systems. + + Solves multiple tridiagonal linear systems, for i=0,…,batchCount: + A(i) ∗ y(i) = x(i) + The coefficient matrix A of each of these tri-diagonal linear system is + defined with three vectors corresponding to its lower (dl), main (d), and + upper (du) matrix diagonals; the right-hand sides are stored in the dense + matrix X. Notice that solution Y overwrites right-hand-side matrix X on exit. + The different matrices are assumed to be of the same size and are stored with + a fixed batchStride in memory. + + Assuming A is of size m and base-1, dl, d and du are defined by the following formula: + dl(i) := A(i, i-1) for i=1,2,...,m + The first element of dl is out-of-bound (dl(1) := A(1,0)), so dl(1) = 0. + d(i) = A(i,i) for i=1,2,...,m + du(i) = A(i,i+1) for i=1,2,...,m + The last element of du is out-of-bound (du(m) := A(m,m+1)), so du(m) = 0. + + The data layout is different from gtsvStridedBatch which aggregates all + matrices one after another. Instead, gtsvInterleavedBatch gathers + different matrices of the same element in a continous manner. 
If dl is + regarded as a 2-D array of size m-by-batchCount, dl(:,j) to store j-th + matrix. gtsvStridedBatch uses column-major while gtsvInterleavedBatch + uses row-major. + + The routine provides three different algorithms, selected by parameter algo. + The first algorithm is cuThomas provided by Barcelona Supercomputing Center. + The second algorithm is LU with partial pivoting and last algorithm is QR. + From stability perspective, cuThomas is not numerically stable because it + does not have pivoting. LU with partial pivoting and QR are stable. From + performance perspective, LU with partial pivoting and QR is about 10% to 20% + slower than cuThomas. + + Parameters + ---------- + handle : ctypes.c_void_p + cuSPARSE context + algo : int + algo = 0: cuThomas (unstable algorithm); algo = 1: LU with pivoting + (stable algorithm); algo = 2: QR (stable algorithm) + m : int + Size of the linear system + dl : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the lower + diagonal of the tri-diagonal linear system. The first element of each + lower diagonal must be zero. + d : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the main + diagonal of the tri-diagonal linear system. + du : ctypes.c_void_p + Pointer to ${precision} ${real} dense array containing the upper + diagonal of the tri-diagonal linear system. The last element of each + upper diagonal must be zero. + x : ctypes.c_void_p + Pointer to ${precision} ${real} dense array that contains the + right-hand-side of the tri-diagonal linear system. + batchCount : int + Number of systems to solve. 
+ pBuffer: ctypes.c_void_p + Buffer allocated by the user, the size is return by gtsvInterleavedBatch_bufferSizeExt + + References + ---------- + `cusparsegtsvInterleavedBatch `_ + """ + status = _libcusparse.cusparseSgtsvInterleavedBatch( + handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, int(pBuffer)) + cusparseCheckStatus(status) diff --git a/tests/test_cusparse.py b/tests/test_cusparse.py index 813e4969..3856124a 100644 --- a/tests/test_cusparse.py +++ b/tests/test_cusparse.py @@ -13,17 +13,42 @@ import skcuda.cusparse as cusparse -def check_batch_tridiagonal(dl,d,du,x, y, m,batchCount,batchStride): +def check_batch_tridiagonal(dl,d,du,x, y, m,batchCount,batchStride=None,atol=1e-8): """ Check all solutions from batched tridiagonal routine """ + if batchStride is None: + batchStride = m + for ii in range(batchCount): A_sys = np.diagflat(dl[ii*batchStride+1:ii*batchStride+m], -1) +\ np.diagflat(d[ii*batchStride:ii*batchStride+m], 0) + \ np.diagflat(du[ii*batchStride:ii*batchStride+m-1], 1) x_sys = x[ii*batchStride:ii*batchStride+m] y_sys = y[ii*batchStride:ii*batchStride+m] - assert(np.allclose(np.dot(A_sys,y_sys), x_sys, atol=1e-3)) + assert(np.allclose(np.dot(A_sys,y_sys), x_sys, atol=atol)) + +def tridiagonal_system(m, batchCount, batchStride=None, seed=None): + """ + Create a tridiagonal system of a given size + """ + if batchStride is None: + batchStride = m + if seed is not None: + np.random.seed(seed) + + dl = np.zeros(batchStride*batchCount).astype(np.float32) + d = np.zeros(batchStride*batchCount).astype(np.float32) + du = np.zeros(batchStride*batchCount).astype(np.float32) + x = np.zeros(batchStride*batchCount).astype(np.float32) + + for ii in range(batchCount): + dl[ii*batchStride+1:ii*batchStride+m] = np.random.rand(m-1) + d[ii*batchStride:ii*batchStride+m] = np.random.rand(m) + du[ii*batchStride:ii*batchStride+m-1] = np.random.rand(m-1) + x[ii*batchStride:ii*batchStride+m] = np.random.rand(m) + + return dl,d,du,x class 
test_cusparse(TestCase): @classmethod @@ -40,22 +65,12 @@ def tearDownClass(cls): def setUp(self): np.random.seed(23) # For reproducible tests. - # Sgtsv2StridedBatch_bufferSizeExt def test_cusparseSgtsv2StridedBatch(self): m = 6 batchCount = 9 - batchStride = 9 - - dl = np.zeros(batchStride*batchCount).astype(np.float32) - d = np.zeros(batchStride*batchCount).astype(np.float32) - du = np.zeros(batchStride*batchCount).astype(np.float32) - x = np.zeros(batchStride*batchCount).astype(np.float32) + batchStride = 11 - for ii in range(batchCount): - dl[ii*batchStride+1:ii*batchStride+m] = np.random.rand(m-1) - d[ii*batchStride:ii*batchStride+m] = np.random.rand(m) - du[ii*batchStride:ii*batchStride+m-1] = np.random.rand(m-1) - x[ii*batchStride:ii*batchStride+m] = np.random.rand(m) + dl,d,du,x = tridiagonal_system(m, batchCount, batchStride, seed=23) dl_gpu = gpuarray.to_gpu(dl) d_gpu = gpuarray.to_gpu(d) @@ -71,11 +86,48 @@ def test_cusparseSgtsv2StridedBatch(self): dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata, batchCount, batchStride, pBuffer) - check_batch_tridiagonal(dl,d,du,x, x_gpu.get(), m,batchCount,batchStride) + sln = x_gpu.get() + # For unstable algorithms, need to loosen atol + check_batch_tridiagonal(dl,d,du,x, sln, m,batchCount,batchStride, 1e-4) + + def test_cusparseSgtsvInterleavedBatch(self): + m = 6 + batchCount = 9 + + dl,d,du,x = tridiagonal_system(m, batchCount, seed=23) + + # Convert to interleaved format, by switching from row-major order + # (numpy default) to column-major + dl_int = np.reshape(dl,(batchCount,m)).ravel('F') + d_int = np.reshape(d, (batchCount, m)).ravel('F') + du_int = np.reshape(du,(batchCount,m)).ravel('F') + x_int = np.reshape(x,(batchCount,m)).ravel('F') + + for algo in range(3): + dl_int_gpu = gpuarray.to_gpu(dl_int) + d_int_gpu = gpuarray.to_gpu(d_int) + du_int_gpu = gpuarray.to_gpu(du_int) + x_int_gpu = gpuarray.to_gpu(x_int) + + pBufferSizeInBytes = 
cusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt(self.cusparse_handle, + algo, m, dl_int_gpu.gpudata, d_int_gpu.gpudata, + du_int_gpu.gpudata, x_int_gpu.gpudata, batchCount) + + pBuffer = pycuda.driver.mem_alloc(pBufferSizeInBytes) + + cusparse.cusparseSgtsvInterleavedBatch(self.cusparse_handle, algo, m, + dl_int_gpu.gpudata, d_int_gpu.gpudata, du_int_gpu.gpudata, + x_int_gpu.gpudata, batchCount, pBuffer) + + sln_int = x_int_gpu.get() + # Convert back from interleaved format + sln = np.reshape(sln_int,(m,batchCount)).ravel('F') + check_batch_tridiagonal(dl,d,du,x, sln, m,batchCount, atol=1e-6) def suite(): s = TestSuite() s.addTest(test_cusparse('test_cusparseSgtsv2StridedBatch')) + s.addTest(test_cusparse('test_cusparseSgtsvInterleavedBatch')) return s if __name__ == '__main__': From 5ea00775ab278d5bd4448567d09ae7c278d8fdc0 Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Sat, 11 Apr 2020 14:24:08 -0700 Subject: [PATCH 07/13] support double precision batched tridiagonal --- skcuda/cusparse.py | 150 ++++++++++++++++++++++++++--------------- tests/test_cusparse.py | 74 ++++++++++++++++++-- 2 files changed, 165 insertions(+), 59 deletions(-) diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index 5f28c4c5..ecc408e0 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -389,21 +389,9 @@ def cusparseSdense2csr(handle, m, n, descrA, A, lda, # Unfinished pass -_libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.restype = int -_libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.argtypes =\ - [ctypes.c_void_p, - ctypes.c_int, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_int, - ctypes.c_void_p - ] -def cusparseSgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x, batchCount, batchStride): +gtsv2StridedBatch_bufferSizeExt_doc = Template( """ - Calculate size of work buffer used by cusparseSgtsv2StridedBatch. + Calculate size of work buffer used by cusparsegtsv2StridedBatch. 
Parameters ---------- @@ -448,26 +436,38 @@ def cusparseSgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x, batchCount ---------- `cusparsegtsv2StridedBatch_bufferSizeExt `_ """ +) + +_libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.restype = int +_libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.argtypes =\ + [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, + ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int, + ctypes.c_void_p + ] +def cusparseSgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x, batchCount, batchStride): bufferSizeInBytes = ctypes.c_int() status = _libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt( handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, ctypes.byref(bufferSizeInBytes)) cusparseCheckStatus(status) return bufferSizeInBytes.value +cusparseSgtsv2StridedBatch_bufferSizeExt.__doc__ = \ + gtsv2StridedBatch_bufferSizeExt_doc.substitute(precision='single precision', real='real') -_libcusparse.cusparseSgtsv2StridedBatch.restype = int -_libcusparse.cusparseSgtsv2StridedBatch.argtypes =\ - [ctypes.c_void_p, - ctypes.c_int, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_int, - ctypes.c_void_p - ] -def cusparseSgtsv2StridedBatch(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer): +_libcusparse.cusparseDgtsv2StridedBatch_bufferSizeExt.restype = int +_libcusparse.cusparseDgtsv2StridedBatch_bufferSizeExt.argtypes =\ + _libcusparse.cusparseSgtsv2StridedBatch_bufferSizeExt.argtypes +def cusparseDgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x, batchCount, batchStride): + bufferSizeInBytes = ctypes.c_int() + status = _libcusparse.cusparseDgtsv2StridedBatch_bufferSizeExt( + handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, + ctypes.byref(bufferSizeInBytes)) + cusparseCheckStatus(status) + return bufferSizeInBytes.value +cusparseDgtsv2StridedBatch_bufferSizeExt.__doc__ = \ + 
gtsv2StridedBatch_bufferSizeExt_doc.substitute(precision='double precision', real='real') + +gtsv2StridedBatch_doc = Template( """ Compute the solution of multiple tridiagonal linear systems. @@ -524,25 +524,34 @@ def cusparseSgtsv2StridedBatch(handle, m, dl, d, du, x, batchCount, batchStride, ---------- `cusparsegtsv2StridedBatch `_ """ +) + +_libcusparse.cusparseSgtsv2StridedBatch.restype = int +_libcusparse.cusparseSgtsv2StridedBatch.argtypes =\ + [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, + ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int, + ctypes.c_void_p + ] +def cusparseSgtsv2StridedBatch(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer): status = _libcusparse.cusparseSgtsv2StridedBatch( handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, int(pBuffer)) cusparseCheckStatus(status) +cusparseSgtsv2StridedBatch.__doc__ = \ + gtsv2StridedBatch_doc.substitute(precision='single precision', real='real') + +_libcusparse.cusparseDgtsv2StridedBatch.restype = int +_libcusparse.cusparseDgtsv2StridedBatch.argtypes =\ + _libcusparse.cusparseSgtsv2StridedBatch.argtypes +def cusparseDgtsv2StridedBatch(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer): + status = _libcusparse.cusparseDgtsv2StridedBatch( + handle, m, int(dl), int(d), int(du), int(x), batchCount, batchStride, int(pBuffer)) + cusparseCheckStatus(status) +cusparseDgtsv2StridedBatch.__doc__ = \ + gtsv2StridedBatch_doc.substitute(precision='double precision', real='real') -_libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.restype = int -_libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.argtypes =\ - [ctypes.c_void_p, - ctypes.c_int, - ctypes.c_int, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_void_p - ] -def cusparseSgtsvInterleavedBatch_bufferSizeExt(handle, algo, m, dl, d, du, x, batchCount): +gtsv2InterleavedBatch_bufferSizeExt_doc = Template( """ - Calculate size of 
work buffer used by cusparseSgtsvInterleavedBatch. + Calculate size of work buffer used by cusparsegtsvInterleavedBatch. Parameters ---------- @@ -576,26 +585,38 @@ def cusparseSgtsvInterleavedBatch_bufferSizeExt(handle, algo, m, dl, d, du, x, b ---------- `cusparsegtsvInterleavedBatch `_ """ +) + +_libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.restype = int +_libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.argtypes =\ + [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, + ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, + ctypes.c_void_p + ] +def cusparseSgtsvInterleavedBatch_bufferSizeExt(handle, algo, m, dl, d, du, x, batchCount): pBufferSizeInBytes = ctypes.c_int() status = _libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt( handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, ctypes.byref(pBufferSizeInBytes)) cusparseCheckStatus(status) return pBufferSizeInBytes.value +cusparseSgtsvInterleavedBatch_bufferSizeExt.__doc__ = \ + gtsv2InterleavedBatch_bufferSizeExt_doc.substitute(precision='single precision', real='real') -_libcusparse.cusparseSgtsvInterleavedBatch.restype = int -_libcusparse.cusparseSgtsvInterleavedBatch.argtypes =\ - [ctypes.c_void_p, - ctypes.c_int, - ctypes.c_int, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_void_p - ] -def cusparseSgtsvInterleavedBatch(handle, algo, m, dl, d, du, x, batchCount, pBuffer): +_libcusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt.restype = int +_libcusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt.argtypes =\ + _libcusparse.cusparseSgtsvInterleavedBatch_bufferSizeExt.argtypes +def cusparseDgtsvInterleavedBatch_bufferSizeExt(handle, algo, m, dl, d, du, x, batchCount): + pBufferSizeInBytes = ctypes.c_int() + status = _libcusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt( + handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, + ctypes.byref(pBufferSizeInBytes)) + 
cusparseCheckStatus(status) + return pBufferSizeInBytes.value +cusparseDgtsvInterleavedBatch_bufferSizeExt.__doc__ = \ + gtsv2InterleavedBatch_bufferSizeExt_doc.substitute(precision='double precision', real='real') + +gtsvInterleavedBatch_doc = Template( """ Compute the solution of multiple tridiagonal linear systems. @@ -662,6 +683,27 @@ def cusparseSgtsvInterleavedBatch(handle, algo, m, dl, d, du, x, batchCount, pBu ---------- `cusparsegtsvInterleavedBatch `_ """ +) + +_libcusparse.cusparseSgtsvInterleavedBatch.restype = int +_libcusparse.cusparseSgtsvInterleavedBatch.argtypes =\ + [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, + ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, + ctypes.c_void_p + ] +def cusparseSgtsvInterleavedBatch(handle, algo, m, dl, d, du, x, batchCount, pBuffer): status = _libcusparse.cusparseSgtsvInterleavedBatch( handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, int(pBuffer)) cusparseCheckStatus(status) +cusparseSgtsvInterleavedBatch.__doc__ = \ + gtsvInterleavedBatch_doc.substitute(precision='single precision', real='real') + +_libcusparse.cusparseDgtsvInterleavedBatch.restype = int +_libcusparse.cusparseDgtsvInterleavedBatch.argtypes =\ + _libcusparse.cusparseSgtsvInterleavedBatch.argtypes +def cusparseDgtsvInterleavedBatch(handle, algo, m, dl, d, du, x, batchCount, pBuffer): + status = _libcusparse.cusparseDgtsvInterleavedBatch( + handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, int(pBuffer)) + cusparseCheckStatus(status) +cusparseDgtsvInterleavedBatch.__doc__ = \ + gtsvInterleavedBatch_doc.substitute(precision='double precision', real='real') \ No newline at end of file diff --git a/tests/test_cusparse.py b/tests/test_cusparse.py index 3856124a..caac2c86 100644 --- a/tests/test_cusparse.py +++ b/tests/test_cusparse.py @@ -28,7 +28,8 @@ def check_batch_tridiagonal(dl,d,du,x, y, m,batchCount,batchStride=None,atol=1e- y_sys = y[ii*batchStride:ii*batchStride+m] 
assert(np.allclose(np.dot(A_sys,y_sys), x_sys, atol=atol)) -def tridiagonal_system(m, batchCount, batchStride=None, seed=None): +def tridiagonal_system(m, batchCount, batchStride=None, seed=None, + dtype=np.float32): """ Create a tridiagonal system of a given size """ @@ -37,10 +38,10 @@ def tridiagonal_system(m, batchCount, batchStride=None, seed=None): if seed is not None: np.random.seed(seed) - dl = np.zeros(batchStride*batchCount).astype(np.float32) - d = np.zeros(batchStride*batchCount).astype(np.float32) - du = np.zeros(batchStride*batchCount).astype(np.float32) - x = np.zeros(batchStride*batchCount).astype(np.float32) + dl = np.zeros(batchStride*batchCount).astype(dtype) + d = np.zeros(batchStride*batchCount).astype(dtype) + du = np.zeros(batchStride*batchCount).astype(dtype) + x = np.zeros(batchStride*batchCount).astype(dtype) for ii in range(batchCount): dl[ii*batchStride+1:ii*batchStride+m] = np.random.rand(m-1) @@ -90,6 +91,32 @@ def test_cusparseSgtsv2StridedBatch(self): # For unstable algorithms, need to loosen atol check_batch_tridiagonal(dl,d,du,x, sln, m,batchCount,batchStride, 1e-4) + def test_cusparseDgtsv2StridedBatch(self): + m = 6 + batchCount = 9 + batchStride = 11 + + dl,d,du,x = tridiagonal_system(m, batchCount, batchStride, seed=23, + dtype=np.float64) + + dl_gpu = gpuarray.to_gpu(dl) + d_gpu = gpuarray.to_gpu(d) + du_gpu = gpuarray.to_gpu(du) + x_gpu = gpuarray.to_gpu(x) + + bufferSizeInBytes = cusparse.cusparseDgtsv2StridedBatch_bufferSizeExt( + self.cusparse_handle, m, dl_gpu.gpudata, d_gpu.gpudata, + du_gpu.gpudata, x_gpu.gpudata, batchCount, batchStride) + pBuffer = pycuda.driver.mem_alloc(bufferSizeInBytes) + + cusparse.cusparseDgtsv2StridedBatch(self.cusparse_handle, m, + dl_gpu.gpudata, d_gpu.gpudata, du_gpu.gpudata, x_gpu.gpudata, + batchCount, batchStride, pBuffer) + + sln = x_gpu.get() + # For unstable algorithms, need to loosen atol + check_batch_tridiagonal(dl,d,du,x, sln, m,batchCount,batchStride) + def 
test_cusparseSgtsvInterleavedBatch(self): m = 6 batchCount = 9 @@ -124,10 +151,47 @@ def test_cusparseSgtsvInterleavedBatch(self): sln = np.reshape(sln_int,(m,batchCount)).ravel('F') check_batch_tridiagonal(dl,d,du,x, sln, m,batchCount, atol=1e-6) + def test_cusparseDgtsvInterleavedBatch(self): + m = 6 + batchCount = 9 + + dl,d,du,x = tridiagonal_system(m, batchCount, seed=23, + dtype=np.float64) + + # Convert to interleaved format, by switching from row-major order + # (numpy default) to column-major + dl_int = np.reshape(dl,(batchCount,m)).ravel('F') + d_int = np.reshape(d, (batchCount, m)).ravel('F') + du_int = np.reshape(du,(batchCount,m)).ravel('F') + x_int = np.reshape(x,(batchCount,m)).ravel('F') + + for algo in range(3): + dl_int_gpu = gpuarray.to_gpu(dl_int) + d_int_gpu = gpuarray.to_gpu(d_int) + du_int_gpu = gpuarray.to_gpu(du_int) + x_int_gpu = gpuarray.to_gpu(x_int) + + pBufferSizeInBytes = cusparse.cusparseDgtsvInterleavedBatch_bufferSizeExt(self.cusparse_handle, + algo, m, dl_int_gpu.gpudata, d_int_gpu.gpudata, + du_int_gpu.gpudata, x_int_gpu.gpudata, batchCount) + + pBuffer = pycuda.driver.mem_alloc(pBufferSizeInBytes) + + cusparse.cusparseDgtsvInterleavedBatch(self.cusparse_handle, algo, m, + dl_int_gpu.gpudata, d_int_gpu.gpudata, du_int_gpu.gpudata, + x_int_gpu.gpudata, batchCount, pBuffer) + + sln_int = x_int_gpu.get() + # Convert back from interleaved format + sln = np.reshape(sln_int,(m,batchCount)).ravel('F') + check_batch_tridiagonal(dl,d,du,x, sln, m,batchCount) + def suite(): s = TestSuite() s.addTest(test_cusparse('test_cusparseSgtsv2StridedBatch')) + s.addTest(test_cusparse('test_cusparseDgtsv2StridedBatch')) s.addTest(test_cusparse('test_cusparseSgtsvInterleavedBatch')) + s.addTest(test_cusparse('test_cusparseDgtsvInterleavedBatch')) return s if __name__ == '__main__': From 7ca4deecc43ab05714586095d82ec8d1ab01fb70 Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Sat, 11 Apr 2020 18:55:36 -0700 Subject: [PATCH 08/13] Add cusparse to 
documentation --- docs/source/reference.rst | 1 + skcuda/cusparse.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/reference.rst b/docs/source/reference.rst index c8e59ccc..c3ef351a 100644 --- a/docs/source/reference.rst +++ b/docs/source/reference.rst @@ -13,6 +13,7 @@ Library Wrapper Routines reference_cusolver reference_cula reference_pcula + reference_cusparse High-Level Routines ------------------- diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index ecc408e0..3a67ff03 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -200,6 +200,7 @@ def cusparseDestroy(handle): _libcusparse.cusparseGetVersion.restype = int _libcusparse.cusparseGetVersion.argtypes = [ctypes.c_int, ctypes.c_void_p] +# XXX: Test def cusparseGetVersion(handle): """ Return CUSPARSE library version. @@ -227,6 +228,7 @@ def cusparseGetVersion(handle): _libcusparse.cusparseSetStream.restype = int _libcusparse.cusparseSetStream.argtypes = [ctypes.c_int, ctypes.c_int] +# XXX: Test. Check for cusparseGetStream def cusparseSetStream(handle, id): """ Sets the CUSPARSE stream in which kernels will run. 
@@ -706,4 +708,4 @@ def cusparseDgtsvInterleavedBatch(handle, algo, m, dl, d, du, x, batchCount, pBu handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, int(pBuffer)) cusparseCheckStatus(status) cusparseDgtsvInterleavedBatch.__doc__ = \ - gtsvInterleavedBatch_doc.substitute(precision='double precision', real='real') \ No newline at end of file + gtsvInterleavedBatch_doc.substitute(precision='double precision', real='real') From e77bc6961340cce86efd47a1886c58a2b639c9f1 Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Sun, 12 Apr 2020 13:55:17 -0700 Subject: [PATCH 09/13] cusparse Get/SetStream and GetVersion --- skcuda/cusparse.py | 47 +++++++++++++++++++++++++----------------- tests/test_cusparse.py | 13 ++++++++++++ 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index ecc408e0..64ba2a60 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -149,9 +149,7 @@ def cusparseCheckStatus(status): See Also -------- cusparseExceptions - """ - if status != 0: try: raise cusparseExceptions[status] @@ -171,9 +169,7 @@ def cusparseCreate(): ------- handle : int CUSPARSE library context. - """ - handle = ctypes.c_void_p() status = _libcusparse.cusparseCreate(ctypes.byref(handle)) cusparseCheckStatus(status) @@ -185,21 +181,18 @@ def cusparseDestroy(handle): """ Release CUSPARSE resources. - Releases hardware resources used by CUSPARSE + Releases hardware resources used by CUSPARSE. Parameters ---------- handle : int CUSPARSE library context. - """ - status = _libcusparse.cusparseDestroy(handle) cusparseCheckStatus(status) _libcusparse.cusparseGetVersion.restype = int -_libcusparse.cusparseGetVersion.argtypes = [ctypes.c_int, - ctypes.c_void_p] +_libcusparse.cusparseGetVersion.argtypes = [ctypes.c_void_p, ctypes.c_void_p] def cusparseGetVersion(handle): """ Return CUSPARSE library version. @@ -215,9 +208,7 @@ def cusparseGetVersion(handle): ------- version : int CUSPARSE library version number. 
- """ - version = ctypes.c_int() status = _libcusparse.cusparseGetVersion(handle, ctypes.byref(version)) @@ -225,8 +216,7 @@ def cusparseGetVersion(handle): return version.value _libcusparse.cusparseSetStream.restype = int -_libcusparse.cusparseSetStream.argtypes = [ctypes.c_int, - ctypes.c_int] +_libcusparse.cusparseSetStream.argtypes = [ctypes.c_void_p, ctypes.c_int] def cusparseSetStream(handle, id): """ Sets the CUSPARSE stream in which kernels will run. @@ -237,11 +227,30 @@ def cusparseSetStream(handle, id): CUSPARSE library context. id : int Stream ID. + """ + status = _libcusparse.cusparseSetStream(handle, id) + cusparseCheckStatus(status) +_libcusparse.cusparseGetStream.restype = int +_libcusparse.cusparseGetStream.argtypes = [ctypes.c_void_p, ctypes.c_void_p] +def cusparseGetStream(handle): """ + Gets the CUSPARSE stream in which kernels will run. - status = _libcusparse.cusparseSetStream(handle, id) + Parameters + ---------- + handle : int + CUSPARSE library context. + + Returns + ------- + handle : int + CUSPARSE library context. 
+ """ + id = ctypes.c_int() + status = _libcusparse.cusparseGetStream(handle, ctypes.byref(id)) cusparseCheckStatus(status) + return id.value _libcusparse.cusparseCreateMatDescr.restype = int _libcusparse.cusparseCreateMatDescr.argtypes = [cusparseMatDescr] @@ -395,7 +404,7 @@ def cusparseSdense2csr(handle, m, n, descrA, A, lda, Parameters ---------- - handle : ctypes.c_void_p + handle : int cuSPARSE context m : int Size of the linear system (must be >= 3) @@ -486,7 +495,7 @@ def cusparseDgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x, batchCount Parameters ---------- - handle : ctypes.c_void_p + handle : int cuSPARSE context m : int Size of the linear system (must be >= 3) @@ -555,7 +564,7 @@ def cusparseDgtsv2StridedBatch(handle, m, dl, d, du, x, batchCount, batchStride, Parameters ---------- - handle : ctypes.c_void_p + handle : int cuSPARSE context algo : int algo = 0: cuThomas (unstable algorithm); algo = 1: LU with pivoting @@ -653,7 +662,7 @@ def cusparseDgtsvInterleavedBatch_bufferSizeExt(handle, algo, m, dl, d, du, x, b Parameters ---------- - handle : ctypes.c_void_p + handle : int cuSPARSE context algo : int algo = 0: cuThomas (unstable algorithm); algo = 1: LU with pivoting @@ -706,4 +715,4 @@ def cusparseDgtsvInterleavedBatch(handle, algo, m, dl, d, du, x, batchCount, pBu handle, algo, m, int(dl), int(d), int(du), int(x), batchCount, int(pBuffer)) cusparseCheckStatus(status) cusparseDgtsvInterleavedBatch.__doc__ = \ - gtsvInterleavedBatch_doc.substitute(precision='double precision', real='real') \ No newline at end of file + gtsvInterleavedBatch_doc.substitute(precision='double precision', real='real') diff --git a/tests/test_cusparse.py b/tests/test_cusparse.py index caac2c86..469bfc68 100644 --- a/tests/test_cusparse.py +++ b/tests/test_cusparse.py @@ -185,6 +185,16 @@ def test_cusparseDgtsvInterleavedBatch(self): # Convert back from interleaved format sln = np.reshape(sln_int,(m,batchCount)).ravel('F') check_batch_tridiagonal(dl,d,du,x, 
sln, m,batchCount) + + def test_cusparseGetSetStream(self): + initial_stream = cusparse.cusparseGetStream(self.cusparse_handle) + # Switch stream + cusparse.cusparseSetStream(self.cusparse_handle, initial_stream+1) + final_stream = cusparse.cusparseGetStream(self.cusparse_handle) + assert(final_stream == initial_stream+1) + + def test_cusparseGetVersion(self): + cusparse.cusparseGetVersion(self.cusparse_handle) def suite(): s = TestSuite() @@ -192,6 +202,9 @@ def suite(): s.addTest(test_cusparse('test_cusparseDgtsv2StridedBatch')) s.addTest(test_cusparse('test_cusparseSgtsvInterleavedBatch')) s.addTest(test_cusparse('test_cusparseDgtsvInterleavedBatch')) + + s.addTest(test_cusparse('test_cusparseGetSetStream')) + s.addTest(test_cusparse('test_cusparseGetVersion')) return s if __name__ == '__main__': From f2c76dc824d6bb5ec5e1ed2975dddac31e07918e Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Sun, 12 Apr 2020 14:08:52 -0700 Subject: [PATCH 10/13] Add missing cusparse documentation file --- docs/source/reference_cusparse.rst | 43 ++++++++++++++++++++++++++++++ skcuda/cusparse.py | 12 --------- 2 files changed, 43 insertions(+), 12 deletions(-) create mode 100644 docs/source/reference_cusparse.rst diff --git a/docs/source/reference_cusparse.rst b/docs/source/reference_cusparse.rst new file mode 100644 index 00000000..e14a5ab3 --- /dev/null +++ b/docs/source/reference_cusparse.rst @@ -0,0 +1,43 @@ +.. -*- rst -*- + +.. currentmodule:: skcuda.cusparse + +CUSPARSE Routines +================= + +Helper Routines +--------------- +.. autosummary:: + :toctree: generated/ + :nosignatures: + + cusparseCreate + cusparseDestroy + cusparseGetVersion + cusparseSetStream + cusparseGetStream + +Wrapper Routines +---------------- + +Single Precision Routines +^^^^^^^^^^^^^^^^^^^^^^^^^ +..
autosummary:: + :toctree: generated/ + :nosignatures: + + cusparseSgtsv2StridedBatch_bufferSizeExt + cusparseSgtsv2StridedBatch + cusparseSgtsvInterleavedBatch_bufferSizeExt + cusparseSgtsvInterleavedBatch + +Double Precision Routines +^^^^^^^^^^^^^^^^^^^^^^^^^ +.. autosummary:: + :toctree: generated/ + :nosignatures: + + cusparseDgtsv2StridedBatch_bufferSizeExt + cusparseDgtsv2StridedBatch + cusparseDgtsvInterleavedBatch_bufferSizeExt + cusparseDgtsvInterleavedBatch diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index 38306c17..64ba2a60 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -192,13 +192,7 @@ def cusparseDestroy(handle): cusparseCheckStatus(status) _libcusparse.cusparseGetVersion.restype = int -<<<<<<< HEAD _libcusparse.cusparseGetVersion.argtypes = [ctypes.c_void_p, ctypes.c_void_p] -======= -_libcusparse.cusparseGetVersion.argtypes = [ctypes.c_int, - ctypes.c_void_p] -# XXX: Test ->>>>>>> 7ca4deecc43ab05714586095d82ec8d1ab01fb70 def cusparseGetVersion(handle): """ Return CUSPARSE library version. @@ -222,13 +216,7 @@ def cusparseGetVersion(handle): return version.value _libcusparse.cusparseSetStream.restype = int -<<<<<<< HEAD _libcusparse.cusparseSetStream.argtypes = [ctypes.c_void_p, ctypes.c_int] -======= -_libcusparse.cusparseSetStream.argtypes = [ctypes.c_int, - ctypes.c_int] -# XXX: Test. Check for cusparseGetStream ->>>>>>> 7ca4deecc43ab05714586095d82ec8d1ab01fb70 def cusparseSetStream(handle, id): """ Sets the CUSPARSE stream in which kernels will run. 
From c5f779f97f2ed602e87ddcb7f8905aab58b78dcf Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Sun, 12 Apr 2020 14:21:29 -0700 Subject: [PATCH 11/13] Remove unfinished cusparse methods --- skcuda/cusparse.py | 146 --------------------------------------------- 1 file changed, 146 deletions(-) diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index 64ba2a60..164155f8 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -252,152 +252,6 @@ def cusparseGetStream(handle): cusparseCheckStatus(status) return id.value -_libcusparse.cusparseCreateMatDescr.restype = int -_libcusparse.cusparseCreateMatDescr.argtypes = [cusparseMatDescr] -def cusparseCreateMatDescr(): - """ - Initialize a sparse matrix descriptor. - - Initializes the `MatrixType` and `IndexBase` fields of the matrix - descriptor to the default values `CUSPARSE_MATRIX_TYPE_GENERAL` - and `CUSPARSE_INDEX_BASE_ZERO`. - - Returns - ------- - desc : cusparseMatDescr - Matrix descriptor. - - """ - - desc = cusparseMatrixDesc() - status = _libcusparse.cusparseCreateMatDescr(ctypes.byref(desc)) - cusparseCheckStatus(status) - return desc - -_libcusparse.cusparseDestroyMatDescr.restype = int -_libcusparse.cusparseDestroyMatDescr.argtypes = [ctypes.c_int] -def cusparseDestroyMatDescr(desc): - """ - Releases the memory allocated for the matrix descriptor. - - Parameters - ---------- - desc : cusparseMatDescr - Matrix descriptor. - - """ - - status = _libcusparse.cusparseDestroyMatDescr(desc) - cusparseCheckStatus(status) - -_libcusparse.cusparseSetMatType.restype = int -_libcusparse.cusparseSetMatType.argtypes = [cusparseMatDescr, - ctypes.c_int] -def cusparseSetMatType(desc, type): - """ - Sets the matrix type of the specified matrix. - - Parameters - ---------- - desc : cusparseMatDescr - Matrix descriptor. - type : int - Matrix type. 
- - """ - - status = _libcusparse.cusparseSetMatType(desc, type) - cusparseCheckStatus(status) - -_libcusparse.cusparseGetMatType.restype = int -_libcusparse.cusparseGetMatType.argtypes = [cusparseMatDescr] -def cusparseGetMatType(desc): - """ - Gets the matrix type of the specified matrix. - - Parameters - ---------- - desc : cusparseMatDescr - Matrix descriptor. - - Returns - ------- - type : int - Matrix type. - - """ - - return _libcusparse.cusparseGetMatType(desc) - -# Format conversion functions: -_libcusparse.cusparseSnnz.restype = int -_libcusparse.cusparseSnnz.argtypes = [ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - cusparseMatDescr, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_void_p, - ctypes.c_void_p] -def cusparseSnnz(handle, dirA, m, n, descrA, A, lda, - nnzPerRowColumn, nnzTotalDevHostPtr): - """ - Compute number of non-zero elements per row, column, or dense matrix. - - Parameters - ---------- - handle : int - CUSPARSE library context. - dirA : int - Data direction of elements. - m : int - Rows in A. - n : int - Columns in A. - descrA : cusparseMatDescr - Matrix descriptor. - A : pycuda.gpuarray.GPUArray - Dense matrix of dimensions (lda, n). - lda : int - Leading dimension of A. - - Returns - ------- - nnzPerRowColumn : pycuda.gpuarray.GPUArray - Array of length m or n containing the number of - non-zero elements per row or column, respectively. - nnzTotalDevHostPtr : pycuda.gpuarray.GPUArray - Total number of non-zero elements in device or host memory. 
- - """ - - # Unfinished: - nnzPerRowColumn = gpuarray.empty() - nnzTotalDevHostPtr = gpuarray.empty() - - status = _libcusparse.cusparseSnnz(handle, dirA, m, n, - descrA, int(A), lda, - int(nnzPerRowColumn), int(nnzTotalDevHostPtr)) - cusparseCheckStatus(status) - return nnzPerVector, nnzHost - -_libcusparse.cusparseSdense2csr.restype = int -_libcusparse.cusparseSdense2csr.argtypes = [ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - cusparseMatDescr, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p] -def cusparseSdense2csr(handle, m, n, descrA, A, lda, - nnzPerRow, csrValA, csrRowPtrA, csrColIndA): - # Unfinished - pass - gtsv2StridedBatch_bufferSizeExt_doc = Template( """ Calculate size of work buffer used by cusparsegtsv2StridedBatch. From 263ff24423f298ef726287b6a3b398d7e7bddf88 Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Sun, 12 Apr 2020 14:27:47 -0700 Subject: [PATCH 12/13] Remove unused imports in cusparse --- skcuda/cusparse.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index 164155f8..b700401e 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -8,13 +8,10 @@ from __future__ import absolute_import -import ctypes.util +import ctypes import platform from string import Template import sys -import warnings - -import numpy as np from . 
import cuda From 13a5fdd10252df792a324dfa33139f4e22d30adb Mon Sep 17 00:00:00 2001 From: Ryan Gutenkunst Date: Sat, 2 May 2020 15:22:17 -0700 Subject: [PATCH 13/13] Remove more unused code in cusparse --- skcuda/cusparse.py | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/skcuda/cusparse.py b/skcuda/cusparse.py index b700401e..a8442b6e 100644 --- a/skcuda/cusparse.py +++ b/skcuda/cusparse.py @@ -96,41 +96,6 @@ class cusparseStatusMatrixTypeNotSupported(cusparseError): 8: cusparseStatusMatrixTypeNotSupported, } -# Matrix types: -CUSPARSE_MATRIX_TYPE_GENERAL = 0 -CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1 -CUSPARSE_MATRIX_TYPE_HERMITIAN = 2 -CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 - -CUSPARSE_FILL_MODE_LOWER = 0 -CUSPARSE_FILL_MODE_UPPER = 1 - -# Whether or not a matrix' diagonal entries are unity: -CUSPARSE_DIAG_TYPE_NON_UNIT = 0 -CUSPARSE_DIAG_TYPE_UNIT = 1 - -# Matrix index bases: -CUSPARSE_INDEX_BASE_ZERO = 0 -CUSPARSE_INDEX_BASE_ONE = 1 - -# Operation types: -CUSPARSE_OPERATION_NON_TRANSPOSE = 0 -CUSPARSE_OPERATION_TRANSPOSE = 1 -CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 - -# Whether or not to parse elements of a dense matrix row or column-wise. -CUSPARSE_DIRECTION_ROW = 0 -CUSPARSE_DIRECTION_COLUMN = 1 - -# Helper functions: -class cusparseMatDescr(ctypes.Structure): - _fields_ = [ - ('MatrixType', ctypes.c_int), - ('FillMode', ctypes.c_int), - ('DiagType', ctypes.c_int), - ('IndexBase', ctypes.c_int) - ] - def cusparseCheckStatus(status): """ Raise CUSPARSE exception