@@ -5,6 +5,29 @@ import LinearAlgebra: mul!
5
5
using SparseArrays
6
6
import SparseArrays: AdjOrTransStridedOrTriangularMatrix, getcolptr
7
7
8
+ # * Threading utilities
9
+ struct RangeIterator
10
+ k:: Int
11
+ d:: Int
12
+ r:: Int
13
+ end
14
+
15
+ """
16
+ RangeIterator(n::Int,k::Int)
17
+
18
+ Returns an iterator splitting the range `1:n` into `min(k,n)` parts of (almost) equal size.
19
+ """
20
+ RangeIterator (n:: Int ,k:: Int ) = RangeIterator (min (n,k),divrem (n,k)... )
21
+ Base. length (it:: RangeIterator ) = it. k
22
+ # Base.iterate(it::RangeIterator, i::Int=1) = i>it.k ? nothing : (((i-1)*it.d+min(i-1,it.r)+1):(i*it.d+min(i,it.r)), i+1)
23
+ function Base. iterate (it:: RangeIterator , i:: Int = 1 )
24
+ i> it. k && return nothing
25
+ e = i* it. d + min (i,it. r)
26
+ s = e - it. d + (i> it. r)
27
+ (s: e,i+ 1 )
28
+ end
29
+
30
+
8
31
# * ThreadedSparseMatrixCSC
9
32
10
33
"""
@@ -39,11 +62,13 @@ function mul!(C::StridedVecOrMat, A::ThreadedSparseMatrixCSC, B::Union{StridedVe
39
62
if β != 1
40
63
β != 0 ? rmul! (C, β) : fill! (C, zero (eltype (C)))
41
64
end
42
- Threads. @threads for k = 1 : size (C, 2 )
43
- @inbounds for col = 1 : size (A, 2 )
44
- αxj = B[col,k] * α
45
- for j = getcolptr (A)[col]: (getcolptr (A)[col + 1 ] - 1 )
46
- C[rv[j], k] += nzv[j]* αxj
65
+ @sync for r in RangeIterator (size (C,2 ), Threads. nthreads ())
66
+ Threads. @spawn for k in r
67
+ @inbounds for col = 1 : size (A, 2 )
68
+ αxj = B[col,k] * α
69
+ for j = getcolptr (A)[col]: (getcolptr (A)[col + 1 ] - 1 )
70
+ C[rv[j], k] += nzv[j]* αxj
71
+ end
47
72
end
48
73
end
49
74
end
@@ -61,13 +86,15 @@ function mul!(C::StridedVecOrMat, adjA::Adjoint{<:Any,<:ThreadedSparseMatrixCSC}
61
86
if β != 1
62
87
β != 0 ? rmul! (C, β) : fill! (C, zero (eltype (C)))
63
88
end
64
- Threads. @threads for k = 1 : size (C, 2 )
65
- @inbounds for col = 1 : size (A, 2 )
66
- tmp = zero (eltype (C))
67
- for j = getcolptr (A)[col]: (getcolptr (A)[col + 1 ] - 1 )
68
- tmp += adjoint (nzv[j])* B[rv[j],k]
89
+ @sync for r in RangeIterator (size (C,2 ), Threads. nthreads ())
90
+ Threads. @spawn for k in r
91
+ @inbounds for col = 1 : size (A, 2 )
92
+ tmp = zero (eltype (C))
93
+ for j = getcolptr (A)[col]: (getcolptr (A)[col + 1 ] - 1 )
94
+ tmp += adjoint (nzv[j])* B[rv[j],k]
95
+ end
96
+ C[col,k] += tmp * α
69
97
end
70
- C[col,k] += tmp * α
71
98
end
72
99
end
73
100
C
@@ -83,13 +110,15 @@ function mul!(C::StridedVecOrMat, transA::Transpose{<:Any,<:ThreadedSparseMatrix
83
110
if β != 1
84
111
β != 0 ? rmul! (C, β) : fill! (C, zero (eltype (C)))
85
112
end
86
- Threads. @threads for k = 1 : size (C, 2 )
87
- @inbounds for col = 1 : size (A, 2 )
88
- tmp = zero (eltype (C))
89
- for j = getcolptr (A)[col]: (getcolptr (A)[col + 1 ] - 1 )
90
- tmp += transpose (nzv[j])* B[rv[j],k]
113
+ @sync for r in RangeIterator (size (C,2 ), Threads. nthreads ())
114
+ Threads. @spawn for k in r
115
+ @inbounds for col = 1 : size (A, 2 )
116
+ tmp = zero (eltype (C))
117
+ for j = getcolptr (A)[col]: (getcolptr (A)[col + 1 ] - 1 )
118
+ tmp += transpose (nzv[j])* B[rv[j],k]
119
+ end
120
+ C[col,k] += tmp * α
91
121
end
92
- C[col,k] += tmp * α
93
122
end
94
123
end
95
124
C
@@ -105,21 +134,17 @@ function mul!(C::StridedVecOrMat, X::AdjOrTransStridedOrTriangularMatrix, A::Thr
105
134
if β != 1
106
135
β != 0 ? rmul! (C, β) : fill! (C, zero (eltype (C)))
107
136
end
108
- # Threads.@threads for col = 1:size(A, 2)
109
- # @inbounds for multivec_row=1:mX, k=getcolptr(A)[col]:(getcolptr(A)[col+1]-1)
110
- # C[multivec_row, col] += α * X[multivec_row, rv[k]] * nzv[k] # perhaps suboptimal position of α?
111
- # end
112
- # end
113
- Threads. @threads for col = 1 : size (A, 2 )
114
- @inbounds for k= getcolptr (A)[col]: (getcolptr (A)[col+ 1 ]- 1 )
115
- j = rv[k]
116
- αv = nzv[k]* α
117
- for multivec_row= 1 : mX
118
- C[multivec_row, col] += X[multivec_row, j] * αv
137
+ @sync for r in RangeIterator (size (A,2 ), Threads. nthreads ())
138
+ Threads. @spawn for col in r
139
+ @inbounds for k= getcolptr (A)[col]: (getcolptr (A)[col+ 1 ]- 1 )
140
+ j = rv[k]
141
+ αv = nzv[k]* α
142
+ for multivec_row= 1 : mX
143
+ C[multivec_row, col] += X[multivec_row, j] * αv
144
+ end
119
145
end
120
146
end
121
147
end
122
-
123
148
C
124
149
end
125
150
0 commit comments