Skip to content

Commit 35d0474

Browse files
committed
Minor doc fixes
1 parent 0c998ae commit 35d0474

File tree

2 files changed

+31
-16
lines changed

2 files changed

+31
-16
lines changed

docs/src/api.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
```@docs
66
@turbo
7-
@_avx
7+
@tturbo
88
```
99

1010
## `map`-like constructs
@@ -24,3 +24,10 @@ vmapntt!
2424
vfilter
2525
LoopVectorization.vfilter!
2626
```
27+
28+
## `reduce`-like constructs
29+
```@docs
30+
vreduce
31+
vmapreduce
32+
```
33+

docs/src/examples/multithreading.md

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ relatively primitive arithmetic operations (e.g. `+`, `/`, or `log`), and not, f
3232

3333
I'll make comparisons with OpenMP through the rest of this, starting with a simple dot product to focus on threading overhead:
3434
```julia
35-
function dotavxt(a::AbstractArray{T}, b::AbstractArray{T}) where {T <: Real}
35+
function dot_tturbo(a::AbstractArray{T}, b::AbstractArray{T}) where {T <: Real}
3636
s = zero(T)
3737
@tturbo for i ∈ eachindex(a,b)
3838
s += a[i] * b[i]
@@ -70,19 +70,19 @@ Trying out one size to give a perspective on scale:
7070
```julia
7171
julia> N = 10_000; x = rand(N); y = rand(N);
7272

73-
julia> @btime dot($x, $y)
73+
julia> @btime dot($x, $y) # LinearAlgebra
7474
1.114 μs (0 allocations: 0 bytes)
7575
2480.296446711209
7676

77-
julia> @btime dotavx($x, $y)
77+
julia> @btime dot_turbo($x, $y)
7878
761.621 ns (0 allocations: 0 bytes)
7979
2480.296446711209
8080

81-
julia> @btime dotavxt($x, $y)
81+
julia> @btime dot_tturbo($x, $y)
8282
622.723 ns (0 allocations: 0 bytes)
8383
2480.296446711209
8484

85-
julia> @btime dotbaseline($x, $y)
85+
julia> @btime dot_baseline($x, $y)
8686
1.294 μs (0 allocations: 0 bytes)
8787
2480.2964467112097
8888

@@ -96,11 +96,11 @@ All these times are fairly fast; `wait(Threads.@spawn 1+1)` will typically take
9696

9797
Now let's look at a more complex example:
9898
```julia
99-
function dotavxt(ca::AbstractVector{Complex{T}}, cb::AbstractVector{Complex{T}}) where {T}
99+
function dot_tturbo(ca::AbstractVector{Complex{T}}, cb::AbstractVector{Complex{T}}) where {T}
100100
a = reinterpret(reshape, T, ca)
101101
b = reinterpret(reshape, T, cb)
102102
re = zero(T); im = zero(T)
103-
@turbo for i ∈ axes(a,2) # adjoint(a[i]) * b[i]
103+
@tturbo for i ∈ axes(a,2) # adjoint(a[i]) * b[i]
104104
re += a[1,i] * b[1,i] + a[2,i] * b[2,i]
105105
im += a[1,i] * b[2,i] - a[2,i] * b[1,i]
106106
end
@@ -139,15 +139,23 @@ and as we have an array of structs rather than structs of arrays, we need additi
139139

140140
If we take this further to the three-argument dot product, which isn't implemented in BLAS, `@tturbo` now holds a substantial advantage over the competition:
141141
```julia
142-
function dotavxt(ca::AbstractVector{Complex{T}}, cb::AbstractVector{Complex{T}}) where {T}
143-
a = reinterpret(reshape, T, ca)
144-
b = reinterpret(reshape, T, cb)
145-
re = zero(T); im = zero(T)
146-
@tturbo for i ∈ axes(a,2) # adjoint(a[i]) * b[i]
147-
re += a[1,i] * b[1,i] + a[2,i] * b[2,i]
148-
im += a[1,i] * b[2,i] - a[2,i] * b[1,i]
142+
function dot3(x::AbstractVector{Complex{T}}, A::AbstractMatrix{Complex{T}}, y::AbstractVector{Complex{T}}) where {T}
143+
xr = reinterpret(reshape, T, x);
144+
yr = reinterpret(reshape, T, y);
145+
Ar = reinterpret(reshape, T, A);
146+
sre = zero(T)
147+
sim = zero(T)
148+
@tturbo for n in axes(Ar,3)
149+
tre = zero(T)
150+
tim = zero(T)
151+
for m in axes(Ar,2)
152+
tre += xr[1,m] * Ar[1,m,n] + xr[2,m] * Ar[2,m,n]
153+
tim += xr[1,m] * Ar[2,m,n] - xr[2,m] * Ar[1,m,n]
154+
end
155+
sre += tre * yr[1,n] - tim * yr[2,n]
156+
sim += tre * yr[2,n] + tim * yr[1,n]
149157
end
150-
Complex(re, im)
158+
Complex(sre, sim)
151159
end
152160
```
153161

0 commit comments

Comments
 (0)