Skip to content

Commit c8ebb24

Browse files
committed
Tests passed locally on Julia master (will have to rerun CI once VectorizationBase and SLEEFPirates are released).
1 parent 937fffe commit c8ebb24

19 files changed

+577
-429
lines changed

.github/workflows/ci.yml

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
name: CI
2+
on:
3+
pull_request:
4+
branches:
5+
- master
6+
push:
7+
branches:
8+
- master
9+
tags: '*'
10+
jobs:
11+
test:
12+
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
13+
runs-on: ${{ matrix.os }}
14+
strategy:
15+
fail-fast: false
16+
matrix:
17+
version:
18+
- '1.4'
19+
- '1'
20+
- 'nightly'
21+
os:
22+
- ubuntu-latest
23+
arch:
24+
- x64
25+
steps:
26+
- uses: actions/checkout@v2
27+
- uses: julia-actions/setup-julia@v1
28+
with:
29+
version: ${{ matrix.version }}
30+
arch: ${{ matrix.arch }}
31+
- uses: actions/cache@v1
32+
env:
33+
cache-name: cache-artifacts
34+
with:
35+
path: ~/.julia/artifacts
36+
key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
37+
restore-keys: |
38+
${{ runner.os }}-test-${{ env.cache-name }}-
39+
${{ runner.os }}-test-
40+
${{ runner.os }}-
41+
- uses: julia-actions/julia-buildpkg@v1
42+
- uses: julia-actions/julia-runtest@v1
43+
- uses: julia-actions/julia-processcoverage@v1
44+
- uses: codecov/codecov-action@v1
45+
with:
46+
file: lcov.info
47+
docs:
48+
name: Documentation
49+
runs-on: ubuntu-latest
50+
steps:
51+
- uses: actions/checkout@v2
52+
- uses: julia-actions/setup-julia@v1
53+
with:
54+
version: '1'
55+
- run: |
56+
julia --project=docs -e '
57+
using Pkg
58+
Pkg.develop(PackageSpec(path=pwd()))
59+
Pkg.instantiate()'
60+
- run: |
61+
julia --project=docs -e '
62+
using Documenter: doctest
63+
using LoopVectorization
64+
doctest(LoopVectorization)'
65+
- run: julia --project=docs docs/make.jl
66+
env:
67+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
68+
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}

.travis.yml

Lines changed: 0 additions & 31 deletions
This file was deleted.

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.8.26"
4+
version = "0.9.0"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -14,14 +14,14 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
1414
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1515

1616
[compat]
17-
ArrayInterface = "2.13.8"
17+
ArrayInterface = "2.14"
1818
DocStringExtensions = "0.8"
1919
IfElse = "0"
2020
OffsetArrays = "1"
2121
SLEEFPirates = "0.6"
2222
UnPack = "0,1"
2323
VectorizationBase = "0.13"
24-
julia = "1.3"
24+
julia = "1.4"
2525

2626
[extras]
2727
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

benchmark/looptests.jl

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,24 @@
1-
using LoopVectorization, LinearAlgebra, OffsetArrays
1+
using LoopVectorization, LinearAlgebra, OffsetArrays, ArrayInterface
22
BLAS.set_num_threads(1)
33

4-
using LoopVectorization.VectorizationBase: StaticUnitRange
5-
struct SizedOffsetMatrix{T,LR,UR,LC,RC} <: DenseMatrix{T}
4+
# TODO: remove this once this PR merges: https://github.com/JuliaArrays/OffsetArrays.jl/pull/170
5+
@inline Base.unsafe_convert(::Type{Ptr{T}}, A::OffsetArray{T}) where {T} = pointer(parent(A))
6+
7+
struct SizedOffsetMatrix{T,LR,UR,LC,UC} <: DenseMatrix{T}
68
data::Matrix{T}
79
end
8-
Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (StaticUnitRange{LR,UR}(),StaticUnitRange{LC,UC}())
10+
Base.size(::SizedOffsetMatrix{<:Any,LR,UR,LC,UC}) where {LR,UR,LC,UC} = (UR-LR+1,UC-LC+1)
11+
Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (Static{LR}():Static{UR}(),Static{LC}():Static{UC}())
912
Base.parent(A::SizedOffsetMatrix) = A.data
10-
@generated function LoopVectorization.stridedpointer(A::SizedOffsetMatrix{T,LR,UR,LC,RC}) where {T,LR,UR,LC,RC}
11-
quote
12-
$(Expr(:meta,:inline))
13-
LoopVectorization.OffsetStridedPointer(
14-
LoopVectorization.StaticStridedPointer{$T,Tuple{1,$(UR-LR+1)}}(pointer(parent(A))),
15-
($(LR-1), $(LC-1))
16-
)
17-
end
18-
end
19-
Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i-1,j-1))
20-
Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (StaticUnitRange{LR,UR}(),StaticUnitRange{LC,UC}())
21-
Base.size(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (1 + UR-LR, 1 + UC-LC)
22-
Base.unsafe_convert(::Type{Ptr{Float64}}, A::SizedOffsetMatrix) = Base.unsafe_convert(Ptr{Float64}, A.data)
13+
Base.unsafe_convert(::Type{Ptr{T}}, A::SizedOffsetMatrix{T}) where {T} = pointer(A.data)
14+
ArrayInterface.contiguous_axis(::Type{<:SizedOffsetMatrix}) = ArrayInterface.Contiguous{1}()
15+
ArrayInterface.contiguous_batch_size(::Type{<:SizedOffsetMatrix}) = ArrayInterface.ContiguousBatch{0}()
16+
ArrayInterface.stride_rank(::Type{<:SizedOffsetMatrix}) = ArrayInterface.StrideRank{(1,2)}()
17+
function ArrayInterface.strides(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC}
18+
(Static{1}(), (Static{UR}() - Static{LR}() + Static{1}()))
19+
end
20+
ArrayInterface.offsets(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (Static{LR}(), Static{LC}())
21+
Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i,j))
2322

2423

2524
function jgemm!(𝐂, 𝐀, 𝐁)

src/LoopVectorization.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ import LinearAlgebra # for check_args
3333
using Base.FastMath: add_fast, sub_fast, mul_fast, div_fast
3434

3535
using ArrayInterface
36-
using ArrayInterface: OptionallyStaticUnitRange, Zero
36+
using ArrayInterface: OptionallyStaticUnitRange, Zero, One
3737
const Static = ArrayInterface.StaticInt
3838

3939

@@ -96,7 +96,7 @@ loop-reordering so as to improve performance:
9696
"""
9797
LoopVectorization
9898

99-
# include("precompile.jl")
100-
# _precompile_()
99+
include("precompile.jl")
100+
_precompile_()
101101

102102
end # module

src/broadcast.jl

Lines changed: 62 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,41 @@
11

22
@inline stridedpointer_for_broadcast(A) = stridedpointer_for_broadcast(ArrayInterface.size(A), stridedpointer(A))
33
@inline stridedpointer_for_broadcast(s, ptr) = ptr
4-
function stridedpointer_for_broadcast(s, ptr::VectorizationBase.AbstractStridedPointer)
5-
# FIXME: this is unsafe for AbstractStridedPointers
6-
throw("Broadcasting not currently supported for arrays where typeof(stridedpointer(A)) === $(typeof(ptr))")
7-
end
8-
@generated function stridedpointer_for_broadcast(s::Tuple{Vararg{Any,N}}, ptr::StridedPointer{T,N,C,B,R,X,O}) where {T,N,C,B,R,X,O}
4+
# function stridedpointer_for_broadcast(s, ptr::VectorizationBase.AbstractStridedPointer)
5+
# # FIXME: this is unsafe for AbstractStridedPointers
6+
# throw("Broadcasting not currently supported for arrays where typeof(stridedpointer(A)) === $(typeof(ptr))")
7+
# end
8+
function stridedpointer_for_broadcast_quote(typ, N, S, X)
99
q = Expr(:block, Expr(:meta,:inline), :(strd = ptr.strd))
1010
strd_tup = Expr(:tuple)
1111
for n ∈ 1:N
12-
s_type = s.parameters[n]
12+
s_type = S[n]
1313
if s_type <: Static
1414
if s_type === Static{1}
1515
push!(strd_tup.args, Expr(:call, lv(:Zero)))
1616
else
1717
push!(strd_tup.args, :(strd[$n]))
1818
end
1919
else
20-
Xₙ_type = X.parameters[n]
20+
Xₙ_type = X[n]
2121
if Xₙ_type <: Static # FIXME; what to do here? Dynamic dispatch?
2222
push!(strd_tup.args, :(strd[$n]))
2323
else
24-
push!(strd_tup.args, :(Base.ifelse(isone(s[$n]), one($Xₙ_type), strd[$n])))
24+
push!(strd_tup.args, :(Base.ifelse(isone(s[$n]), zero($Xₙ_type), strd[$n])))
2525
end
2626
end
2727
end
28-
push!(q.args, :(@inbounds StridedPointer{$T,$N,$C,$B,$R}(ptr.p, $strd_tup, ptr.offsets)))
28+
push!(q.args, :(@inbounds $typ(ptr.p, $strd_tup, ptr.offsets)))
2929
q
3030
end
31+
@generated function stridedpointer_for_broadcast(s::Tuple{Vararg{Any,N}}, ptr::StridedPointer{T,N,C,B,R,X,O}) where {T,N,C,B,R,X,O}
32+
typ = Expr(:curly, :StridedPointer, T, N, C, B, R)
33+
stridedpointer_for_broadcast_quote(typ, N, s.parameters, X.parameters)
34+
end
35+
@generated function stridedpointer_for_broadcast(s::Tuple{Vararg{Any,N}}, ptr::VectorizationBase.StridedBitPointer{N,C,B,R,X,O}) where {N,C,B,R,X,O}
36+
typ = Expr(:curly, :StridedBitPointer, N, C, B, R)
37+
stridedpointer_for_broadcast_quote(typ, N, s.parameters, X.parameters)
38+
end
3139

3240
struct Product{A,B}
3341
a::A
@@ -132,8 +140,9 @@ function add_broadcast!(
132140
push!(ls.preamble_zeros, (identifier(setC), IntOrFloat))
133141
setC.reduced_children = kvec
134142
# compute Cₘₙ += Aₘₖ * Bₖₙ
143+
instrsym = Base.libllvm_version < v"11.0.0" ? :vfmadd231 : :vfmadd
135144
reductop = Operation(
136-
ls, mC, elementbytes, :vfmadd231, compute, reductdeps, kvec, Operation[loadA, loadB, setC]
145+
ls, mC, elementbytes, instrsym, compute, reductdeps, kvec, Operation[loadA, loadB, setC]
137146
)
138147
reductop = pushop!(ls, reductop, mC)
139148
reductfinal = Operation(
@@ -149,17 +158,18 @@ Base.@propagate_inbounds Base.getindex(A::LowDimArray, i...) = getindex(A.data,
149158
@inline Base.size(A::LowDimArray) = Base.size(A.data)
150159
@inline Base.size(A::LowDimArray, i) = Base.size(A.data, i)
151160
@inline Base.strides(A::LowDimArray) = strides(A.data)
161+
@inline ArrayInterface.parent_type(::Type{LowDimArray{D,T,N,A}}) where {T,D,N,A} = A
152162
@inline ArrayInterface.strides(A::LowDimArray) = ArrayInterface.strides(A.data)
153163
@generated function ArrayInterface.size(A::LowDimArray{D,T,N}) where {D,T,N}
154164
t = Expr(:tuple)
155165
for n ∈ 1:N
156-
if D[n]
166+
if n > length(D) || D[n]
157167
push!(t.args, Expr(:ref, :s, n))
158168
else
159169
push!(t.args, Expr(:call, Expr(:curly, lv(:Static), 1)))
160170
end
161171
end
162-
Expr(:block, Expr(:meta,:inline), :(s = size(A)), t)
172+
Expr(:block, Expr(:meta,:inline), :(s = ArrayInterface.size(parent(A))), t)
163173
end
164174
Base.parent(A::LowDimArray) = A.data
165175
Base.unsafe_convert(::Type{Ptr{T}}, A::LowDimArray{D,T}) where {D,T} = pointer(A.data)
@@ -168,6 +178,35 @@ ArrayInterface.contiguous_batch_size(A::LowDimArray) = ArrayInterface.contiguous
168178
ArrayInterface.stride_rank(A::LowDimArray) = ArrayInterface.stride_rank(A.data)
169179
ArrayInterface.offsets(A::LowDimArray) = ArrayInterface.offsets(A.data)
170180

181+
@inline function stridedpointer_for_broadcast(A::LowDimArray{D}) where {D}
182+
_stridedpointer(stridedpointer_for_broadcast(parent(A)), Val{D}())
183+
end
184+
185+
@generated function _stridedpointer(p::StridedPointer{T,N,C,B,R}, ::Val{D}) where {T,N,C,B,R,D}
186+
lenD = length(D)
187+
strd = Expr(:tuple)
188+
offsets = Expr(:tuple)
189+
Rtup = Expr(:tuple)
190+
Cnew = -1
191+
Bnew = -1
192+
Nnew = 0
193+
for n ∈ 1:N
194+
((n ≤ lenD) && (!D[n])) && continue
195+
if n == C
196+
Cnew = n
197+
end
198+
if n == B
199+
Bnew = n
200+
end
201+
push!(Rtup.args, R[n])
202+
push!(offsets.args, Expr(:ref, :offs, n))
203+
push!(strd.args, Expr(:ref, :strd, n))
204+
Nnew += 1
205+
end
206+
typ = Expr(:curly, :StridedPointer, T, Nnew, Cnew, Bnew, Rtup)
207+
ptr = Expr(:call, typ, :(pointer(p)), strd, offsets)
208+
Expr(:block, Expr(:meta,:inline), :(strd = p.strd), :(offs = p.offsets), ptr)
209+
end
171210
# @generated function VectorizationBase.stridedpointer(A::LowDimArray{D,T,N}) where {D,T,N}
172211
# smul = Expr(:(.), Expr(:(.), :LoopVectorization, QuoteNode(:VectorizationBase)), QuoteNode(:staticmul))
173212
# multup = Expr(:tuple)
@@ -188,22 +227,22 @@ function LowDimArray{D}(data::A) where {D,T,N,A <: AbstractArray{T,N}}
188227
end
189228
function extract_all_1_array!(ls::LoopSet, bcname::Symbol, N::Int, elementbytes::Int)
190229
refextract = gensym(bcname)
191-
ref = Expr(:ref, bcname); append!(ref.args, [1 for n ∈ 1:N])
230+
ref = Expr(:ref, bcname); foreach(_ -> push!(ref.args, :begin), 1:N)
192231
pushprepreamble!(ls, Expr(:(=), refextract, ref))
193232
return add_constant!(ls, refextract, elementbytes) # or replace elementbytes with sizeof(T) ? u
194233
end
195234
function add_broadcast!(
196235
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},
197-
@nospecialize(LDA::Type{<:LowDimArray}), elementbytes::Int
198-
)
199-
D,T,N::Int,_ = LDA.parameters
236+
@nospecialize(LDA::Type{LowDimArray{D,T,N,A}}), elementbytes::Int
237+
) where {D,T,N,A}
238+
# D,T,N::Int,_ = LDA.parameters
200239
Dlen = length(D)
201-
if Dlen == N && !any(D)
240+
if Dlen == N && !any(D) # array is a scalar, as it is broadcasted on all dimensions
202241
return extract_all_1_array!(ls, bcname, N, elementbytes)
203242
end
204243
fulldims = Symbol[loopsyms[n] for n ∈ 1:N if ((Dlen < n) || D[n]::Bool)]
205244
ref = ArrayReference(bcname, fulldims)
206-
add_simple_load!(ls, destname, ref, elementbytes, true, false )::Operation
245+
add_simple_load!(ls, destname, ref, elementbytes, true, true )::Operation
207246
end
208247
function add_broadcast_adjoint_array!(
209248
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol}, ::Type{A}, elementbytes::Int
@@ -218,8 +257,11 @@ function add_broadcast_adjoint_array!(
218257
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol}, ::Type{<:AbstractVector}, elementbytes::Int
219258
)
220259
# isone(length(loopsyms)) && return extract_all_1_array!(ls, bcname, N, elementbytes)
221-
ref = ArrayReference(bcname, Symbol[loopsyms[2]])
222-
add_simple_load!( ls, destname, ref, elementbytes, true, true )
260+
parent = gensym(:parent)
261+
pushprepreamble!(ls, Expr(:(=), parent, Expr(:call, :parent, bcname)))
262+
263+
ref = ArrayReference(parent, Symbol[loopsyms[2]])
264+
add_simple_load!( ls, destname, ref, elementbytes, true, true )::Operation
223265
end
224266
function add_broadcast!(
225267
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},

src/condense_loopset.jl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,9 @@ end
294294
make_fast(q) = Expr(:macrocall, Symbol("@fastmath"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), q)
295295
make_crashy(q) = Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), q)
296296

297+
@inline vecmemaybe(x::NativeTypes) = x
298+
@inline vecmemaybe(x::VectorizationBase._Vec) = Vec(x)
299+
297300
function setup_call_inline(ls::LoopSet, inline::Int8 = zero(Int8), U::Int8 = zero(Int8), T::Int8 = zero(Int8))
298301
call = generate_call(ls, (inline,U,T))
299302
noouterreductions = iszero(length(ls.outer_reductions))
@@ -313,7 +316,7 @@ function setup_call_inline(ls::LoopSet, inline::Int8 = zero(Int8), U::Int8 = zer
313316
instr = instruction(op)
314317
out = Symbol(mvar, 0)
315318
push!(outer_reducts.args, out)
316-
push!(q.args, Expr(:(=), var, Expr(:call, lv(reduction_scalar_combine(instr)), Expr(:call, lv(:Vec), out), var)))
319+
push!(q.args, Expr(:(=), var, Expr(:call, lv(reduction_scalar_combine(instr)), Expr(:call, lv(:vecmemaybe), out), var)))
317320
end
318321
pushpreamble!(ls, outer_reducts)
319322
append!(ls.preamble.args, q.args)

0 commit comments

Comments
 (0)