Skip to content

Commit 1f5abea

Browse files
authored
Merge pull request #157 from chriselrod/support1.6
WIP: Minimal Changes for 1.6 support
2 parents 84de1a6 + c8ebb24 commit 1f5abea

40 files changed

+1291
-837
lines changed

.github/workflows/ci.yml

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
name: CI
2+
on:
3+
pull_request:
4+
branches:
5+
- master
6+
push:
7+
branches:
8+
- master
9+
tags: '*'
10+
jobs:
11+
test:
12+
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
13+
runs-on: ${{ matrix.os }}
14+
strategy:
15+
fail-fast: false
16+
matrix:
17+
version:
18+
- '1.4'
19+
- '1'
20+
- 'nightly'
21+
os:
22+
- ubuntu-latest
23+
arch:
24+
- x64
25+
steps:
26+
- uses: actions/checkout@v2
27+
- uses: julia-actions/setup-julia@v1
28+
with:
29+
version: ${{ matrix.version }}
30+
arch: ${{ matrix.arch }}
31+
- uses: actions/cache@v1
32+
env:
33+
cache-name: cache-artifacts
34+
with:
35+
path: ~/.julia/artifacts
36+
key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
37+
restore-keys: |
38+
${{ runner.os }}-test-${{ env.cache-name }}-
39+
${{ runner.os }}-test-
40+
${{ runner.os }}-
41+
- uses: julia-actions/julia-buildpkg@v1
42+
- uses: julia-actions/julia-runtest@v1
43+
- uses: julia-actions/julia-processcoverage@v1
44+
- uses: codecov/codecov-action@v1
45+
with:
46+
file: lcov.info
47+
docs:
48+
name: Documentation
49+
runs-on: ubuntu-latest
50+
steps:
51+
- uses: actions/checkout@v2
52+
- uses: julia-actions/setup-julia@v1
53+
with:
54+
version: '1'
55+
- run: |
56+
julia --project=docs -e '
57+
using Pkg
58+
Pkg.develop(PackageSpec(path=pwd()))
59+
Pkg.instantiate()'
60+
- run: |
61+
julia --project=docs -e '
62+
using Documenter: doctest
63+
using LoopVectorization
64+
doctest(LoopVectorization)'
65+
- run: julia --project=docs docs/make.jl
66+
env:
67+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
68+
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}

.travis.yml

Lines changed: 0 additions & 31 deletions
This file was deleted.

Project.toml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,27 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.8.26"
4+
version = "0.9.0"
55

66
[deps]
7+
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
78
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
9+
IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
810
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
911
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
10-
SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a"
1112
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
1213
UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
1314
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1415

1516
[compat]
17+
ArrayInterface = "2.14"
1618
DocStringExtensions = "0.8"
19+
IfElse = "0"
1720
OffsetArrays = "1"
18-
SIMDPirates = "0.8.25"
19-
SLEEFPirates = "0.5.4"
21+
SLEEFPirates = "0.6"
2022
UnPack = "0,1"
21-
VectorizationBase = "0.12.31"
22-
julia = "1.1"
23+
VectorizationBase = "0.13"
24+
julia = "1.4"
2325

2426
[extras]
2527
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

benchmark/looptests.jl

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,24 @@
1-
using LoopVectorization, LinearAlgebra, OffsetArrays
1+
using LoopVectorization, LinearAlgebra, OffsetArrays, ArrayInterface
22
BLAS.set_num_threads(1)
33

4-
using LoopVectorization.VectorizationBase: StaticUnitRange
5-
struct SizedOffsetMatrix{T,LR,UR,LC,RC} <: DenseMatrix{T}
4+
# TODO: remove this once this PR merges: https://github.com/JuliaArrays/OffsetArrays.jl/pull/170
5+
@inline Base.unsafe_convert(::Type{Ptr{T}}, A::OffsetArray{T}) where {T} = pointer(parent(A))
6+
7+
struct SizedOffsetMatrix{T,LR,UR,LC,UC} <: DenseMatrix{T}
68
data::Matrix{T}
79
end
8-
Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (StaticUnitRange{LR,UR}(),StaticUnitRange{LC,UC}())
10+
Base.size(::SizedOffsetMatrix{<:Any,LR,UR,LC,UC}) where {LR,UR,LC,UC} = (UR-LR+1,UC-LC+1)
11+
Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (Static{LR}():Static{UR}(),Static{LC}():Static{UC}())
912
Base.parent(A::SizedOffsetMatrix) = A.data
10-
@generated function LoopVectorization.stridedpointer(A::SizedOffsetMatrix{T,LR,UR,LC,RC}) where {T,LR,UR,LC,RC}
11-
quote
12-
$(Expr(:meta,:inline))
13-
LoopVectorization.OffsetStridedPointer(
14-
LoopVectorization.StaticStridedPointer{$T,Tuple{1,$(UR-LR+1)}}(pointer(parent(A))),
15-
($(LR-1), $(LC-1))
16-
)
17-
end
18-
end
19-
Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i-1,j-1))
20-
Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (StaticUnitRange{LR,UR}(),StaticUnitRange{LC,UC}())
21-
Base.size(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (1 + UR-LR, 1 + UC-LC)
22-
Base.unsafe_convert(::Type{Ptr{Float64}}, A::SizedOffsetMatrix) = Base.unsafe_convert(Ptr{Float64}, A.data)
13+
Base.unsafe_convert(::Type{Ptr{T}}, A::SizedOffsetMatrix{T}) where {T} = pointer(A.data)
14+
ArrayInterface.contiguous_axis(::Type{<:SizedOffsetMatrix}) = ArrayInterface.Contiguous{1}()
15+
ArrayInterface.contiguous_batch_size(::Type{<:SizedOffsetMatrix}) = ArrayInterface.ContiguousBatch{0}()
16+
ArrayInterface.stride_rank(::Type{<:SizedOffsetMatrix}) = ArrayInterface.StrideRank{(1,2)}()
17+
function ArrayInterface.strides(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC}
18+
(Static{1}(), (Static{UR}() - Static{LR}() + Static{1}()))
19+
end
20+
ArrayInterface.offsets(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (Static{LR}(), Static{LC}())
21+
Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i,j))
2322

2423

2524
function jgemm!(𝐂, 𝐀, 𝐁)

docs/src/devdocs/loopset_structure.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ julia> LoopVectorization.operations(lsAmulB)
2626
var"##reduction#260" = LoopVectorization.vfmadd_fast(var"##tempload#258", var"##tempload#259", var"##reduction#260")
2727
var"##RHS#256" = LoopVectorization.reduce_to_add(var"##reduction#260", var"##RHS#256")
2828
```
29-
The act of performing a "reduction" across a loop introduces a few extra operations that manage creating a "zero" with respect to the reduction, and then combining with the specified value using `reduce_to_add`, which performs any necessary type conversions, such as from an `SVec` vector-type to a scalar, if necessary. This simplifies code generation, by making the functions agnostic with respect to the actual vectorization decisions the library makes.
29+
The act of performing a "reduction" across a loop introduces a few extra operations that manage creating a "zero" with respect to the reduction, and then combining with the specified value using `reduce_to_add`, which performs any necessary type conversions, such as from a `Vec` vector-type to a scalar, if necessary. This simplifies code generation, by making the functions agnostic with respect to the actual vectorization decisions the library makes.
3030

3131
Each operation is listed as depending on a set of loop iteration symbols:
3232
```julia

docs/src/devdocs/lowering.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@ This task is made simpler via multiple dispatch making the lowering of the compo
55
```julia
66
vload(vptr_A, (i,j,k))
77
```
8-
with the behavior of this load determined by the types of the arguments. Vectorization is expressed by making an index a `_MM{W}` type, rather than an integer, and operations with it will either produce another `_MM{W}` when it will still correspond to contiguous loads, or an `SVec{W,<:Integer}` if the resulting loads will be discontiguous, so that a `gather` or `scatter!` will be used. If all indexes are simply integers, then this produces a scalar load or store.
8+
with the behavior of this load determined by the types of the arguments. Vectorization is expressed by making an index a `_MM{W}` type, rather than an integer, and operations with it will either produce another `_MM{W}` when it will still correspond to contiguous loads, or a `Vec{W,<:Integer}` if the resulting loads will be discontiguous, so that a `gather` or `scatter!` will be used. If all indexes are simply integers, then this produces a scalar load or store.
99

1010

src/LoopVectorization.jl

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,28 @@
11
module LoopVectorization
22

3-
if (!isnothing(get(ENV, "TRAVIS_BRANCH", nothing)) || !isnothing(get(ENV, "APPVEYOR", nothing))) && isdefined(Base, :Experimental) && isdefined(Base.Experimental, Symbol("@optlevel"))
4-
@eval Base.Experimental.@optlevel 1
5-
end
6-
7-
using VectorizationBase, SIMDPirates, SLEEFPirates, UnPack, OffsetArrays
8-
using VectorizationBase: REGISTER_SIZE, extract_data, num_vector_load_expr,
9-
mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valmulsub, valadd, valsub, _MM,
10-
maybestaticlength, maybestaticsize, staticm1, staticp1, staticmul, subsetview, vzero, stridedpointer_for_broadcast,
11-
Static, Zero, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, unwrap, maybestaticrange,
12-
AbstractColumnMajorStridedPointer, AbstractRowMajorStridedPointer, AbstractSparseStridedPointer, AbstractStaticStridedPointer,
13-
PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct, offsetprecalc,
14-
maybestaticfirst, maybestaticlast, scalar_less, scalar_greater, noalias!, gesp, gepbyte, pointerforcomparison, NativeTypes, staticmul, staticmuladd
15-
using SIMDPirates: VECTOR_SYMBOLS, evadd, evsub, evmul, evfdiv, vrange,
16-
reduced_add, reduced_prod, reduce_to_add, reduced_max, reduced_min, vsum, vprod, vmaximum, vminimum,
17-
sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
18-
vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, sizeequivalentfloat, sizeequivalentint, #prefetch,
19-
vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone, vadd1, relu
3+
# if (!isnothing(get(ENV, "TRAVIS_BRANCH", nothing)) || !isnothing(get(ENV, "APPVEYOR", nothing))) && isdefined(Base, :Experimental) && isdefined(Base.Experimental, Symbol("@optlevel"))
4+
# @eval Base.Experimental.@optlevel 1
5+
# end
6+
7+
using VectorizationBase, SLEEFPirates, UnPack, OffsetArrays
8+
using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, data,
9+
mask, pick_vector_width_val, MM,
10+
maybestaticlength, maybestaticsize, staticm1, staticp1, staticmul, vzero,
11+
Zero, maybestaticrange, offsetprecalc, lazymul,
12+
maybestaticfirst, maybestaticlast, scalar_less, gep, gesp, pointerforcomparison, NativeTypes,
13+
vfmadd, vfmsub, vfnmadd, vfnmsub, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, vadd, vsub, vmul,
14+
relu, stridedpointer, StridedPointer, AbstractStridedPointer,
15+
reduced_add, reduced_prod, reduce_to_add, reduce_to_prod, reduced_max, reduced_min, reduce_to_max, reduce_to_min,
16+
vsum, vprod, vmaximum, vminimum, vstorent!
17+
18+
using IfElse: ifelse
19+
20+
# missing: stridedpointer_for_broadcast, noalias!, gepbyte,
21+
# using SIMDPirates: VECTOR_SYMBOLS, evadd, evsub, evmul, evfdiv, vrange,
22+
# reduced_add, reduced_prod, reduce_to_add, reduced_max, reduced_min, vsum, vprod, vmaximum, vminimum,
23+
# sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
24+
# vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, sizeequivalentfloat, sizeequivalentint, #prefetch,
25+
# vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone, vadd1, relu
2026
using SLEEFPirates: pow
2127
using Base.Broadcast: Broadcasted, DefaultArrayStyle
2228
using LinearAlgebra: Adjoint, Transpose
@@ -26,27 +32,28 @@ import LinearAlgebra # for check_args
2632

2733
using Base.FastMath: add_fast, sub_fast, mul_fast, div_fast
2834

35+
using ArrayInterface
36+
using ArrayInterface: OptionallyStaticUnitRange, Zero, One
37+
const Static = ArrayInterface.StaticInt
38+
39+
40+
2941
export LowDimArray, stridedpointer,
3042
@avx, @_avx, *ˡ, _avx_!,
3143
vmap, vmap!, vmapt, vmapt!, vmapnt, vmapnt!, vmapntt, vmapntt!,
3244
vfilter, vfilter!, vmapreduce, vreduce
3345

46+
@inline unwrap(::Val{N}) where {N} = N
47+
@inline unwrap(::Static{N}) where {N} = N
48+
@inline unwrap(x) = x
49+
3450
const VECTORWIDTHSYMBOL, ELTYPESYMBOL = Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##")
3551

36-
"""
37-
REGISTER_COUNT defined in VectorizationBase is supposed to correspond to the actual number of floating point registers on the system.
38-
It is hardcoded into a file at build time.
39-
However, someone may have multiple builds of Julia on the same system, some 32-bit and some 64-bit (e.g., they use 64-bit primarilly,
40-
but keep a 32-bit build on hand to debug test failures on Appveyor's 32-bit build). Thus, we don't want REGISTER_COUNT to be hardcoded
41-
in such a fashion.
42-
32-bit builds are limited to only 8 floating point registers, so we take care of that here.
43-
44-
If you want good performance, DO NOT use a 32-bit build of Julia if you don't have to.
45-
"""
46-
const REGISTER_COUNT = Sys.ARCH === :i686 ? 8 : VectorizationBase.REGISTER_COUNT
4752

53+
include("vectorizationbase_compat/contract_pass.jl")
54+
include("vectorizationbase_compat/subsetview.jl")
4855
include("getconstindexes.jl")
49-
include("vectorizationbase_extensions.jl")
56+
# include("vectorizationbase_extensions.jl")
5057
include("predicates.jl")
5158
include("map.jl")
5259
include("filter.jl")

src/add_compute.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -328,9 +328,9 @@ function add_pow!(
328328
return add_compute!(ls, var, :^, [xop, pop], elementbytes)
329329
end
330330
if pint == -1
331-
return add_compute!(ls, var, :vinv, [xop], elementbytes)
331+
return add_compute!(ls, var, :inv, [xop], elementbytes)
332332
elseif pint < 0
333-
xop = add_compute!(ls, gensym(:inverse), :vinv, [xop], elementbytes)
333+
xop = add_compute!(ls, gensym(:inverse), :inv, [xop], elementbytes)
334334
pint = - pint
335335
end
336336
if pint == 0
@@ -340,22 +340,22 @@ function add_pow!(
340340
elseif pint == 1
341341
return add_compute!(ls, var, :identity, [xop], elementbytes)
342342
elseif pint == 2
343-
return add_compute!(ls, var, :vabs2, [xop], elementbytes)
343+
return add_compute!(ls, var, :abs2, [xop], elementbytes)
344344
end
345345

346346
# Implementation from https://github.com/JuliaLang/julia/blob/a965580ba7fd0e8314001521df254e30d686afbf/base/intfuncs.jl#L216
347347
t = trailing_zeros(pint) + 1
348348
pint >>= t
349349
while (t -= 1) > 0
350350
varname = (iszero(pint) && isone(t)) ? var : gensym(:pbs)
351-
xop = add_compute!(ls, varname, :vabs2, [xop], elementbytes)
351+
xop = add_compute!(ls, varname, :abs2, [xop], elementbytes)
352352
end
353353
yop = xop
354354
while pint > 0
355355
t = trailing_zeros(pint) + 1
356356
pint >>= t
357357
while (t -= 1) >= 0
358-
xop = add_compute!(ls, gensym(:pbs), :vabs2, [xop], elementbytes)
358+
xop = add_compute!(ls, gensym(:pbs), :abs2, [xop], elementbytes)
359359
end
360360
yop = add_compute!(ls, iszero(pint) ? var : gensym(:pbs), :vmul, [xop, yop], elementbytes)
361361
end

src/add_ifelse.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,13 @@ function add_if!(ls::LoopSet, LHS::Symbol, RHS::Expr, elementbytes::Int, positio
3535
else
3636
falseop = getop(ls, iffalse, elementbytes)
3737
end
38-
add_compute!(ls, LHS, :vifelse, [condop, trueop, falseop], elementbytes)
38+
add_compute!(ls, LHS, :ifelse, [condop, trueop, falseop], elementbytes)
3939
end
4040

4141
function add_andblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, elementbytes::Int, position::Int)
4242
if LHS isa Symbol
4343
altop = getop(ls, LHS, elementbytes)
44-
return add_compute!(ls, LHS, :vifelse, [condop, rhsop, altop], elementbytes)
44+
return add_compute!(ls, LHS, :ifelse, [condop, rhsop, altop], elementbytes)
4545
elseif LHS isa Expr && LHS.head === :ref
4646
return add_conditional_store!(ls, LHS, condop, rhsop, elementbytes)
4747
else
@@ -78,10 +78,10 @@ function add_orblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, ele
7878
negatedcondop = negateop!(ls, condop, elementbytes)
7979
if LHS isa Symbol
8080
altop = getop(ls, LHS, elementbytes)
81-
# return add_compute!(ls, LHS, :vifelse, [condop, altop, rhsop], elementbytes)
81+
# return add_compute!(ls, LHS, :ifelse, [condop, altop, rhsop], elementbytes)
8282
# Placing altop second seems to let LLVM fuse operations; but as of LLVM 9.0.1 it will not if altop is first
8383
# therefore, we negate the condition and switch order so that the altop is second.
84-
return add_compute!(ls, LHS, :vifelse, [negatedcondop, rhsop, altop], elementbytes)
84+
return add_compute!(ls, LHS, :ifelse, [negatedcondop, rhsop, altop], elementbytes)
8585
elseif LHS isa Expr && LHS.head === :ref
8686
# negatedcondop = add_compute!(ls, gensym(:negated_mask), :~, [condop], elementbytes)
8787
return add_conditional_store!(ls, LHS, negatedcondop, rhsop, elementbytes)

src/add_stores.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ function add_conditional_store!(ls::LoopSet, LHS, condop::Operation, storeop::Op
113113
# prevstore = getop(ls, id + 1)
114114
# # @show prevstore prevstore.node_type, loopdependencies(prevstore)
115115
# # @show operations(ls)
116-
# storeop = add_compute!(ls, gensym(:combinedstoreop), Instruction(:vifelse), [condop, storeop, first(parents(prevstore))], elementbytes)
116+
# storeop = add_compute!(ls, gensym(:combinedstoreop), Instruction(:ifelse), [condop, storeop, first(parents(prevstore))], elementbytes)
117117
# storeparents = [storeop]
118118
# storeinstr = if prevstore.instruction.instr === :conditionalstore!
119119
# push!(storeparents, add_compute!(ls, gensym(:combinedmask), Instruction(:|), [condop, last(parents(prevstore))], elementbytes))

0 commit comments

Comments
 (0)