Skip to content

Commit 2295262

Browse files
authored
Indexing changes and more precompiles (#254)
* CSE indices outside of _avx_! * rangesym is an empty symbol for static loops * Fix vcmpend bounds * No longer mark stores as noalias. * Update precompile statements, reduce some specialization, split deeply nested reductions to stop confusing LLVM with too many phi nodes, fix some reduction naming code. * Inline if there are constant compute ops to workaround #245 * esc fallback macro, remove Tullio workaround as it didn't work anyway, mangle parameters in `_avx_!` to make sure they don't clash with user's symbols from loops * More phi-splits for deep reductions, add energy(spin_conf) test (#244). * Add broutine2x2 to tests (fixes #239) * Fix typo. * Don't pass unnecessary arguments * latency * Print sizes in staticsizes bench to find out which are failing; cannot reproduce locally. * Require VectorizationBase 0.19.34 * Quantum tests on >=1.6 only (uses reinterpret(reshape,...))
1 parent 96aef96 commit 2295262

28 files changed

+1591
-742
lines changed

Project.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,13 @@ OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
1313
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
1414
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
1515
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
16+
StrideArraysCore = "7792a7ef-975c-4747-a70f-980b88e8d1da"
1617
ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
1718
UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
1819
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1920

2021
[compat]
21-
ArrayInterface = "3.1.7"
22+
ArrayInterface = "3.1.9"
2223
CheapThreads = "0.1.3,0.2"
2324
DocStringExtensions = "0.8"
2425
IfElse = "0.1"
@@ -28,7 +29,7 @@ SLEEFPirates = "0.6.14"
2829
Static = "0.2"
2930
ThreadingUtilities = "0.4.1"
3031
UnPack = "1"
31-
VectorizationBase = "0.19.30"
32+
VectorizationBase = "0.19.34"
3233
julia = "1.5"
3334

3435
[extras]

docs/src/examples/filtering.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ Here, we convolve a small matrix `kern` with a larger matrix `A`, storing the re
44
```julia
55
using LoopVectorization, OffsetArrays, Images
66
kern = Images.Kernel.gaussian((1, 1), (3, 3))
7+
A = rand(130,130);
8+
out = OffsetArray(similar(A, size(A) .- size(kern) .+ 1), -1 .- kern.offsets);
79
function filter2davx!(out::AbstractMatrix, A::AbstractMatrix, kern)
810
@avx for J in CartesianIndices(out)
911
tmp = zero(eltype(out))

src/LoopVectorization.jl

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ using VectorizationBase: register_size, register_count, cache_linesize, cache_si
99
maybestaticfirst, maybestaticlast, scalar_less, scalar_greaterequal, gep, gesp, NativeTypes, #llvmptr,
1010
vfmadd, vfmsub, vfnmadd, vfnmsub, vfmadd_fast, vfmsub_fast, vfnmadd_fast, vfnmsub_fast, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231,
1111
vfma_fast, vmuladd_fast, vdiv_fast, vadd_fast, vsub_fast, vmul_fast,
12-
relu, stridedpointer, StridedPointer, StridedBitPointer, AbstractStridedPointer, _vload, _vstore!,
12+
relu, stridedpointer, stridedpointer_preserve, StridedPointer, StridedBitPointer, AbstractStridedPointer, _vload, _vstore!,
1313
reduced_add, reduced_prod, reduce_to_add, reduce_to_prod, reduced_max, reduced_min, reduce_to_max, reduce_to_min,
1414
reduced_all, reduced_any, reduce_to_all, reduce_to_any,
1515
vsum, vprod, vmaximum, vminimum, vany, vall, unwrap, Unroll, VecUnroll,
@@ -40,13 +40,14 @@ using SLEEFPirates: log_fast, log2_fast, log10_fast
4040

4141
using ArrayInterface
4242
using ArrayInterface: OptionallyStaticUnitRange, OptionallyStaticRange, Zero, One, StaticBool, True, False, reduce_tup, indices, UpTri, LoTri
43-
@static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support
44-
using ArrayInterface: static_step
45-
else # Julia 1.5 did not define `step` on CartesianIndices
46-
@inline static_step(x) = ArrayInterface.static_step
47-
static_step(x::CartesianIndices) = CartesianIndex(map(step, x.indices))
48-
end
49-
const Static = ArrayInterface.StaticInt
43+
using StrideArraysCore: CloseOpen
44+
# @static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support
45+
# using ArrayInterface: static_step
46+
# else # Julia 1.5 did not define `step` on CartesianIndices
47+
@inline static_step(x) = ArrayInterface.static_step(x)
48+
@inline static_step(x::CartesianIndices) = VectorizationBase.CartesianVIndex(map(static_step, x.indices))
49+
# end
50+
const Static = StaticInt
5051

5152
using Requires
5253

@@ -62,7 +63,6 @@ const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL = Symbol("##Wvecwidth##"), Sym
6263

6364
include("vectorizationbase_compat/contract_pass.jl")
6465
include("vectorizationbase_compat/subsetview.jl")
65-
include("closeopen.jl")
6666
include("getconstindexes.jl")
6767
include("predicates.jl")
6868
include("simdfunctionals/map.jl")

src/broadcast.jl

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ function add_broadcast!(
273273
end
274274
fulldims = Symbol[loopsyms[n] for n ∈ 1:N if ((Dlen < n) || D[n]::Bool)]
275275
ref = ArrayReference(bcname, fulldims)
276-
loadop = add_simple_load!(ls, destname, ref, elementbytes, true, true )::Operation
276+
loadop = add_simple_load!(ls, destname, ref, elementbytes, true )::Operation
277277
doaddref!(ls, loadop)
278278
end
279279
function add_broadcast_adjoint_array!(
@@ -283,7 +283,7 @@ function add_broadcast_adjoint_array!(
283283
# pushprepreamble!(ls, Expr(:(=), parent, Expr(:call, :parent, bcname)))
284284
# isone(length(loopsyms)) && return extract_all_1_array!(ls, bcname, N, elementbytes)
285285
ref = ArrayReference(bcname, Symbol[loopsyms[N + 1 - n] for n ∈ 1:N])
286-
loadop = add_simple_load!( ls, destname, ref, elementbytes, true, true )::Operation
286+
loadop = add_simple_load!( ls, destname, ref, elementbytes, true )::Operation
287287
doaddref!(ls, loadop)
288288
end
289289
function add_broadcast_adjoint_array!(
@@ -294,7 +294,7 @@ function add_broadcast_adjoint_array!(
294294
# pushprepreamble!(ls, Expr(:(=), parent, Expr(:call, :parent, bcname)))
295295

296296
ref = ArrayReference(bcname, Symbol[loopsyms[2]])
297-
loadop = add_simple_load!( ls, destname, ref, elementbytes, true, true )::Operation
297+
loadop = add_simple_load!( ls, destname, ref, elementbytes, true )::Operation
298298
doaddref!(ls, loadop)
299299
end
300300
function add_broadcast!(
@@ -313,7 +313,7 @@ function add_broadcast!(
313313
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},
314314
::Type{<:AbstractArray{T,N}}, elementbytes::Int
315315
) where {T,N}
316-
loadop = add_simple_load!(ls, destname, ArrayReference(bcname, @view(loopsyms[1:N])), elementbytes, true, true)
316+
loadop = add_simple_load!(ls, destname, ArrayReference(bcname, @view(loopsyms[1:N])), elementbytes, true)
317317
doaddref!(ls, loadop)
318318
end
319319
function add_broadcast!(
@@ -335,7 +335,7 @@ function add_broadcast!(
335335
inds = Vector{Symbol}(undef, N+1)
336336
inds[1] = DISCONTIGUOUS
337337
inds[2:end] .= @view(loopsyms[1:N])
338-
loadop = add_simple_load!(ls, destname, ArrayReference(bcname, inds), elementbytes, true, true)
338+
loadop = add_simple_load!(ls, destname, ArrayReference(bcname, inds), elementbytes, true)
339339
doaddref!(ls, loadop)
340340
end
341341
const BroadcastedArray{S<:Broadcast.AbstractArrayStyle,F,A} = Broadcasted{S,Nothing,F,A}
@@ -408,8 +408,10 @@ end
408408
storeop = add_simple_store!(ls, :dest, ArrayReference(:dest, loopsyms), elementbytes)
409409
doaddref!(ls, storeop)
410410
resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
411-
# return ls
412-
Expr(:block, Expr(:meta,:inline), setup_call(ls, :(Base.Broadcast.materialize!(dest, bc)), LineNumberNode(0), inline, false, u₁, u₂, threads%Int), :dest)
411+
# return ls
412+
sc = setup_call(ls, :(Base.Broadcast.materialize!(dest, bc)), LineNumberNode(0), inline, false, u₁, u₂, threads%Int)
413+
# return sc
414+
Expr(:block, Expr(:meta,:inline), sc, :dest)
413415
end
414416
@generated function vmaterialize!(
415417
dest′::Union{Adjoint{T,A},Transpose{T,A}}, bc::BC, ::Val{Mod}, ::Val{UNROLL}

src/closeopen.jl

Lines changed: 0 additions & 30 deletions
This file was deleted.

0 commit comments

Comments
 (0)