Skip to content

Commit 2def383

Browse files
committed
Add more documentation on the internals
1 parent 6f299fa commit 2def383

File tree

11 files changed

+207
-9
lines changed

11 files changed

+207
-9
lines changed

Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ authors = ["Chris Elrod <[email protected]>"]
44
version = "0.7.0"
55

66
[deps]
7+
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
78
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
89
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
910
SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a"
@@ -12,6 +13,7 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
1213
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1314

1415
[compat]
16+
DocStringExtensions = "0.8"
1517
OffsetArrays = "1"
1618
SIMDPirates = "0.7.13"
1719
SLEEFPirates = "0.4.4"

docs/make.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ makedocs(;
2121
"devdocs/loopset_structure.md",
2222
"devdocs/constructing_loopsets.md",
2323
"devdocs/evaluating_loops.md",
24-
"devdocs/lowering.md"
24+
"devdocs/lowering.md",
25+
"devdocs/reference.md"
2526
]
2627
],
2728
# repo="https://github.com/chriselrod/LoopVectorization.jl/blob/{commit}{path}#L{line}",

docs/src/api.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,5 @@ vmapntt!
2222

2323
```@docs
2424
vfilter
25-
vfilter!
25+
LoopVectorization.vfilter!
2626
```

docs/src/devdocs/reference.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Internals reference
2+
3+
## Operation types
4+
5+
```@docs
6+
LoopVectorization.OperationType
7+
LoopVectorization.constant
8+
LoopVectorization.memload
9+
LoopVectorization.compute
10+
LoopVectorization.memstore
11+
LoopVectorization.loopvalue
12+
```
13+
14+
## Operation
15+
16+
```@docs
17+
LoopVectorization.Operation
18+
```
19+
20+
## Instructions and costs
21+
22+
```@docs
23+
LoopVectorization.Instruction
24+
LoopVectorization.InstructionCost
25+
```
26+
27+
## Array references
28+
29+
```@docs
30+
LoopVectorization.ArrayReference
31+
LoopVectorization.ArrayReferenceMeta
32+
```
33+
34+
## Condensed types
35+
36+
These are used when encoding the `@avx` block as a type parameter for passing through
37+
to the `@generated` function.
38+
39+
```@docs
40+
LoopVectorization.ArrayRefStruct
41+
LoopVectorization.OperationStruct
42+
```

src/LoopVectorization.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ using SLEEFPirates: pow
1616
using Base.Broadcast: Broadcasted, DefaultArrayStyle
1717
using LinearAlgebra: Adjoint, Transpose
1818
using Base.Meta: isexpr
19+
using DocStringExtensions
1920

2021

2122
const SUPPORTED_TYPES = Union{Float16,Float32,Float64,Integer}

src/condense_loopset.jl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ Base.:|(u::Unsigned, it::IndexType) = u | UInt8(it)
55
Base.:(==)(u::Unsigned, it::IndexType) = (u % UInt8) == UInt8(it)
66

77
"""
8-
`ArrayRefStruct` stores a representation of an array-reference expression such as `A[i,j]`.
8+
ArrayRefStruct
9+
10+
A condensed representation of an [`ArrayReference`](@ref).
911
It supports array-references with up to 8 indexes, where the data for each consecutive index is packed into corresponding 8-bit fields
1012
of `index_types` (storing the enum `IndexType`), `indices` (the `id` for each index symbol), and `offsets` (currently unused).
1113
"""
@@ -53,6 +55,11 @@ function ArrayRefStruct(ls::LoopSet, mref::ArrayReferenceMeta, arraysymbolinds::
5355
ArrayRefStruct{mref.ref.array,mref.ptr}( index_types, indices, offsets )
5456
end
5557

58+
"""
59+
OperationStruct
60+
61+
A condensed representation of an [`Operation`](@ref).
62+
"""
5663
struct OperationStruct <: AbstractLoopOperation
5764
# instruction::Instruction
5865
loopdeps::UInt64

src/constructors.jl

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,29 @@ julia> b ≈ c
9090
true
9191
```
9292
93+
# Extended help
94+
95+
Advanced users can customize the implementation of the `@avx`-annotated block
96+
using keyword arguments:
97+
98+
```
99+
@avx inline=false unroll=2 body
100+
```
101+
102+
where `body` is the code of the block (e.g., `for ... end`).
103+
104+
`inline` is a Boolean. When `true` (the default), `body` will be directly inlined
105+
into the function (via a forced-inlining call to `_avx_!`).
106+
When `false`, it will call `__avx__!` instead, letting Julia's own inlining engine
107+
determine whether the call to `__avx__!` should be inlined. (Typically, it won't.)
108+
In principle, first calling `__avx__!` (which itself calls `_avx_!`) can sometimes
109+
allow better code generation.
110+
One can find some circumstances where `inline=true` is faster, and other circumstances
111+
where `inline=false` is faster, so the best setting may require experimentation.
112+
113+
`unroll` is an integer that specifies the loop unrolling factor, or a
114+
tuple `(4, 2)` signaling that the generated code should unroll more than
115+
one loop.
93116
"""
94117
macro avx(q)
95118
q = macroexpand(__module__, q)
@@ -121,7 +144,7 @@ function check_unroll(arg)
121144
T = convert(Int8, tup.args[2])
122145
else
123146
return nothing
124-
end
147+
end
125148
else
126149
return nothing
127150
end
@@ -179,4 +202,3 @@ macro avx_debug(q)
179202
q = macroexpand(__module__, q)
180203
esc(LoopVectorization.setup_call_debug(LoopSet(q, __module__)))
181204
end
182-

src/costs.jl

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,12 @@ else
66
end
77

88

9+
"""
10+
Instruction
911
12+
`Instruction` represents a function via its module and symbol. It is
13+
similar to a `GlobalRef` and may someday be replaced by `GlobalRef`.
14+
"""
1015
struct Instruction
1116
mod::Symbol
1217
instr::Symbol
@@ -36,10 +41,24 @@ Base.isequal(ins1::Instruction, ins2::Instruction) = (ins1.instr === ins2.instr)
3641

3742
const LOOPCONSTANT = Instruction(Symbol("LOOPCONSTANTINSTRUCTION"))
3843

44+
"""
45+
InstructionCost
46+
47+
Store parameters related to performance for individual CPU instructions.
48+
49+
$(TYPEDFIELDS)
50+
"""
3951
struct InstructionCost
52+
"A flag indicating how instruction cost scales with vector width (128, 256, or 512 bits)"
4053
scaling::Float64 # sentinel values: -3 == no scaling; -2 == offset_scaling, -1 == linear scaling, >0 -> == latency == reciprocal throughput
54+
"""The number of clock cycles per operation when many of the same operation are repeated in sequence.
55+
Think of it as the inverse of the flow rate at steady-state. It is typically ≤ the `scalar_latency`."""
4156
scalar_reciprocal_throughput::Float64
57+
"""The minimum delay, in clock cycles, associated with the instruction.
58+
Think of it as the delay from turning on a faucet to when water starts coming out the end of the pipe.
59+
See also `scalar_reciprocal_throughput`."""
4260
scalar_latency::Int
61+
"Number of floating-point registers used"
4362
register_pressure::Int
4463
end
4564
InstructionCost(sl::Int, srt::Float64, scaling::Float64 = -3.0) = InstructionCost(scaling, srt, sl, 0)

src/operations.jl

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,21 @@
1+
"""
2+
ArrayReference
3+
4+
A type for encoding an array reference `A[i,j]` occurring inside an `@avx` block.
5+
6+
# Fields
17
8+
$(TYPEDFIELDS)
9+
"""
210
struct ArrayReference
11+
"The array variable"
312
array::Symbol
13+
"The list of indices (e.g., `[:i, :j]`), or `name(op)` for computed indices."
414
indices::Vector{Symbol}
15+
"""Index offset, e.g., `a[i+7]` would store the `7`. `offsets` is also used
16+
to help identify opportunities for avoiding reloads, for example in `y[i] = x[i] - x[i-1]`,
17+
the previous load `x[i-1]` can be "carried over" to the next iteration.
18+
Only used for small (`Int8`) offsets."""
519
offsets::Vector{Int8}
620
end
721
ArrayReference(array, indices) = ArrayReference(array, indices, zeros(Int8, length(indices)))
@@ -26,9 +40,21 @@ function Base.isequal(x::ArrayReference, y::ArrayReference)
2640
end
2741
true
2842
end
43+
"""
44+
ArrayReferenceMeta
45+
46+
A type similar to [`ArrayReference`](@ref) but holding additional information.
47+
48+
# Fields
49+
50+
$(TYPEDFIELDS)
51+
"""
2952
struct ArrayReferenceMeta
53+
"The `ArrayReference`"
3054
ref::ArrayReference
55+
"A vector of Bools indicating whether each index is a loop variable (`false` for operation-computed indices)"
3156
loopedindex::Vector{Bool}
57+
"Variable holding the pointer to the array's underlying storage"
3258
ptr::Symbol
3359
end
3460
function ArrayReferenceMeta(ref::ArrayReference, loopedindex, ptr = vptr(ref))
@@ -56,29 +82,89 @@ Base.:(==)(x::ArrayReferenceMeta, y) = false
5682

5783
abstract type AbstractLoopOperation end
5884

85+
"""
86+
`OperationType` is an `@enum` for classifying supported operations that can appear in
87+
`@avx` blocks. Type `LoopVectorization.OperationType` to see the different types.
88+
"""
5989
@enum OperationType begin
6090
constant
6191
memload
6292
compute
6393
memstore
6494
loopvalue
6595
end
96+
"An operation setting a variable to a constant value (e.g., `a = 0.0`)" constant
97+
"An operation setting a variable from a memory location (e.g., `a = A[i,j]`)" memload
98+
"An operation computing a new value from one or more variables (e.g., `a = b + c`)" compute
99+
"An operation storing a value to a memory location (e.g., `A[i,j] = a`)" memstore
100+
"""
101+
`loopvalue` indicates a loop variable (`i` in `for i in ...`). These are the "parents" of `compute`
102+
operations that involve the loop variables.
103+
"""
104+
loopvalue
66105

67106
# TODO: can some computations be cached in the operations?
68107
"""
108+
Operation
109+
110+
A structure to encode a particular action occurring inside an `@avx` block.
111+
112+
# Fields
113+
114+
$(TYPEDFIELDS)
115+
116+
# Example
117+
118+
```jldoctest Operation; filter = r"\\"##.*\\""
119+
julia> using LoopVectorization
120+
121+
julia> AmulBq = :(for m ∈ 1:M, n ∈ 1:N
122+
C[m,n] = zero(eltype(B))
123+
for k ∈ 1:K
124+
C[m,n] += A[m,k] * B[k,n]
125+
end
126+
end);
127+
128+
julia> lsAmulB = LoopVectorization.LoopSet(AmulBq);
129+
130+
julia> LoopVectorization.operations(lsAmulB)
131+
6-element Array{LoopVectorization.Operation,1}:
132+
var"##RHS#253" = var"##zero#254"
133+
C[m, n] = var"##RHS#253"
134+
var"##tempload#255" = A[m, k]
135+
var"##tempload#256" = B[k, n]
136+
var"##RHS#253" = LoopVectorization.vfmadd_fast(var"##tempload#255", var"##tempload#256", var"##RHS#253")
137+
var"##RHS#253" = LoopVectorization.identity(var"##RHS#253")
138+
```
139+
Each one of these lines is a pretty-printed `Operation`.
69140
"""
70141
mutable struct Operation <: AbstractLoopOperation
142+
"""A unique identifier for this operation.
143+
`identifier(op::Operation)` returns the index of this operation within `operations(ls::LoopSet)`."""
71144
identifier::Int
145+
"""The name of the variable storing the result of this operation.
146+
For `a = val` this would be `:a`. For array assignments `A[i,j] = val` this would be `:A`."""
72147
variable::Symbol
148+
"Intended to be the size of the result, in bytes. Often inaccurate, not to be relied on."
73149
elementbytes::Int
150+
"The specific operator, e.g., `identity` or `+`"
74151
instruction::Instruction
152+
"The [`OperationType`](@ref) associated with this operation"
75153
node_type::OperationType
154+
"The loop variables this operation depends on"
76155
dependencies::Vector{Symbol}
156+
"Additional loop dependencies that must execute before this operation can be performed successfully (often needed in reductions)"
77157
reduced_deps::Vector{Symbol}
158+
"Operations whose result this operation depends on"
78159
parents::Vector{Operation}
160+
"For `memload` or `memstore`, encodes the array location"
79161
ref::ArrayReferenceMeta
162+
"`gensymmed` name of result."
80163
mangledvariable::Symbol
164+
"""Loop variables that *consumers* of this operation depend on.
165+
Often used in reductions to replicate assignment of initializers when unrolling."""
81166
reduced_children::Vector{Symbol}
167+
82168
function Operation(
83169
identifier::Int,
84170
variable,
@@ -129,7 +215,7 @@ const NOPARENTS = Operation[]
129215
function Base.show(io::IO, op::Operation)
130216
if isconstant(op)
131217
if op.instruction === LOOPCONSTANT
132-
218+
133219
print(io, Expr(:(=), op.variable, 0))
134220
else
135221
print(io, Expr(:(=), op.variable, op.instruction.instr))

src/reconstruct_loopset.jl

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -417,8 +417,26 @@ function _avx_loopset(OPSsv, ARFsv, AMsv, LPSYMsv, LBsv, vargs)
417417
AMsv, LPSYMsv, LBsv, vargs
418418
)
419419
end
420+
const _body_ = Ref{Any}(nothing)
421+
"""
422+
_avx_!(ut, ops, arf, am, lpsym, lb, vargs...)
423+
424+
Execute an `@avx` block. The block's code is represented via the arguments:
425+
- `ut` is `Val((U,T))`, where `U` is the unrolling factor and `T` is presumably related to tiling (TODO: confirm)
426+
- `ops` is `Tuple{mod1, sym1, op1, mod2, sym2, op2...}` encoding the operations of the loop.
427+
`mod` and `sym` encode the module and symbol of the called function; `op` is an [`OperationStruct`](@ref)
428+
encoding the details of the operation.
429+
- `arf` is `Tuple{arf1, arf2...}`, where each `arfi` is an [`ArrayRefStruct`](@ref) encoding
430+
an array reference.
431+
- `am` contains miscellaneous data about the LoopSet (see `process_metadata!`)
432+
- `lpsym` is `Tuple{:i,:j,...}`, a Tuple of the "loop symbols", i.e. the item variable `i` in `for i ∈ iter`
433+
- `lb` is `Tuple{RngTypei,RngTypej,...}`, a Tuple encoding syntactically-knowable information about
434+
the iterators corresponding to `lpsym`. For example, in `for i ∈ 1:n`, the `1:n` would be encoded with
435+
`StaticLowerUnitRange(1)` because the lower bound of the iterator can be determined to be 1.
436+
- `vargs...` holds the encoded pointers of all the arrays (see `VectorizationBase`'s various pointer types).
437+
"""
420438
@generated function _avx_!(::Val{UT}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LPSYM}, lb::LB, vargs...) where {UT, OPS, ARF, AM, LPSYM, LB}
421439
1 + 1 # Irrelevant line you can comment out/in to force recompilation...
422440
ls = _avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LPSYM.parameters, LB.parameters, vargs)
423-
avx_body(ls, UT)
441+
return _body_[] = copy(avx_body(ls, UT))
424442
end

0 commit comments

Comments
 (0)