diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a4363494..db42b7f7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,10 +30,6 @@ jobs:
           - windows-latest
         arch:
           - x64
-          - x86
-        exclude:
-          - os: macOS-latest
-            arch: x86
     steps:
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2
diff --git a/.gitignore b/.gitignore
index 0ee3d176..5e4280e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 *.jl.cov
 *.jl.*.cov
 *.jl.mem
-Manifest.toml
\ No newline at end of file
+Manifest.toml
+LocalPreferences.toml
diff --git a/Project.toml b/Project.toml
index 9fce9b01..0b0da808 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "TensorOperations"
 uuid = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2"
 authors = ["Lukas Devos ", "Maarten Van Damme ", "Jutho Haegeman "]
-version = "5.1.4"
+version = "5.2.0"
 
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
@@ -9,6 +9,8 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930"
+PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
+Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 PtrArrays = "43287f4e-b6f4-7ad1-bb20-aadabca52c3d"
 Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
 StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
@@ -38,10 +40,12 @@ LRUCache = "1"
 LinearAlgebra = "1.6"
 Logging = "1.6"
 PackageExtensionCompat = "1"
+PrecompileTools = "1.1"
+Preferences = "1.4"
 PtrArrays = "1.2"
 Random = "1"
 Strided = "2.2"
-StridedViews = "0.3"
+StridedViews = "0.3, 0.4"
 Test = "1"
 TupleTools = "1.6"
 VectorInterface = "0.4.1,0.5"
diff --git a/docs/make.jl b/docs/make.jl
index f6670bae..4285b7b3 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -11,7 +11,8 @@ makedocs(; modules=[TensorOperations],
                              "man/interface.md",
                              "man/backends.md",
                              "man/autodiff.md",
-                             "man/implementation.md"],
+                             "man/implementation.md",
+                             "man/precompilation.md"],
                 "Index" => "index/index.md"])
 
 # Documenter can also automatically deploy documentation to gh-pages.
diff --git a/docs/src/index.md b/docs/src/index.md
index 099a51d9..ddb2a6db 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -5,7 +5,7 @@
 ## Table of contents
 
 ```@contents
-Pages = ["index.md", "man/indexnotation.md", "man/functions.md", "man/interface.md", "man/backends.md", "man/autodiff.md", "man/implementation.md"]
+Pages = ["index.md", "man/indexnotation.md", "man/functions.md", "man/interface.md", "man/backends.md", "man/autodiff.md", "man/implementation.md", "man/precompilation.md"]
 Depth = 4
 ```
 
diff --git a/docs/src/man/precompilation.md b/docs/src/man/precompilation.md
new file mode 100644
index 00000000..64557d7c
--- /dev/null
+++ b/docs/src/man/precompilation.md
@@ -0,0 +1,68 @@
+# Precompilation
+
+TensorOperations.jl has some support for precompiling commonly called functions.
+The guiding philosophy is that tensor contractions often form (part of) the bottleneck of typical workflows,
+and as such we want to maximize their performance. As a result, we choose to specialize many functions, which
+may lead to a rather large time-to-first-execution (TTFX). To mitigate this, some of that work can
+be moved to precompile time, avoiding the need to recompile these specializations in every fresh Julia session.
+
+Nevertheless, TensorOperations is designed to work with a large variety of input types, and simply enumerating
+all of these tends to lead to prohibitively large precompilation times, as well as large system images.
+Therefore, some customization is possible to tweak the desired level of precompilation, trading
+faster precompile times against fast TTFX for a wider range of inputs.
+
+!!! compat "TensorOperations v5.2.0"
+
+    Precompilation support requires at least TensorOperations v5.2.0.
+
+## Defaults
+
+By default, precompilation is enabled for "tensors" of type `Array{T,N}`, where `T` and `N` range over the following values:
+
+* `T` is either `Float64` or `ComplexF64`
+* `tensoradd!` is precompiled up to `N = 5`
+* `tensortrace!` is precompiled up to `4` free output indices and `2` pairs of traced indices
+* `tensorcontract!` is precompiled up to `4` free output indices on both inputs, and `2` contracted indices
+
+## Custom settings
+
+The default precompilation settings can be tweaked to allow for more or less expansive coverage. This is achieved
+through a combination of `PrecompileTools`- and `Preferences`-based functionality.
+
+To disable precompilation altogether, for example during development or when you prefer to have small binaries,
+you can *locally* change the `"precompile_workload"` key in the preferences.
+
+```julia
+using TensorOperations, Preferences
+set_preferences!(TensorOperations, "precompile_workload" => false; force=true)
+```
+
+Alternatively, you can keep precompilation enabled and change the settings above through the same machinery, via the following keys (see the sketch at the end of this page):
+
+* `"precompile_eltypes"`: a `Vector{String}` whose entries evaluate to the desired element types `T<:Number`
+* `"precompile_add_ndims"`: an `Int` specifying the maximum `N` for `tensoradd!`
+* `"precompile_trace_ndims"`: a `Vector{Int}` of length 2 specifying the maximal number of free and traced indices for `tensortrace!`
+* `"precompile_contract_ndims"`: a `Vector{Int}` of length 2 specifying the maximal number of free and contracted indices for `tensorcontract!`
+
+!!! note "Backends"
+
+    Currently, there is no support for precompiling methods that do not use the default backend. If this is a
+    feature you would find useful, feel free to contact us or open an issue.
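+
+## Example
+
+As an illustration, the following sketch sets all four keys at once. The values are
+hypothetical: adapt them to the element types and ranks that appear in your own workloads.
+As with `"precompile_workload"`, changed settings only take effect after the package has been
+precompiled again, i.e. in a fresh Julia session.
+
+```julia
+using TensorOperations, Preferences
+# Illustrative values only: also cover Float32, but shrink the contraction workload.
+set_preferences!(TensorOperations,
+                 "precompile_eltypes" => ["Float64", "ComplexF64", "Float32"],
+                 "precompile_add_ndims" => 4,
+                 "precompile_trace_ndims" => [3, 1],
+                 "precompile_contract_ndims" => [3, 2];
+                 force=true)
+```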
diff --git a/src/TensorOperations.jl b/src/TensorOperations.jl
index 9a58c87d..fa5c0b59 100644
--- a/src/TensorOperations.jl
+++ b/src/TensorOperations.jl
@@ -77,4 +77,6 @@ function __init__()
     @require_extensions
 end
 
+include("precompile.jl")
+
 end # module
diff --git a/src/implementation/blascontract.jl b/src/implementation/blascontract.jl
index dbbf7420..f804fe5b 100644
--- a/src/implementation/blascontract.jl
+++ b/src/implementation/blascontract.jl
@@ -81,7 +81,7 @@ function _unsafe_blas_contract!(C::StridedView{T},
     return C
 end
 
-@inline function makeblascontractable(A, pA, TC, backend, allocator)
+function makeblascontractable(A, pA, TC, backend, allocator)
     flagA = isblascontractable(A, pA) && eltype(A) == TC
     if !flagA
         A_ = tensoralloc_add(TC, A, pA, false, Val(true), allocator)
diff --git a/src/implementation/functions.jl b/src/implementation/functions.jl
index 7444b0bd..57f567be 100644
--- a/src/implementation/functions.jl
+++ b/src/implementation/functions.jl
@@ -79,7 +79,7 @@ See also [`tensorcopy`](@ref) and [`tensoradd!`](@ref)
 """
 function tensorcopy!(C, A, pA::Index2Tuple, conjA::Bool=false, α::Number=One(),
                      backend=DefaultBackend(), allocator=DefaultAllocator())
-    return tensoradd!(C, A, pA, conjA, α, false, backend, allocator)
+    return tensoradd!(C, A, pA, conjA, α, Zero(), backend, allocator)
 end
 
 # ------------------------------------------------------------------------------------------
diff --git a/src/precompile.jl b/src/precompile.jl
new file mode 100644
index 00000000..1bfbf9a8
--- /dev/null
+++ b/src/precompile.jl
@@ -0,0 +1,120 @@
+using PrecompileTools: PrecompileTools
+using Preferences: @load_preference
+
+# Validate preferences input
+# --------------------------
+function validate_precompile_eltypes(eltypes)
+    eltypes isa Vector{String} ||
+        throw(ArgumentError("`precompile_eltypes` should be a vector of strings, got $(typeof(eltypes)) instead"))
+    return map(eltypes) do Tstr
+        T = eval(Meta.parse(Tstr))
+        (T isa DataType && T <: Number) ||
+            error("Invalid precompile_eltypes entry: `$Tstr`")
+        return T
+    end
+end
+
+function validate_add_ndims(add_ndims)
+    add_ndims isa Int ||
+        throw(ArgumentError("`precompile_add_ndims` should be an `Int`, got `$add_ndims`"))
+    add_ndims ≥ 0 || error("Invalid precompile_add_ndims: `$add_ndims`")
+    return add_ndims
+end
+
+function validate_trace_ndims(trace_ndims)
+    trace_ndims isa Vector{Int} && length(trace_ndims) == 2 ||
+        throw(ArgumentError("`precompile_trace_ndims` should be a `Vector{Int}` of length 2, got `$trace_ndims`"))
+    all(≥(0), trace_ndims) || error("Invalid precompile_trace_ndims: `$trace_ndims`")
+    return trace_ndims
+end
+
+function validate_contract_ndims(contract_ndims)
+    contract_ndims isa Vector{Int} && length(contract_ndims) == 2 ||
+        throw(ArgumentError("`precompile_contract_ndims` should be a `Vector{Int}` of length 2, got `$contract_ndims`"))
+    all(≥(0), contract_ndims) ||
+        error("Invalid precompile_contract_ndims: `$contract_ndims`")
+    return contract_ndims
+end
+
+# Static preferences
+# ------------------
+const PRECOMPILE_ELTYPES = validate_precompile_eltypes(@load_preference("precompile_eltypes",
+                                                                        ["Float64",
+                                                                         "ComplexF64"]))
+const PRECOMPILE_ADD_NDIMS = validate_add_ndims(@load_preference("precompile_add_ndims", 5))
+const PRECOMPILE_TRACE_NDIMS = validate_trace_ndims(@load_preference("precompile_trace_ndims",
+                                                                     [4, 2]))
+const PRECOMPILE_CONTRACT_NDIMS = validate_contract_ndims(@load_preference("precompile_contract_ndims",
+                                                                           [4, 2]))
+
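+# As an illustration of the preferences machinery above (hypothetical values): a
+# LocalPreferences.toml file customizing the workload would contain a section like
+#
+#     [TensorOperations]
+#     precompile_eltypes = ["Float64", "ComplexF64", "Float32"]
+#     precompile_contract_ndims = [3, 2]
+#
+# These values are loaded and validated once, at precompile time, by the constants above.
+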
+# Using explicit precompile statements here instead of @compile_workload:
+# actually running the precompilation through PrecompileTools leads to longer compile times.
+# We keep the `workload_enabled` functionality to retain the option of disabling precompilation
+# in a manner compatible with the rest of the ecosystem.
+if PrecompileTools.workload_enabled(@__MODULE__)
+    # tensoradd!
+    # ----------
+    for T in PRECOMPILE_ELTYPES
+        for N in 0:PRECOMPILE_ADD_NDIMS
+            C = Array{T,N}
+            A = Array{T,N}
+            pA = Index2Tuple{N,0}
+
+            precompile(tensoradd!, (C, A, pA, Bool, One, Zero))
+            precompile(tensoradd!, (C, A, pA, Bool, T, Zero))
+            precompile(tensoradd!, (C, A, pA, Bool, T, T))
+
+            precompile(tensoralloc_add, (Type{T}, A, pA, Bool, Val{true}))
+            precompile(tensoralloc_add, (Type{T}, A, pA, Bool, Val{false}))
+        end
+    end
+
+    # tensortrace!
+    # ------------
+    for T in PRECOMPILE_ELTYPES
+        for N1 in 0:PRECOMPILE_TRACE_NDIMS[1], N2 in 0:PRECOMPILE_TRACE_NDIMS[2]
+            C = Array{T,N1}
+            A = Array{T,N1 + 2N2}
+            p = Index2Tuple{N1,0}
+            q = Index2Tuple{N2,N2}
+
+            precompile(tensortrace!, (C, A, p, q, Bool, One, Zero))
+            precompile(tensortrace!, (C, A, p, q, Bool, T, Zero))
+            precompile(tensortrace!, (C, A, p, q, Bool, T, T))
+
+            # allocation reuses tensoralloc_add
+        end
+    end
+
+    # tensorcontract!
+    # ---------------
+    for T in PRECOMPILE_ELTYPES
+        for N1 in 0:PRECOMPILE_CONTRACT_NDIMS[1], N2 in 0:PRECOMPILE_CONTRACT_NDIMS[2],
+            N3 in 0:PRECOMPILE_CONTRACT_NDIMS[1]
+
+            NA = N1 + N2
+            NB = N2 + N3
+            NC = N1 + N3
+            C, A, B = Array{T,NC}, Array{T,NA}, Array{T,NB}
+            pA = Index2Tuple{N1,N2}
+            pB = Index2Tuple{N2,N3}
+            pAB = Index2Tuple{NC,0}
+
+            precompile(tensorcontract!, (C, A, pA, Bool, B, pB, Bool, pAB, One, Zero))
+            precompile(tensorcontract!, (C, A, pA, Bool, B, pB, Bool, pAB, T, Zero))
+            precompile(tensorcontract!, (C, A, pA, Bool, B, pB, Bool, pAB, T, T))
+
+            precompile(tensoralloc_contract, (Type{T}, A, pA, Bool, B, pB, Bool, pAB, Val{true}))
+            precompile(tensoralloc_contract, (Type{T}, A, pA, Bool, B, pB, Bool, pAB, Val{false}))
+        end
+    end
+end