
Commit f7ce86c

Doc updates, rename some internal functions.

Parent: 35d0474

10 files changed: +50 −42 lines

docs/src/devdocs/constructing_loopsets.md

Lines changed: 2 additions & 2 deletions
@@ -15,11 +15,11 @@ quote
     var"##vptr##_A" = LoopVectorization.stridedpointer(A)
     var"##vptr##_B" = LoopVectorization.stridedpointer(B)
     begin
-        $(Expr(:gc_preserve, :(LoopVectorization._avx_!(Val{(0, 0)}(), Tuple{:numericconstant, Symbol("##zero#270"), LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01), :LoopVectorization, :setindex!, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000007, LoopVectorization.memstore, 0x01, 0x02), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000013, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x03), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000032, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x03, 0x04), :numericconstant, Symbol("##reductzero#274"), LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000003, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x05), :LoopVectorization, :vfmadd_fast, LoopVectorization.OperationStruct(0x0000000000000132, 0x0000000000000003, 0x0000000000000000, 0x0000000000030405, LoopVectorization.compute, 0x00, 0x05), :LoopVectorization, :reduce_to_add, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000003, 0x0000000000000000, 0x0000000000000601, LoopVectorization.compute, 0x00, 0x01)}, Tuple{LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102, 0xffffffffffffe03b), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000103, 0xffffffffffffffd6), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000302, 0xffffffffffffe056), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102, 0xffffffffffffffd6)}, Tuple{0, Tuple{}, Tuple{}, Tuple{}, Tuple{}, Tuple{(1, LoopVectorization.IntOrFloat), (5, LoopVectorization.IntOrFloat)}, Tuple{}}, (LoopVectorization.StaticLowerUnitRange{0}(M), LoopVectorization.StaticLowerUnitRange{0}(N), LoopVectorization.StaticLowerUnitRange{0}(K)), var"##vptr##_C", var"##vptr##_A", var"##vptr##_B", var"##vptr##_C")), :C, :A, :B))
+        $(Expr(:gc_preserve, :(LoopVectorization._turbo_!(Val{(0, 0)}(), Tuple{:numericconstant, Symbol("##zero#270"), LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01), :LoopVectorization, :setindex!, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000007, LoopVectorization.memstore, 0x01, 0x02), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000013, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x03), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000032, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x03, 0x04), :numericconstant, Symbol("##reductzero#274"), LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000003, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x05), :LoopVectorization, :vfmadd_fast, LoopVectorization.OperationStruct(0x0000000000000132, 0x0000000000000003, 0x0000000000000000, 0x0000000000030405, LoopVectorization.compute, 0x00, 0x05), :LoopVectorization, :reduce_to_add, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000003, 0x0000000000000000, 0x0000000000000601, LoopVectorization.compute, 0x00, 0x01)}, Tuple{LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102, 0xffffffffffffe03b), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000103, 0xffffffffffffffd6), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000302, 0xffffffffffffe056), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102, 0xffffffffffffffd6)}, Tuple{0, Tuple{}, Tuple{}, Tuple{}, Tuple{}, Tuple{(1, LoopVectorization.IntOrFloat), (5, LoopVectorization.IntOrFloat)}, Tuple{}}, (LoopVectorization.StaticLowerUnitRange{0}(M), LoopVectorization.StaticLowerUnitRange{0}(N), LoopVectorization.StaticLowerUnitRange{0}(K)), var"##vptr##_C", var"##vptr##_A", var"##vptr##_B", var"##vptr##_C")), :C, :A, :B))
     end
 end
 ```
-When the corresponding method gets compiled for specific type of `A`, `B`, and `C`, the call to the `@generated` function `_avx_!` get compiled. This causes the summary to be [reconstructed](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/reconstruct_loopset.jl) using the available type information. This type information can be used, for example, to realize an array has been transposed, and thus correctly identify which axis contains contiguous elements that are efficient to load from. This kind of information cannot be extracted from the raw expression, which is why these decisions are made when the method gets compiled for specific types via the `@generated` function `_avx_!`.
+When the corresponding method gets compiled for specific types of `A`, `B`, and `C`, the call to the `@generated` function `_turbo_!` gets compiled. This causes the summary to be [reconstructed](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/reconstruct_loopset.jl) using the available type information. This type information can be used, for example, to realize an array has been transposed, and thus correctly identify which axis contains contiguous elements that are efficient to load from. This kind of information cannot be extracted from the raw expression, which is why these decisions are made when the method gets compiled for specific types via the `@generated` function `_turbo_!`.

 The three chief components of the summaries are the definitions of operations, e.g.:
 ```julia
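As a hedged illustration of the point above (the values here are only examples, not part of this commit): the raw loop expression cannot distinguish a matrix from its transpose, but the argument types can.

```julia
using LinearAlgebra, LoopVectorization

A  = rand(100, 100)
At = transpose(A)  # Transpose{Float64, Matrix{Float64}}

# The source expression `X[m, k]` reads identically for A and At, but their
# stridedpointer types differ, encoding which axis holds contiguous memory.
# The @generated _turbo_! sees those types and can choose unit-stride loads.
typeof(stridedpointer(A)) == typeof(stridedpointer(At))  # false
```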

src/LoopVectorization.jl

Lines changed: 4 additions & 3 deletions
@@ -51,7 +51,7 @@ using Requires


 export LowDimArray, stridedpointer, indices,
-    @avx, @avxt, @turbo, @tturbo, *ˡ, _avx_!,
+    @avx, @avxt, @turbo, @tturbo, *ˡ, _turbo_!,
     vmap, vmap!, vmapt, vmapt!, vmapnt, vmapnt!, vmapntt, vmapntt!,
     tanh_fast, sigmoid_fast,
     vfilter, vfilter!, vmapreduce, vreduce
@@ -97,8 +97,9 @@ include("broadcast.jl")
 LoopVectorization provides macros and functions that combine SIMD vectorization and
 loop-reordering so as to improve performance:

-- [`@avx`](@ref): transform `for`-loops and broadcasting
-- [`@_avx`](@ref): similar to `@avx` but does not use type information
+- [`@turbo`](@ref): transform `for`-loops and broadcasting
+- [`vmapreduce`](@ref): vectorized version of `mapreduce`
+- [`vreduce`](@ref): vectorized version of `reduce`
 - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!`
 - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
 - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
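A hedged usage sketch of the exported API enumerated in that docstring (illustrative only, not part of the diff):

```julia
using LoopVectorization

x = rand(1000); y = rand(1000);

s = vmapreduce(abs2, +, x)  # vectorized mapreduce
t = vreduce(+, x)           # vectorized reduce
z = vmap(+, x, y)           # vectorized map

@turbo for i in eachindex(x)  # loop transform: SIMD plus loop reordering
    y[i] = 2 * x[i] + y[i]
end
```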

src/codegen/loopstartstopmanager.jl

Lines changed: 1 addition & 1 deletion
@@ -554,7 +554,7 @@ function use_loop_induct_var!(
     for (i,isli) ∈ enumerate(li)
         ii = i + offset
         ind = indices[ii]
-        Wisz && push!(gespinds.args, staticexpr(0)) # wrong for `@_avx`...
+        Wisz && push!(gespinds.args, staticexpr(0)) # wrong for `@_turbo`...
         if !li[i] # if it wasn't set
             uliv[i] = 0
             push!(offsetprecalc_descript.args, 0)

src/codegen/lower_threads.jl

Lines changed: 17 additions & 17 deletions
@@ -1,26 +1,26 @@
-struct AVX{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV} <: Function end
+struct TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV} <: Function end

-# This should call the same `_avx_!(Val{UNROLL}(), Val{OPS}(), Val{ARF}(), Val{AM}(), Val{LPSYM}(), _vargs)` as normal so that this
+# This should call the same `_turbo_!(Val{UNROLL}(), Val{OPS}(), Val{ARF}(), Val{AM}(), Val{LPSYM}(), _vargs)` as normal so that this
 # hopefully shouldn't add much to compile time.

-function (::AVX{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV})(p::Ptr{UInt}) where {UNROLL,OPS,ARF,AM,LPSYM,K,LBV,FLBV<:Tuple{Vararg{Any,K}}}
+function (::TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV})(p::Ptr{UInt}) where {UNROLL,OPS,ARF,AM,LPSYM,K,LBV,FLBV<:Tuple{Vararg{Any,K}}}
     (_, _vargs) = ThreadingUtilities.load(p, FLBV, 2*sizeof(UInt))
     # Main.VARGS[Threads.threadid()] = first(_vargs)
    # Threads.threadid() == 2 && Core.println(typeof(_vargs))
-    ret = _avx_!(Val{UNROLL}(), Val{OPS}(), Val{ARF}(), Val{AM}(), Val{LPSYM}(), Val{LBV}(), _vargs...)
+    ret = _turbo_!(Val{UNROLL}(), Val{OPS}(), Val{ARF}(), Val{AM}(), Val{LPSYM}(), Val{LBV}(), _vargs...)
     ThreadingUtilities.store!(p, ret, Int(register_size()))
     nothing
 end
-@generated function Base.pointer(::AVX{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV}) where {UNROLL,OPS,ARF,AM,LPSYM,K,LBV,FLBV<:Tuple{Vararg{Any,K}}}
-    f = AVX{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV}()
+@generated function Base.pointer(::TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV}) where {UNROLL,OPS,ARF,AM,LPSYM,K,LBV,FLBV<:Tuple{Vararg{Any,K}}}
+    f = TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FLBV}()
     precompile(f, (Ptr{UInt},))
     quote
         $(Expr(:meta,:inline))
         @cfunction($f, Cvoid, (Ptr{UInt},))
     end
 end

-@inline function setup_avx_threads!(p::Ptr{UInt}, fptr::Ptr{Cvoid}, args::LBV) where {K,LBV<:Tuple{Vararg{Any,K}}}
+@inline function setup_turbo_threads!(p::Ptr{UInt}, fptr::Ptr{Cvoid}, args::LBV) where {K,LBV<:Tuple{Vararg{Any,K}}}
     offset = ThreadingUtilities.store!(p, fptr, sizeof(UInt))
     offset = ThreadingUtilities.store!(p, args, offset)
     nothing
@@ -32,7 +32,7 @@ struct StaticType{T} end
 @inline function avx_launch(
     ::Val{UNROLL}, ::Val{OPS}, ::Val{ARF}, ::Val{AM}, ::Val{LPSYM}, ::StaticType{LBV}, fargs::FARGS, tid
 ) where {UNROLL,OPS,ARF,AM,LPSYM,K,LBV<:Tuple{Vararg{Any,K}},FARGS}
-    ThreadingUtilities.launch(setup_avx_threads!, tid, pointer(AVX{UNROLL,OPS,ARF,AM,LPSYM,LBV,FARGS}()), fargs)
+    ThreadingUtilities.launch(setup_turbo_threads!, tid, pointer(TURBO{UNROLL,OPS,ARF,AM,LPSYM,LBV,FARGS}()), fargs)
 end

 # function approx_cbrt(x)
@@ -367,10 +367,10 @@ function thread_one_loops_expr(
         end
     end
     avxcall_args = Expr(:tuple, lastboundexpr, Symbol("#vargs#"))
-    _avx_call_ = :(_avx_!(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, Val(typeof(var"#avx#call#args#")), flatten_to_tuple(var"#avx#call#args#")...))
+    _turbo_call_ = :(_turbo_!(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, Val(typeof(var"#avx#call#args#")), flatten_to_tuple(var"#avx#call#args#")...))
     update_return_values = if length(ls.outer_reductions) > 0
         retv = loopset_return_value(ls, Val(false))
-        _avx_call_ = Expr(:(=), retv, _avx_call_)
+        _turbo_call_ = Expr(:(=), retv, _turbo_call_)
         outer_reduct_combine_expressions(ls, retv)
     else
         nothing
@@ -420,7 +420,7 @@ function thread_one_loops_expr(
             var"#threads#" = Polyester.UnsignedIteratorEarlyStop(var"#torelease#", 0x00000000)
         end
         var"#avx#call#args#" = $avxcall_args
-        $_avx_call_
+        $_turbo_call_
         var"##do#thread##" || $retexpr
         var"#thread#id#" = 0x00000000
         var"#thread#mask#" = Polyester.mask(var"#threads#")
@@ -516,12 +516,12 @@ function thread_two_loops_expr(
         end
     end
     avxcall_args = Expr(:tuple, lastboundexpr, Symbol("#vargs#"))
-    _avx_call_ = :(_avx_!(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, Val(typeof(var"#avx#call#args#")), flatten_to_tuple(var"#avx#call#args#")...))
-    # _avx_orig_ = :(_avx_!(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, var"#lv#tuple#args#"))
+    _turbo_call_ = :(_turbo_!(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, Val(typeof(var"#avx#call#args#")), flatten_to_tuple(var"#avx#call#args#")...))
+    # _turbo_orig_ = :(_turbo_!(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, var"#lv#tuple#args#"))
     update_return_values = if length(ls.outer_reductions) > 0
         retv = loopset_return_value(ls, Val(false))
-        _avx_call_ = Expr(:(=), retv, _avx_call_)
-        # _avx_orig_ = Expr(:(=), retv, _avx_orig_)
+        _turbo_call_ = Expr(:(=), retv, _turbo_call_)
+        # _turbo_orig_ = Expr(:(=), retv, _turbo_orig_)
         outer_reduct_combine_expressions(ls, retv)
     else
         nothing
@@ -535,7 +535,7 @@ function thread_two_loops_expr(
         $loopstart1
         $loopstart2
         # if var"#nthreads#" ≤ 1
-        #     $_avx_orig_
+        #     $_turbo_orig_
         #     return $retexpr
         # end
         $define_len1
@@ -614,7 +614,7 @@ function thread_two_loops_expr(
         end
         # @show $lastboundexpr
         var"#avx#call#args#" = $avxcall_args
-        $_avx_call_
+        $_turbo_call_
         var"##do#thread##" || $retexpr
         # @show $retv
         var"#thread#id#" = 0x00000000

src/condense_loopset.jl

Lines changed: 3 additions & 3 deletions
@@ -527,7 +527,7 @@ end
 # _first_cache_size(::Nothing) = StaticInt(262144)
 # first_cache_size() = _first_cache_size(cache_size(first_cache()))

-@generated function _avx_config_val(
+@generated function _turbo_config_val(
     ::Val{CNFARG}, ::StaticInt{W}, ::StaticInt{RS}, ::StaticInt{AR}, ::StaticInt{NT},
     ::StaticInt{CLS}, ::StaticInt{L1}, ::StaticInt{L2}, ::StaticInt{L3}
 ) where {CNFARG,W,RS,AR,CLS,L1,L2,L3,NT}
@@ -539,7 +539,7 @@ end
 @inline function avx_config_val(
     ::Val{CNFARG}, ::StaticInt{W}
 ) where {CNFARG,W}
-    _avx_config_val(
+    _turbo_config_val(
         Val{CNFARG}(), StaticInt{W}(), register_size(), available_registers(), lv_max_num_threads(),
         cache_linesize(), cache_size(StaticInt(1)), cache_size(StaticInt(2)), cache_size(StaticInt(3))
     )
@@ -666,7 +666,7 @@ function generate_call_types(
     argmeta = argmeta_and_consts_description(ls, arraysymbolinds)
     loop_bounds = loop_boundaries(ls, shouldindbyind)
     loop_syms = tuple_expr(QuoteNode, ls.loopsymbols)
-    func = debug ? lv(:_avx_loopset_debug) : lv(:_avx_!)
+    func = debug ? lv(:_turbo_loopset_debug) : lv(:_turbo_!)
     lbarg = debug ? Expr(:call, :typeof, loop_bounds) : loop_bounds
     configarg = (inline,u₁,u₂,ls.isbroadcast,thread)
     unroll_param_tup = Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), VECTORWIDTHSYMBOL)
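For context, `avx_config_val` queries hardware parameters (register size, cache line size, cache sizes) as `StaticInt`s, and the `@generated` `_turbo_config_val` folds them into a single `Val` at compile time. A hedged sketch of that folding idea, assuming `Static.jl`'s `StaticInt` (simplified, not the actual internals):

```julia
using Static: StaticInt  # assumption: Static.jl provides StaticInt

# RS and CLS are type parameters inside the @generated body, so they can be
# baked into the returned type with zero runtime cost.
@generated function config_val(::StaticInt{RS}, ::StaticInt{CLS}) where {RS,CLS}
    :(Val{$((RS, CLS))}())
end

config_val(StaticInt(64), StaticInt(64))  # Val{(64, 64)}()
```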

src/constructors.jl

Lines changed: 13 additions & 6 deletions
@@ -129,9 +129,9 @@ end
 Annotate a `for` loop, or a set of nested `for` loops whose bounds are constant across iterations, to optimize the computation. For example:

     function AmulB!(C, A, B)
-        @turbo for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
+        @turbo for m ∈ indices((A,C), 1), n ∈ indices((B,C), 2) # indices((A,C),1) == axes(A,1) == axes(C,1)
             Cₘₙ = zero(eltype(C))
-            for k ∈ 1:size(A,2)
+            for k ∈ indices((A,B), (2,1)) # indices((A,B), (2,1)) == axes(A,2) == axes(B,1)
                 Cₘₙ += A[m,k] * B[k,n]
             end
             C[m,n] = Cₘₙ
@@ -163,7 +163,7 @@ true
 Advanced users can customize the implementation of the `@turbo`-annotated block
 using keyword arguments:

-```
+```julia
 @turbo inline=false unroll=2 body
 ```
@@ -175,9 +175,9 @@ It is clamped to be between `1` and `min(Threads.nthreads(),LoopVectorization.num_cores())`.
 `false` is equivalent to `1`, and `true` is equivalent to `min(Threads.nthreads(),LoopVectorization.num_cores())`.

 `inline` is a Boolean. When `true`, `body` will be directly inlined
-into the function (via a forced-inlining call to `_avx_!`).
-When `false`, it wont force inlining of the call to `_avx_!` instead, letting Julia's own inlining engine
-determine whether the call to `_avx_!` should be inlined. (Typically, it won't.)
+into the function (via a forced-inlining call to `_turbo_!`).
+When `false`, it won't force inlining of the call to `_turbo_!`, instead letting Julia's own inlining engine
+determine whether the call to `_turbo_!` should be inlined. (Typically, it won't.)
 Sometimes not inlining can lead to substantially worse code generation, and >40% regressions, even in very
 large problems (2-d convolutions are a case where this has been observed).
 One can find some circumstances where `inline=true` is faster, and other circumstances
@@ -208,6 +208,8 @@ macro turbo(args...)
     turbo_macro(__module__, __source__, last(args), Base.front(args)...)
 end
 """
+    @tturbo
+
 Equivalent to `@turbo`, except it adds `thread=true` as the first keyword argument.
 Note that later arguments take precedence.
@@ -249,6 +251,11 @@ macro _turbo(arg, q)
     esc(Expr(:block, ls.prepreamble, lower(ls, u₁ % Int, u₂ % Int, -1)))
 end

+"""
+    @turbo_debug
+
+Returns a `LoopSet` object instead of evaluating the loops. Useful for debugging and introspection.
+"""
 macro turbo_debug(q)
     q = macroexpand(__module__, q)
     ls = LoopSet(q, __module__)
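The docstring change in the `AmulB!` example above swaps `1:size(...)` bounds for `indices`, which returns a range shared by the paired axes (per the comments in the diff, `indices((A,C),1) == axes(A,1) == axes(C,1)`). A hedged usage sketch (illustrative values):

```julia
using LoopVectorization

A = rand(3, 4); B = rand(4, 5); C = rand(3, 5);

indices((A, C), 1)       # the shared range axes(A,1) == axes(C,1)
indices((A, B), (2, 1))  # the shared range axes(A,2) == axes(B,1)
```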

src/modeling/graphs.jl

Lines changed: 1 addition & 1 deletion
@@ -564,7 +564,7 @@ end

 """
 Used internally to create symbols unique for this loopset.
-This is used so that identical loops will create identical `_avx_!` calls in the macroexpansions, hopefully reducing recompilation.
+This is used so that identical loops will create identical `_turbo_!` calls in the macroexpansions, hopefully reducing recompilation.
 """
 gensym!(ls::LoopSet, s) = Symbol("###$(s)###$(ls.symcounter += 1)###")
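A hedged sketch of the deterministic-naming idea behind `gensym!` (a simplified stand-in, not the real `LoopSet`): a per-object counter, unlike `Base.gensym`, means two textually identical loops expand to byte-identical `_turbo_!` calls and can share a compiled specialization.

```julia
mutable struct SymCounter
    count::Int
end

# Deterministic: the n-th request for name `s` always yields the same symbol,
# whereas Base.gensym() produces a different name on every expansion.
gensym!(c::SymCounter, s) = Symbol("###", s, "###", c.count += 1, "###")

c = SymCounter(0)
gensym!(c, "zero")  # Symbol("###zero###1###")
gensym!(c, "zero")  # Symbol("###zero###2###")
```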

src/precompile.jl

Lines changed: 2 additions & 2 deletions
@@ -1,14 +1,14 @@
 function _precompile_()
     ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
-    # Base.precompile(Tuple{typeof(which(_avx_!,(Val{UNROLL},Val{OPS},Val{ARF},Val{AM},Val{LPSYM},Tuple{LB, V},)).generator.gen),Any,Any,Any,Any,Any,Any,Any,Any,Type,Type,Type,Type,Any,Any}) # time: 1.0198073
+    # Base.precompile(Tuple{typeof(which(_turbo_!,(Val{UNROLL},Val{OPS},Val{ARF},Val{AM},Val{LPSYM},Tuple{LB, V},)).generator.gen),Any,Any,Any,Any,Any,Any,Any,Any,Type,Type,Type,Type,Any,Any}) # time: 1.0198073
     # Base.precompile(Tuple{typeof(gespf1),Any,Tuple{Any, VectorizationBase.NullStep}}) # time: 0.1096832
     Base.precompile(Tuple{typeof(turbo_macro),Module,LineNumberNode,Expr}) # time: 0.09183489
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float64, 1, 1, 0, (1,), Tuple{StaticInt{8}}, Tuple{StaticInt{1}}},Tuple{StaticInt{1}}}) # time: 0.05469272
     Base.precompile(Tuple{typeof(zerorangestart),UnitRange{Int}}) # time: 0.04291692
     Base.precompile(Tuple{Type{LoopSet},Symbol}) # time: 0.03362425
     Base.precompile(Tuple{typeof(recursive_muladd_search!),Expr,Vector{Any},Nothing,Bool,Bool}) # time: 0.029960306
     Base.precompile(Tuple{typeof(add_constant!),LoopSet,Float64,Vector{Symbol},Symbol,Int}) # time: 0.027501073
-    Base.precompile(Tuple{typeof(_avx_loopset),Any,Any,Any,Any,Core.SimpleVector,Core.SimpleVector,Tuple{Bool, Int8, Int8, Bool, Int, Int, Int, Int, Int, Int, Int, UInt}}) # time: 0.02345788
+    Base.precompile(Tuple{typeof(_turbo_loopset),Any,Any,Any,Any,Core.SimpleVector,Core.SimpleVector,Tuple{Bool, Int8, Int8, Bool, Int, Int, Int, Int, Int, Int, Int, UInt}}) # time: 0.02345788
     Base.precompile(Tuple{typeof(substitute_broadcast),Expr,Symbol,Bool,Int8,Int8,Int}) # time: 0.02281322
     Base.precompile(Tuple{typeof(push!),LoopSet,Expr,Int,Int}) # time: 0.022659862
     Base.precompile(Tuple{typeof(add_compute!),LoopSet,Symbol,Expr,Int,Int,Nothing}) # time: 0.02167476
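For reference, the `jl_generating_output` guard at the top of `_precompile_()` means the `Base.precompile` calls only run while Julia is emitting a precompile image. A hedged minimal example of the same idiom:

```julia
function _precompile_example_()
    # jl_generating_output returns 1 only during package precompilation or
    # sysimage generation, so this body is a no-op at normal runtime.
    ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
    Base.precompile(Tuple{typeof(sum), Vector{Float64}})
    nothing
end
```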
