rip out cache_size

chriselrod · chriselrod · commit def5ad12f760 · 2022-08-31T13:38:47.000-04:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.12.124"
+version = "0.12.125"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
diff --git a/src/broadcast.jl b/src/broadcast.jl
@@ -548,8 +548,8 @@ end
   # we have an N dimensional loop.
   # need to construct the LoopSet
   ls = LoopSet(Mod)
-  inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, l1, l2, l3, threads, warncheckarg = UNROLL
-  set_hw!(ls, rs, rc, cls, l1, l2, l3)
+  inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, threads, warncheckarg = UNROLL
+  set_hw!(ls, rs, rc, cls)
   ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro
   loopsyms = [gensym!(ls, "n") for _ ∈ 1:N]
   add_broadcast_loops!(ls, loopsyms, :dest)
@@ -584,8 +584,8 @@ end
   # we have an N dimensional loop.
   # need to construct the LoopSet
   ls = LoopSet(Mod)
-  inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, l1, l2, l3, threads, warncheckarg = UNROLL
-  set_hw!(ls, rs, rc, cls, l1, l2, l3)
+  inline, u₁, u₂, v, isbroadcast, _, rs, rc, cls, threads, warncheckarg = UNROLL
+  set_hw!(ls, rs, rc, cls)
   ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro
   loopsyms = [gensym!(ls, "n") for _ ∈ 1:N]
   pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
@@ -626,7 +626,7 @@ end
   ::Val{UNROLL},
   ::Val{dontbc}
 ) where {T<:NativeTypes,N,T2<:Number,Mod,UNROLL,dontbc}
-  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, threads = UNROLL
+  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads = UNROLL
   quote
     $(Expr(:meta, :inline))
     arg = T(first(bc.args))
@@ -646,7 +646,7 @@ end
   ::Val{UNROLL},
   ::Val{dontbc}
 ) where {T<:NativeTypes,N,A<:AbstractArray{T,N},T2<:Number,Mod,UNROLL,dontbc}
-  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, threads = UNROLL
+  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, threads = UNROLL
   quote
     $(Expr(:meta, :inline))
     arg = T(first(bc.args))
diff --git a/src/codegen/lower_threads.jl b/src/codegen/lower_threads.jl
@@ -420,7 +420,7 @@ function thread_one_loops_expr(
   valid_thread_loop::Vector{Bool},
   ntmax::UInt,
   c::Float64,
-  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
+  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt},
   OPS::Expr,
   ARF::Expr,
   AM::Expr,
@@ -615,7 +615,7 @@ function thread_two_loops_expr(
   valid_thread_loop::Vector{Bool},
   ntmax::UInt,
   c::Float64,
-  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
+  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt},
   OPS::Expr,
   ARF::Expr,
   AM::Expr,
@@ -877,7 +877,7 @@ function valid_thread_loops(ls::LoopSet)
 end
 function avx_threads_expr(
   ls::LoopSet,
-  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
+  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt},
   nt::UInt,
   OPS::Expr,
   ARF::Expr,
diff --git a/src/codegen/split_loops.jl b/src/codegen/split_loops.jl
@@ -107,8 +107,7 @@ function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool)
   # it shouldn't.
   # Current behavior is incorrect when VECWIDTH chosen does actually differ between
   # split loops and the loops are statically sized, because code gen will then assume it is correct...
-  l1, l2, l3 = cache_sze(ls)
-  set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls), l1, l2, l3)
+  set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls))
   ls_new.vector_width = ls.vector_width
   fill_offset_memop_collection!(ls)
   # println("ls_new operations:")
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
@@ -550,28 +550,17 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet)
   preserve, shouldindbyind, roots
 end
 
-# first_cache() = ifelse(gt(num_cache_levels(), StaticInt{2}()), StaticInt{2}(), StaticInt{1}())
-# function _first_cache_size(::StaticInt{FCS}) where {FCS}
-#     L1inclusive = StaticInt{FCS}() - VectorizationBase.cache_size(One())
-#     ifelse(eq(first_cache(), StaticInt(2)) & VectorizationBase.cache_inclusive(StaticInt(2)), L1inclusive, StaticInt{FCS}())
-# end
-# _first_cache_size(::Nothing) = StaticInt(262144)
-# first_cache_size() = _first_cache_size(cache_size(first_cache()))
-
 @generated function _turbo_config_val(
   ::Val{CNFARG},
   ::StaticInt{W},
   ::StaticInt{RS},
   ::StaticInt{AR},
   ::StaticInt{NT},
   ::StaticInt{CLS},
-  ::StaticInt{L1},
-  ::StaticInt{L2},
-  ::StaticInt{L3},
-) where {CNFARG,W,RS,AR,CLS,L1,L2,L3,NT}
+) where {CNFARG,W,RS,AR,CLS,NT}
   inline, u₁, u₂, v, BROADCAST, thread = CNFARG
   nt = min(thread % UInt, NT % UInt)
-  t = Expr(:tuple, inline, u₁, u₂, v, BROADCAST, W, RS, AR, CLS, L1, L2, L3, nt)
+  t = Expr(:tuple, inline, u₁, u₂, v, BROADCAST, W, RS, AR, CLS, nt)
   length(CNFARG) == 7 && push!(t.args, CNFARG[7])
   Expr(:call, Expr(:curly, :Val, t))
 end
@@ -582,10 +571,7 @@ end
     register_size(),
     available_registers(),
     lv_max_num_threads(),
-    cache_linesize(),
-    cache_size(StaticInt(1)),
-    cache_size(StaticInt(2)),
-    cache_size(StaticInt(3)),
+    cache_linesize()
   )
 end
 function find_samename_constparent(op::Operation, opname::Symbol)
diff --git a/src/modeling/graphs.jl b/src/modeling/graphs.jl
@@ -451,7 +451,6 @@ mutable struct LoopSet
   register_size::Int
   register_count::Int
   cache_linesize::Int
-  cache_size::Tuple{Int,Int,Int}
   ureduct::Int
   equalarraydims::Vector{Tuple{Vector{Symbol},Vector{Int}}}
   omop::OffsetLoadCollection
@@ -499,11 +498,11 @@ function save_tilecost!(ls::LoopSet)
   end
   # ls.reg_pres[5,1] = ls.reg_pres[5,2]
 end
-function set_hw!(ls::LoopSet, rs::Int, rc::Int, cls::Int, l1::Int, l2::Int, l3::Int)
+function set_hw!(ls::LoopSet, rs::Int, rc::Int, cls::Int)
   ls.register_size = rs
   ls.register_count = rc
   ls.cache_linesize = cls
-  ls.cache_size = (l1, l2, l3)
+  # cache sizes (l1, l2, l3) are no longer stored on the LoopSet
   # ls.opmask_register[] = omr
   nothing
 end
@@ -514,16 +513,12 @@ function set_hw!(ls::LoopSet)
     ls,
     Int(register_size()),
     Int(available_registers()),
-    Int(cache_linesize()),
-    Int(cache_size(StaticInt(1))),
-    Int(cache_size(StaticInt(2))),
-    Int(cache_size(StaticInt(3))),
+    Int(cache_linesize())
   )
 end
 reg_size(ls::LoopSet) = ls.register_size
 reg_count(ls::LoopSet) = ls.register_count
 cache_lnsze(ls::LoopSet) = ls.cache_linesize
-cache_sze(ls::LoopSet) = ls.cache_size
 
 pushprepreamble!(ls::LoopSet, ex) = push!(ls.prepreamble.args, ex)
 function pushpreamble!(ls::LoopSet, op::Operation, v::Symbol)
@@ -608,7 +603,6 @@ function LoopSet(mod::Symbol)
   ls.register_size = 0
   ls.register_count = 0
   ls.cache_linesize = 0
-  ls.cache_size = (0, 0, 0)
   ls.ureduct = -1
   ls.equalarraydims = Tuple{Vector{Symbol},Vector{Int}}[]
   ls.omop = OffsetLoadCollection()
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
@@ -874,9 +874,9 @@ function avx_loopset!(
 end
 function avx_body(
   ls::LoopSet,
-  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
+  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt},
 )
-  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, nt = UNROLL
+  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt = UNROLL
   q =
     (iszero(u₁) & iszero(v)) ? lower_and_split_loops(ls, inline % Int) :
     lower(ls, u₁ % Int, u₂ % Int, v % Int, inline % Int)
@@ -916,14 +916,14 @@ function _turbo_loopset(
   @nospecialize(LPSYMsv),
   LBsv::Core.SimpleVector,
   vargs::Core.SimpleVector,
-  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
+  UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,UInt},
 )
   nops = length(OPSsv) ÷ 3
   instr = Instruction[Instruction(OPSsv[3i+1], OPSsv[3i+2]) for i ∈ 0:nops-1]
   ops = OperationStruct[OPSsv[3i] for i ∈ 1:nops]
   ls = LoopSet(:LoopVectorization)
-  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, nt = UNROLL
-  set_hw!(ls, rs, rc, cls, l1, l2, l3)
+  inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt = UNROLL
+  set_hw!(ls, rs, rc, cls)
   ls.vector_width = W
   ls.isbroadcast = isbroadcast
   arsv = Vector{ArrayRefStruct}(undef, length(ARFsv))
@@ -990,11 +990,11 @@ Execute an `@turbo` block. The block's code is represented via the arguments:
   post = hoist_constant_memory_accesses!(ls)
   # q = @show(avx_body(ls, var"#UNROLL#")); post === ls.preamble ? q : Expr(:block, q, post)
   q = if (last(var"#UNROLL#") > 1) && length(var"#LPSYM#") == length(ls.loops)
-    inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, nt = var"#UNROLL#"
+    inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt = var"#UNROLL#"
     # wrap in `var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#"` in `Expr` to homogenize types
     avx_threads_expr(
       ls,
-      (inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, one(UInt)),
+      (inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, one(UInt)),
       nt,
       :(Val{$(var"#OPS#")}()),
       :(Val{$(var"#ARF#")}()),
diff --git a/src/user_api_conveniences.jl b/src/user_api_conveniences.jl
@@ -31,9 +31,6 @@ function matmul_params(
     rs,
     rc,
     cls,
-    Int(cache_size(StaticInt(1))),
-    Int(cache_size(StaticInt(2))),
-    Int(cache_size(StaticInt(3))),
   )
   if N ≢ nothing
     nloop = GEMMLOOPSET.loops[1]