Skip to content

Commit d9ef1e0

Browse files
authored
M1 (#185)
* Improve M1 support
* apple silicon params
* No `strict=true` for `makedocs`
* Bump version
* Tilesearch updates
* update params
1 parent 8bfdecb commit d9ef1e0

File tree

6 files changed

+174
-108
lines changed

6 files changed

+174
-108
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "Octavian"
22
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
33
authors = ["Chris Elrod", "Dilum Aluthge", "Mason Protter", "contributors"]
4-
version = "0.3.26"
4+
version = "0.3.27"
55

66
[deps]
77
CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"

benchmark/tilesearch.jl

Lines changed: 30 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,19 @@ function matmul_pack_ab!(
1414
K = size(B, 1)
1515
zc, za, zb = Octavian.zstridedpointer.((C, A, B))
1616
nspawn = VectorizationBase.num_cores()
17-
threads, torelease = Octavian.PolyesterWeave.__request_threads(
18-
(nspawn - 1) % UInt32,
19-
Octavian.PolyesterWeave.worker_pointer(),
20-
nothing
21-
)
17+
nthreads = min(Int(nspawn), Threads.nthreads())
18+
# threads, torelease = Octavian.PolyesterWeave.__request_threads(
19+
# (nspawn - 1) % UInt32,
20+
# Octavian.PolyesterWeave.worker_pointer(),
21+
# nothing
22+
# )
2223
t = Inf
2324
GC.@preserve C A B begin
2425
for _ ∈ 1:2
2526
t = min(
2627
t,
2728
@elapsed(
28-
Octavian.matmul_pack_A_and_B!(
29+
Octavian.__matmul!(
2930
zc,
3031
za,
3132
zb,
@@ -34,7 +35,7 @@ function matmul_pack_ab!(
3435
M,
3536
K,
3637
N,
37-
threads,
38+
nthreads,
3839
F64(W₁),
3940
F64(W₂),
4041
F64(R₁),
@@ -44,7 +45,7 @@ function matmul_pack_ab!(
4445
)
4546
end
4647
end
47-
Octavian.PolyesterWeave.free_threads!(torelease)
48+
# Octavian.PolyesterWeave.free_threads!(torelease)
4849
return t
4950
end
5051

@@ -134,24 +135,28 @@ function matrix_range(S, ::Type{T} = Float64) where {T}
134135
end
135136

136137
T = Float32
137-
min_size = round(
138-
Int,
139-
sqrt(
140-
(0.65 / 4) *
141-
Octavian.num_cores() *
142-
Octavian.VectorizationBase.cache_size(Val(3)) / sizeof(T)
143-
)
138+
min_size = min(
139+
round(
140+
Int,
141+
sqrt(
142+
(0.65 / 4) * Octavian.num_cores() * Octavian.second_cache_size() /
143+
sizeof(T)
144+
)
145+
),
146+
2000
144147
)
145-
max_size = round(
146-
Int,
147-
sqrt(
148-
(32 / 4) *
149-
Octavian.num_cores() *
150-
Octavian.VectorizationBase.cache_size(Val(3)) / sizeof(T)
151-
)
148+
max_size = min(
149+
round(
150+
Int,
151+
sqrt(
152+
(32 / 4) * Octavian.num_cores() * Octavian.second_cache_size() /
153+
sizeof(T)
154+
)
155+
),
156+
10_000
152157
)
153158

154-
SR = size_range(max_size, min_size, 400);
159+
SR = size_range(max_size, min_size, 40);
155160
const CsConst, AsConst, BsConst = matrix_range(SR, T);
156161

157162
# using Hyperopt
@@ -213,7 +218,7 @@ init = Float64[
213218
Octavian.R₁Default(),
214219
Octavian.R₂Default()
215220
]
216-
lower = 0.75 .* init;
221+
lower = 0.25 .* init;
217222
# upper = [1.25init[1], 1.25init[2], 0.75*init[3] + 0.25, 0.75*init[4] + 0.25];
218223
upper = [0.9, 1.25init[2], 0.999, 0.999];
219224
# init = [0.001, 0.9754033943603924, 0.5711159869399494, 0.7547361860432168];
@@ -222,5 +227,5 @@ opt = Optim.optimize(
222227
matmul_objective,
223228
init,
224229
ParticleSwarm(; lower = lower, upper = upper),
225-
Optim.Options(; iterations = 10^6, time_limit = 14 * hours)
230+
Optim.Options(; iterations = 10^6, time_limit = 8 * hours)
226231
);

docs/make.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ makedocs(;
1616
"Getting Started" => "getting-started.md",
1717
"Public API" => "public-api.md",
1818
"Internals (Private)" => "internals.md"
19-
],
19+
]
2020
)
2121

2222
deploydocs(; repo = "github.com/JuliaLinearAlgebra/Octavian.jl")

src/funcptrs.jl

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11

2-
struct LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd} <: Function end
3-
function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(
2+
struct LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd,W₁,W₂,R₁,R₂} <: Function end
3+
function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd,W₁,W₂,R₁,R₂})(
44
p::Ptr{UInt}
5-
) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd}
5+
) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd,W₁,W₂,R₁,R₂}
66
offset, C = load(p, TC, 2 * sizeof(UInt))
77
offset, A = load(p, TA, offset)
88
offset, B = load(p, TB, offset)
@@ -11,11 +11,25 @@ function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(
1111
offset, M = load(p, Md, offset)
1212
offset, K = load(p, Kd, offset)
1313
offset, N = load(p, Nd, offset)
14-
_call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
14+
_call_loopmul!(
15+
C,
16+
A,
17+
B,
18+
α,
19+
β,
20+
M,
21+
K,
22+
N,
23+
Val{P}(),
24+
static(W₁),
25+
static(W₂),
26+
static(R₁),
27+
static(R₂)
28+
)
1529
_atomic_store!(p, SPIN)
1630
nothing
1731
end
18-
@inline _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false}) =
32+
@inline _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false}, W₁, W₂, R₁, R₂) =
1933
loopmul!(C, A, B, α, β, M, K, N)
2034
@inline function _call_loopmul!(
2135
C::StridedPointer{T},
@@ -26,31 +40,20 @@ end
2640
M,
2741
K,
2842
N,
29-
::Val{true}
43+
::Val{true},
44+
W₁,
45+
W₂,
46+
R₁,
47+
R₂
3048
) where {T}
31-
if M * K < ceil(Int, Float64(first_cache_size(Val(T)) * R₂Default()))
49+
if M * K < ceil(Int, Float64(first_cache_size(Val(T)) * R₂))
3250
packaloopmul!(C, A, B, α, β, M, K, N)
3351
return
3452
else
35-
matmul_st_only_pack_A!(
36-
C,
37-
A,
38-
B,
39-
α,
40-
β,
41-
M,
42-
K,
43-
N,
44-
W₁Default(),
45-
W₂Default(),
46-
R₁Default(),
47-
R₂Default()
48-
)
53+
matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, W₁, W₂, R₁, R₂)
4954
return
5055
end
5156
end
52-
call_loopmul!(C, A, B, α, β, M, K, N, ::Val{P}) where {P} =
53-
_call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
5457

5558
struct SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂} <: Function end
5659
function (::SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂})(
@@ -108,11 +111,15 @@ end
108111
M::Md,
109112
K::Kd,
110113
N::Nd,
111-
::Val{P}
112-
) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd}
114+
::Val{P},
115+
::StaticFloat64{W₁},
116+
::StaticFloat64{W₂},
117+
::StaticFloat64{R₁},
118+
::StaticFloat64{R₂}
119+
) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd,W₁,W₂,R₁,R₂}
113120
offset = store!(
114121
p,
115-
cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd}()),
122+
cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd,W₁,W₂,R₁,R₂}()),
116123
sizeof(UInt)
117124
)
118125
offset = store!(p, C, offset)
@@ -136,9 +143,13 @@ end
136143
K,
137144
N,
138145
tid::UInt32,
139-
::Val{P}
146+
::Val{P},
147+
W₁,
148+
W₂,
149+
R₁,
150+
R₂
140151
) where {P}
141-
launch(setup_matmul!, tid, C, A, B, α, β, M, K, N, Val{P}())
152+
launch(setup_matmul!, tid, C, A, B, α, β, M, K, N, Val{P}(), W₁, W₂, R₁, R₂)
142153
end
143154

144155
struct SyncMulLauncher{W₁,W₂,R₁,R₂} end

src/global_constants.jl

Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -18,45 +18,46 @@ MᵣW_mul_factor(::True) = StaticInt{4}()
1818
MᵣW_mul_factor(::False) = StaticInt{9}()
1919
MᵣW_mul_factor() = MᵣW_mul_factor(has_feature(Val(:x86_64_avx512f)))
2020

21-
22-
if Sys.ARCH === :aarch64 && (Sys.isapple() || occursin("apple", Sys.CPU_NAME::String))
23-
W₁Default() = StaticFloat64{0.23015506935919203}()
24-
W₂Default() = StaticFloat64{0.16967706087713014}()
25-
R₁Default() = StaticFloat64{0.9982516031563079}()
26-
R₂Default() = StaticFloat64{0.5167030291302886}()
21+
if Sys.ARCH === :aarch64 &&
22+
(Sys.isapple() || occursin("apple", Sys.CPU_NAME::String))
23+
W₁Default() = StaticFloat64{0.1464967933382345}()
24+
W₂Default() = StaticFloat64{0.07243228432052636}()
25+
R₁Default() = StaticFloat64{0.5024723443788641}()
26+
R₂Default() = StaticFloat64{0.9018940596921994}()
2727
else
28-
W₁Default(::True) = StaticFloat64{0.0007423708195588264}()
29-
W₂Default(::True) = StaticFloat64{0.7757548987718677}()
30-
R₁Default(::True) = StaticFloat64{0.7936663315339363}()
31-
R₂Default(::True) = StaticFloat64{0.7144577794375783}()
32-
33-
W₁Default_arch(::Val{:znver1}) = StaticFloat64{0.053918949422353986}()
34-
W₂Default_arch(::Val{:znver1}) = StaticFloat64{0.3013238122374886}()
35-
R₁Default_arch(::Val{:znver1}) = StaticFloat64{0.6077103834481342}()
36-
R₂Default_arch(::Val{:znver1}) = StaticFloat64{0.8775382433240162}()
37-
38-
W₁Default_arch(::Union{Val{:znver2},Val{:znver3}}) = StaticFloat64{0.1}()
39-
W₂Default_arch(::Union{Val{:znver2},Val{:znver3}}) =
40-
StaticFloat64{0.993489411720157}()
41-
R₁Default_arch(::Union{Val{:znver2},Val{:znver3}}) =
42-
StaticFloat64{0.6052218809954467}()
43-
R₂Default_arch(::Union{Val{:znver2},Val{:znver3}}) =
44-
StaticFloat64{0.7594052633561165}()
45-
46-
W₁Default_arch(_) = StaticFloat64{0.05846951331683783}()
47-
W₂Default_arch(_) = StaticFloat64{0.16070447575367697}()
48-
R₁Default_arch(_) = StaticFloat64{0.5370318382263098}()
49-
R₂Default_arch(_) = StaticFloat64{0.5584398748982029}()
50-
51-
W₁Default(::False) = W₁Default_arch(VectorizationBase.cpu_name())
52-
W₂Default(::False) = W₂Default_arch(VectorizationBase.cpu_name())
53-
R₁Default(::False) = R₁Default_arch(VectorizationBase.cpu_name())
54-
R₂Default(::False) = R₂Default_arch(VectorizationBase.cpu_name())
55-
56-
W₁Default() = W₁Default(has_feature(Val(:x86_64_avx512f)))
57-
W₂Default() = W₂Default(has_feature(Val(:x86_64_avx512f)))
58-
R₁Default() = R₁Default(has_feature(Val(:x86_64_avx512f)))
59-
R₂Default() = R₂Default(has_feature(Val(:x86_64_avx512f)))
28+
W₁Default(::True) = StaticFloat64{0.0007423708195588264}()
29+
W₂Default(::True) = StaticFloat64{0.7757548987718677}()
30+
R₁Default(::True) = StaticFloat64{0.7936663315339363}()
31+
R₂Default(::True) = StaticFloat64{0.7144577794375783}()
32+
33+
W₁Default_arch(::Val{:znver1}) = StaticFloat64{0.053918949422353986}()
34+
W₂Default_arch(::Val{:znver1}) = StaticFloat64{0.3013238122374886}()
35+
R₁Default_arch(::Val{:znver1}) = StaticFloat64{0.6077103834481342}()
36+
R₂Default_arch(::Val{:znver1}) = StaticFloat64{0.8775382433240162}()
37+
38+
W₁Default_arch(::Union{Val{:znver2},Val{:znver3}}) = StaticFloat64{0.1}()
39+
W₂Default_arch(::Union{Val{:znver2},Val{:znver3}}) =
40+
StaticFloat64{0.993489411720157}()
41+
R₁Default_arch(::Union{Val{:znver2},Val{:znver3}}) =
42+
StaticFloat64{0.6052218809954467}()
43+
R₂Default_arch(::Union{Val{:znver2},Val{:znver3}}) =
44+
StaticFloat64{0.7594052633561165}()
45+
46+
W₁Default_arch(_) = StaticFloat64{0.05846951331683783}()
47+
W₂Default_arch(_) = StaticFloat64{0.16070447575367697}()
48+
R₁Default_arch(_) = StaticFloat64{0.5370318382263098}()
49+
R₂Default_arch(_) = StaticFloat64{0.5584398748982029}()
50+
51+
W₁Default(::False) = W₁Default_arch(VectorizationBase.cpu_name())
52+
W₂Default(::False) = W₂Default_arch(VectorizationBase.cpu_name())
53+
R₁Default(::False) = R₁Default_arch(VectorizationBase.cpu_name())
54+
R₂Default(::False) = R₂Default_arch(VectorizationBase.cpu_name())
55+
56+
W₁Default() = W₁Default(has_feature(Val(:x86_64_avx512f)))
57+
W₂Default() = W₂Default(has_feature(Val(:x86_64_avx512f)))
58+
R₁Default() = R₁Default(has_feature(Val(:x86_64_avx512f)))
59+
R₂Default() = R₂Default(has_feature(Val(:x86_64_avx512f)))
60+
6061
end
6162

6263
# @static if Sys.ARCH === :x86_64 || Sys.ARCH === :i686
@@ -77,7 +78,8 @@ first_cache_size() = _first_cache_size(cache_size(first_cache()))
7778

7879
_second_cache_size(scs::StaticInt, ::True) = scs - cache_size(first_cache())
7980
_second_cache_size(scs::StaticInt, ::False) = scs
80-
@static if (Sys.isapple() || occursin("apple", Sys.CPU_NAME::String)) && Sys.ARCH === :aarch64
81+
@static if (Sys.isapple() || occursin("apple", Sys.CPU_NAME::String)) &&
82+
Sys.ARCH === :aarch64
8183
_second_cache_size(::StaticInt{0}, ::False) = StaticInt(100663296)
8284
else
8385
_second_cache_size(::StaticInt{0}, ::False) = StaticInt(3145728)

0 commit comments

Comments
 (0)