add experimental multithreading via openmp

MikaelSlevinsky · MikaelSlevinsky · commit 16e0b1801e4e · 2018-02-18T14:49:46.000-06:00
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,6 @@
 docs/build/
-docs/site/
+docs/site/
+# Shared libraries built by deps/Makefile (macOS, Linux, Windows)
+*/*.dylib
+*/*.so
+*/*.dll
diff --git a/deps/Makefile b/deps/Makefile
@@ -0,0 +1,38 @@
+# Builds Rotations.c as executables (serial, parallel) and as shared libraries
+# (seriallib, parallellib); the default target builds both shared libraries.
+
+ifeq ($(OS), Windows_NT)
+	SLIB = dll
+else
+	UNAME := $(shell uname)
+	ifeq ($(UNAME), Darwin)
+		SLIB = dylib
+	else
+		SLIB = so
+	endif
+endif
+
+# Compiler; override on the command line, e.g. `make CC=gcc-7`.
+CC = gcc-4.9
+OBJ = Rotations.c
+CFLAGS = -std=c11 -Ofast -march=native
+LIBFLAGS = -shared -fPIC
+# Libraries go after the sources on the link line. libgomp is not listed:
+# -fopenmp links it for the parallel builds and the serial builds do not use it.
+LDLIBS = -lm
+
+.PHONY: all serial parallel seriallib parallellib
+
+all: seriallib parallellib
+
+serial:
+	$(CC) $(CFLAGS) $(OBJ) -o rot $(LDLIBS)
+
+parallel:
+	$(CC) -fopenmp $(CFLAGS) $(OBJ) -o rotpar $(LDLIBS)
+
+seriallib:
+	$(CC) $(CFLAGS) $(LIBFLAGS) $(OBJ) -o rot.$(SLIB) $(LDLIBS)
+
+parallellib:
+	$(CC) -fopenmp $(CFLAGS) $(LIBFLAGS) $(OBJ) -o rotpar.$(SLIB) $(LDLIBS)
diff --git a/deps/Rotations.c b/deps/Rotations.c
@@ -0,0 +1,82 @@
+#include <stdlib.h>
+#include <math.h>
+
+/* Without OpenMP, define stand-ins for the runtime queries so that code using
+   them still compiles. */
+#ifdef _OPENMP
+    #include <omp.h>
+#else
+    #define omp_get_thread_num() 0
+    #define omp_get_num_procs() 8
+#endif
+
+void julia_apply_givens(double * A, const double * snm, const double * cnm, const int N, const int M);
+void julia_apply_givens_t(double * A, const double * snm, const double * cnm, const int N, const int M);
+
+/* Empty entry point so the file can also be linked as an executable (the
+   `serial` and `parallel` Makefile targets). */
+int main(void) {
+    return 0;
+}
+
+/*
+ * Apply Givens rotations in place to A, addressed column-major with N rows:
+ * element (r, col) is A[r + N*col].
+ *
+ * For m = M/2 down to 2 the column pair (2m-1, 2m) is updated. For j = m, m-2,
+ * ... while j > 1, and l = N-1-j down to 0, rows l and l+2 of both columns are
+ * rotated with s = snm[k], c = cnm[k], where k = l + (j-2)*(2N+3-j)/2.
+ * Each m touches only its own two columns, so the m loop carries the OpenMP
+ * parallel-for.
+ *
+ * NOTE(review): the largest column index used is 2*(M/2), which equals M when
+ * M is even -- presumably callers always pass an odd column count; confirm.
+ */
+void julia_apply_givens(double * A, const double * snm, const double * cnm, const int N, const int M) {
+    #pragma omp parallel for schedule(dynamic)
+    for (int m = M/2; m > 1; m--) {
+        for (int j = m; j > 1; j = j-2) {
+            for (int l = N-1-j; l >= 0; l--){
+                const int k = l+(j-2)*(2*N+3-j)/2;
+                const double s = snm[k];
+                const double c = cnm[k];
+                const double a1 = A[l+N*(2*m-1)];
+                const double a2 = A[l+2+N*(2*m-1)];
+                const double a3 = A[l+N*(2*m)];
+                const double a4 = A[l+2+N*(2*m)];
+                A[l+N*(2*m-1)] = c*a1 + s*a2;
+                A[l+2+N*(2*m-1)] = c*a2 - s*a1;
+                A[l+N*(2*m)] = c*a3 + s*a4;
+                A[l+2+N*(2*m)] = c*a4 - s*a3;
+            }
+        }
+    }
+    return;
+}
+
+/*
+ * Transposed counterpart of julia_apply_givens: the same rotations with the
+ * sign of s flipped, visited in the opposite order (j ascending from 2 + m%2
+ * to m, l ascending from 0 to N-1-j). Arguments are as for julia_apply_givens.
+ */
+void julia_apply_givens_t(double * A, const double * snm, const double * cnm, const int N, const int M) {
+    #pragma omp parallel for schedule(dynamic)
+    for (int m = M/2; m > 1; m--) {
+        for (int j = 2+m%2; j <= m; j = j+2) {
+            for (int l = 0; l <= N-1-j; l++){
+                const int k = l+(j-2)*(2*N+3-j)/2;
+                const double s = snm[k];
+                const double c = cnm[k];
+                const double a1 = A[l+N*(2*m-1)];
+                const double a2 = A[l+2+N*(2*m-1)];
+                const double a3 = A[l+N*(2*m)];
+                const double a4 = A[l+2+N*(2*m)];
+                A[l+N*(2*m-1)] = c*a1 - s*a2;
+                A[l+2+N*(2*m-1)] = c*a2 + s*a1;
+                A[l+N*(2*m)] = c*a3 - s*a4;
+                A[l+2+N*(2*m)] = c*a4 + s*a3;
+            }
+        }
+    }
+    return;
+}
diff --git a/deps/build.jl b/deps/build.jl
@@ -0,0 +1,6 @@
+# Build the C libraries by running `make` in the package's deps directory.
+# The do-block form of `cd` restores the previous working directory afterwards,
+# even when `make` fails and `run` throws.
+cd(Pkg.dir("FastTransforms/deps/")) do
+    run(`make`)
+end
diff --git a/src/SphericalHarmonics/fastplan.jl b/src/SphericalHarmonics/fastplan.jl
@@ -8,7 +8,7 @@ struct FastSphericalHarmonicPlan{T} <: SphericalHarmonicPlan{T}
     B::Matrix{T}
 end
 
-function FastSphericalHarmonicPlan{T}(A::Matrix{T}, L::Int; opts...)
+function FastSphericalHarmonicPlan(A::Matrix{T}, L::Int; opts...) where T
     M, N = size(A)
     n = (N+1)÷2
     RP = RotationPlan(T, n-1)
diff --git a/src/SphericalHarmonics/slowplan.jl b/src/SphericalHarmonics/slowplan.jl
@@ -2,7 +2,7 @@ import Base.LinAlg: Givens, AbstractRotation
 
 ### These three A_mul_B! should be in Base, but for the time being they do not add methods to Base.A_mul_B!; they add methods to the internal A_mul_B!.
 
-function A_mul_B!{T<:Real}(G::Givens{T}, A::AbstractVecOrMat)
+function A_mul_B!(G::Givens{T}, A::AbstractVecOrMat) where T<:Real
     m, n = size(A, 1), size(A, 2)
     if G.i2 > m
         throw(DimensionMismatch("column indices for rotation are outside the matrix"))
@@ -28,7 +28,7 @@ function A_mul_B!(A::AbstractMatrix, G::Givens)
     return A
 end
 
-function A_mul_B!{T<:Real}(A::AbstractMatrix, G::Givens{T})
+function A_mul_B!(A::AbstractMatrix, G::Givens{T}) where T<:Real
     m, n = size(A, 1), size(A, 2)
     if G.i2 > n
         throw(DimensionMismatch("column indices for rotation are outside the matrix"))
@@ -45,7 +45,7 @@ struct Pnmp2toPlm{T} <: AbstractRotation{T}
     rotations::Vector{Givens{T}}
 end
 
-function Pnmp2toPlm{T}(::Type{T}, n::Int, m::Int)
+function Pnmp2toPlm(::Type{T}, n::Int, m::Int) where T
     G = Vector{Givens{T}}(n)
     @inbounds for ℓ = 1:n
         c = sqrt(T((2m+2)*(2ℓ+2m+3))/T((ℓ+2m+2)*(ℓ+2m+3)))
@@ -75,14 +75,55 @@ end
 
 struct RotationPlan{T} <: AbstractRotation{T}
     layers::Vector{Pnmp2toPlm{T}}
+    # Givens sines/cosines packed into flat vectors for the C kernels in
+    # deps/Rotations.c; entry (l, m) is stored at index l + (m*(2n+1-m))÷2 + 1.
+    snm::Vector{T}
+    cnm::Vector{T}
 end
 
-function RotationPlan{T}(::Type{T}, n::Int)
+# Build the Pnmp2toPlm layers together with the packed sine/cosine tables.
+function RotationPlan(::Type{T}, n::Int) where T
     layers = Vector{Pnmp2toPlm{T}}(n-1)
     @inbounds for m = 0:n-2
         layers[m+1] = Pnmp2toPlm(T, n-1-m, m)
     end
-    RotationPlan(layers)
+    # From here on `n` is one larger than the argument; the tables hold
+    # n*(n+1)÷2 entries, one for each pair (l, m) with l + m <= n-1.
+    n = n+1
+    snm = zeros(T, (n*(n+1))÷2)
+    cnm = zeros(T, (n*(n+1))÷2)
+    @inbounds for l = 0:n-1
+        for m = 0:n-l-1
+            nums = T((l+1)*(l+2))
+            numc = T((2*m+2)*(2*l+2*m+5))
+            den = T((l+2*m+3)*(l+2*m+4))
+            snm[l+(m*(2*n+1-m))÷2+1] = sqrt(nums/den)
+            cnm[l+(m*(2*n+1-m))÷2+1] = sqrt(numc/den)
+        end
+    end
+    RotationPlan(layers, snm, cnm)
+end
+
+# Path (without extension) of the OpenMP build of deps/Rotations.c.
+const rotpath = joinpath(Pkg.dir("FastTransforms"), "deps", "rotpar")
+
+# Float64 fast path: apply the rotations in place through the C kernel.
+# The C prototype takes `int` sizes, so they are passed as Cint (ccall converts
+# and throws if a dimension does not fit).
+# NOTE(review): the kernel indexes A as dense column-major storage with leading
+# dimension size(A, 1) — confirm callers never pass a strided view here.
+function Base.A_mul_B!(P::RotationPlan{Float64}, A::AbstractMatrix{Float64})
+    M, N = size(A)
+    ccall((:julia_apply_givens, rotpath), Void, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Cint, Cint), A, P.snm, P.cnm, M, N)
+    A
+end
+
+# Float64 fast path for the transposed rotations; same calling convention as
+# the method above.
+function Base.At_mul_B!(P::RotationPlan{Float64}, A::AbstractMatrix{Float64})
+    M, N = size(A)
+    ccall((:julia_apply_givens_t, rotpath), Void, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Cint, Cint), A, P.snm, P.cnm, M, N)
+    A
 end
 
 function Base.A_mul_B!(P::RotationPlan, A::AbstractMatrix)
@@ -135,7 +176,7 @@ struct SlowSphericalHarmonicPlan{T} <: SphericalHarmonicPlan{T}
     B::Matrix{T}
 end
 
-function SlowSphericalHarmonicPlan{T}(A::Matrix{T})
+function SlowSphericalHarmonicPlan(A::Matrix{T}) where T
     M, N = size(A)
     n = (N+1)÷2
     RP = RotationPlan(T, n-1)
diff --git a/src/SphericalHarmonics/sphfunctions.jl b/src/SphericalHarmonics/sphfunctions.jl
@@ -10,7 +10,7 @@ function sph_zero_spurious_modes!(A::AbstractMatrix)
     A
 end
 
-function sphrand{T}(::Type{T}, m::Int, n::Int)
+function sphrand(::Type{T}, m::Int, n::Int) where T
     A = zeros(T, m, 2n-1)
     for i = 1:m
         A[i,1] = rand(T)
@@ -24,7 +24,7 @@ function sphrand{T}(::Type{T}, m::Int, n::Int)
     A
 end
 
-function sphrandn{T}(::Type{T}, m::Int, n::Int)
+function sphrandn(::Type{T}, m::Int, n::Int) where T
     A = zeros(T, m, 2n-1)
     for i = 1:m
         A[i,1] = randn(T)
@@ -38,7 +38,7 @@ function sphrandn{T}(::Type{T}, m::Int, n::Int)
     A
 end
 
-function sphones{T}(::Type{T}, m::Int, n::Int)
+function sphones(::Type{T}, m::Int, n::Int) where T
     A = zeros(T, m, 2n-1)
     for i = 1:m
         A[i,1] = one(T)
@@ -52,7 +52,7 @@ function sphones{T}(::Type{T}, m::Int, n::Int)
     A
 end
 
-sphzeros{T}(::Type{T}, m::Int, n::Int) = zeros(T, m, 2n-1)
+sphzeros(::Type{T}, m::Int, n::Int) where T = zeros(T, m, 2n-1)
 
 function normalizecolumns!(A::AbstractMatrix)
     m, n = size(A)
diff --git a/src/SphericalHarmonics/synthesisanalysis.jl b/src/SphericalHarmonics/synthesisanalysis.jl
@@ -5,7 +5,7 @@ struct SynthesisPlan{T, P1, P2}
     temp::Vector{T}
 end
 
-function plan_synthesis{T<:fftwNumber}(A::Matrix{T})
+function plan_synthesis(A::Matrix{T}) where T<:fftwNumber
     m, n = size(A)
     x = FFTW.FakeArray(T, m)
     y = FFTW.FakeArray(T, n)
@@ -22,7 +22,7 @@ struct AnalysisPlan{T, P1, P2}
     temp::Vector{T}
 end
 
-function plan_analysis{T<:fftwNumber}(A::Matrix{T})
+function plan_analysis(A::Matrix{T}) where T<:fftwNumber
     m, n = size(A)
     x = FFTW.FakeArray(T, m)
     y = FFTW.FakeArray(T, n)
@@ -32,7 +32,7 @@ function plan_analysis{T<:fftwNumber}(A::Matrix{T})
     AnalysisPlan(planθ, planφ, C, zeros(T, n))
 end
 
-function Base.A_mul_B!{T}(Y::Matrix{T}, P::SynthesisPlan{T}, X::Matrix{T})
+function Base.A_mul_B!(Y::Matrix{T}, P::SynthesisPlan{T}, X::Matrix{T}) where T
     M, N = size(X)
 
     # Column synthesis
@@ -73,7 +73,7 @@ function Base.A_mul_B!{T}(Y::Matrix{T}, P::SynthesisPlan{T}, X::Matrix{T})
     Y
 end
 
-function Base.A_mul_B!{T}(Y::Matrix{T}, P::AnalysisPlan{T}, X::Matrix{T})
+function Base.A_mul_B!(Y::Matrix{T}, P::AnalysisPlan{T}, X::Matrix{T}) where T
     M, N = size(X)
 
     # Row analysis
@@ -112,7 +112,7 @@ end
 
 
 
-function row_analysis!{T}(P, C, vals::Vector{T})
+function row_analysis!(P, C, vals::Vector{T}) where T
     n = length(vals)
     cfs = scale!(two(T)/n,P*vals)
     cfs[1] *= half(T)
@@ -123,7 +123,7 @@ function row_analysis!{T}(P, C, vals::Vector{T})
     negateeven!(reverseeven!(A_mul_B!(C, cfs)))
 end
 
-function row_synthesis!{T}(P, C, cfs::Vector{T})
+function row_synthesis!(P, C, cfs::Vector{T}) where T
     n = length(cfs)
     Ac_mul_B!(C, reverseeven!(negateeven!(cfs)))
     if iseven(n)
@@ -171,17 +171,17 @@ function negateeven!(x::Vector)
     x
 end
 
-function A_mul_B_col_J!{T}(Y::Matrix{T}, P::r2rFFTWPlan{T}, X::Matrix{T}, J::Int)
+function A_mul_B_col_J!(Y::Matrix{T}, P::r2rFFTWPlan{T}, X::Matrix{T}, J::Int) where T
     unsafe_execute_col_J!(P, X, Y, J)
     return Y
 end
 
-function unsafe_execute_col_J!{T<:fftwDouble}(plan::r2rFFTWPlan{T}, X::Matrix{T}, Y::Matrix{T}, J::Int)
+function unsafe_execute_col_J!(plan::r2rFFTWPlan{T}, X::Matrix{T}, Y::Matrix{T}, J::Int) where T<:fftwDouble
     M = size(X, 1)
     ccall((:fftw_execute_r2r, libfftw), Void, (PlanPtr, Ptr{T}, Ptr{T}), plan, pointer(X, M*(J-1)+1), pointer(Y, M*(J-1)+1))
 end
 
-function unsafe_execute_col_J!{T<:fftwSingle}(plan::r2rFFTWPlan{T}, X::Matrix{T}, Y::Matrix{T}, J::Int)
+function unsafe_execute_col_J!(plan::r2rFFTWPlan{T}, X::Matrix{T}, Y::Matrix{T}, J::Int) where T<:fftwSingle
     M = size(X, 1)
     ccall((:fftwf_execute_r2r, libfftwf), Void, (PlanPtr, Ptr{T}, Ptr{T}), plan, pointer(X, M*(J-1)+1), pointer(Y, M*(J-1)+1))
 end
diff --git a/src/SphericalHarmonics/thinplan.jl b/src/SphericalHarmonics/thinplan.jl
@@ -12,7 +12,7 @@ struct ThinSphericalHarmonicPlan{T} <: SphericalHarmonicPlan{T}
     B::Matrix{T}
 end
 
-function ThinSphericalHarmonicPlan{T}(A::Matrix{T}, L::Int; opts...)
+function ThinSphericalHarmonicPlan(A::Matrix{T}, L::Int; opts...) where T
     M, N = size(A)
     n = (N+1)÷2
     RP = RotationPlan(T, n-1)
diff --git a/src/TriangularHarmonics/slowplan.jl b/src/TriangularHarmonics/slowplan.jl
@@ -1,8 +1,8 @@
-immutable Pnmp1toPlm{T} <: AbstractRotation{T}
+struct Pnmp1toPlm{T} <: AbstractRotation{T}
     rotations::Vector{Givens{T}}
 end
 
-function Pnmp1toPlm{T}(::Type{T}, n::Int, m::Int, α::T, β::T, γ::T)
+function Pnmp1toPlm(::Type{T}, n::Int, m::Int, α::T, β::T, γ::T) where T
     G = Vector{Givens{T}}(n)
     @inbounds for ℓ = 1:n
         c = sqrt((2m+β+γ+2)/(ℓ+2m+β+γ+2)*(2ℓ+2m+α+β+γ+2)/(ℓ+2m+α+β+γ+2))
@@ -29,11 +29,11 @@ function Base.A_mul_B!(A::AbstractMatrix, C::Pnmp1toPlm)
     A
 end
 
-immutable TriRotationPlan{T} <: AbstractRotation{T}
+struct TriRotationPlan{T} <: AbstractRotation{T}
     layers::Vector{Pnmp1toPlm{T}}
 end
 
-function TriRotationPlan{T}(::Type{T}, n::Int, α::T, β::T, γ::T)
+function TriRotationPlan(::Type{T}, n::Int, α::T, β::T, γ::T) where T
     layers = Vector{Pnmp1toPlm{T}}(n)
     @inbounds for m = 0:n-1
         layers[m+1] = Pnmp1toPlm(T, n-m, m, α, β, γ)
@@ -76,14 +76,14 @@ end
 Base.Ac_mul_B!(P::TriRotationPlan, A::AbstractMatrix) = At_mul_B!(P, A)
 
 
-immutable SlowTriangularHarmonicPlan{T} <: TriangularHarmonicPlan{T}
+struct SlowTriangularHarmonicPlan{T} <: TriangularHarmonicPlan{T}
     RP::TriRotationPlan{T}
     p::NormalizedLegendreToChebyshevPlan{T}
     pinv::ChebyshevToNormalizedLegendrePlan{T}
     B::Matrix{T}
 end
 
-function SlowTriangularHarmonicPlan{T}(A::Matrix{T}, α, β, γ)
+function SlowTriangularHarmonicPlan(A::Matrix{T}, α, β, γ) where T
     @assert β == γ == -half(T)
     @assert α == zero(T)
     M, N = size(A)
diff --git a/src/TriangularHarmonics/trifunctions.jl b/src/TriangularHarmonics/trifunctions.jl
@@ -8,7 +8,7 @@ function tri_zero_spurious_modes!(A::AbstractMatrix)
     A
 end
 
-function trirand{T}(::Type{T}, m::Int, n::Int)
+function trirand(::Type{T}, m::Int, n::Int) where T
     A = zeros(T, m, n)
     for j = 1:n
         for i = 1:m+1-j
@@ -18,7 +18,7 @@ function trirand{T}(::Type{T}, m::Int, n::Int)
     A
 end
 
-function trirandn{T}(::Type{T}, m::Int, n::Int)
+function trirandn(::Type{T}, m::Int, n::Int) where T
     A = zeros(T, m, n)
     for j = 1:n
         for i = 1:m+1-j
@@ -28,7 +28,7 @@ function trirandn{T}(::Type{T}, m::Int, n::Int)
     A
 end
 
-function triones{T}(::Type{T}, m::Int, n::Int)
+function triones(::Type{T}, m::Int, n::Int) where T
     A = zeros(T, m, n)
     for j = 1:n
         for i = 1:m+1-j
@@ -38,7 +38,7 @@ function triones{T}(::Type{T}, m::Int, n::Int)
     A
 end
 
-trizeros{T}(::Type{T}, m::Int, n::Int) = zeros(T, m, n)
+trizeros(::Type{T}, m::Int, n::Int) where T = zeros(T, m, n)
 
 doc"""
 Pointwise evaluation of triangular harmonic: