JuliaApproximation
diff --git a/‎.gitignore
Lines changed: 0 additions & 1 deletion b/‎.gitignore
Lines changed: 0 additions & 1 deletion
diff --git a/‎deps/Makefile
Lines changed: 0 additions & 30 deletions b/‎deps/Makefile
Lines changed: 0 additions & 30 deletions
diff --git a/‎deps/Rotations.c
Lines changed: 0 additions & 62 deletions b/‎deps/Rotations.c
Lines changed: 0 additions & 62 deletions
diff --git a/‎deps/build.jl
Lines changed: 0 additions & 4 deletions b/‎deps/build.jl
Lines changed: 0 additions & 4 deletions
diff --git a/‎src/FastTransforms.jl
Lines changed: 1 addition & 0 deletions b/‎src/FastTransforms.jl
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/SphericalHarmonics/slowplan.jl
Lines changed: 42 additions & 8 deletions b/‎src/SphericalHarmonics/slowplan.jl
Lines changed: 42 additions & 8 deletions
diff --git a/‎src/stepthreading.jl
Lines changed: 70 additions & 0 deletions b/‎src/stepthreading.jl
Lines changed: 70 additions & 0 deletions
@@ -1,3 +1,2 @@
 docs/build/
 docs/site/
-*/*.dylib
@@ -55,6 +55,7 @@ export triones, trizeros, trirand, trirandn, trievaluate
 #export fejer2, fejer_plan2, fejerweights2
 #export RecurrencePlan, forward_recurrence!, backward_recurrence
 
+include("stepthreading.jl")
 include("fftBigFloat.jl")
 include("specialfunctions.jl")
 include("clenshawcurtis.jl")
 
@@ -99,20 +99,53 @@ function RotationPlan(::Type{T}, n::Int) where T
     RotationPlan(layers, snm, cnm)
 end
 
-const rotpath = joinpath(Pkg.dir("FastTransforms"), "deps", "rotpar")
-
-function Base.A_mul_B!(P::RotationPlan{Float64}, A::AbstractMatrix{Float64})
-    M, N = size(A)
-    ccall((:julia_apply_givens, rotpath), Void, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Int64, Int64), A, P.snm, P.cnm, M, N)
+# Apply the plan's plane rotations to A in place; j and l both run downward here.
+function Base.A_mul_B!(P::RotationPlan, A::AbstractMatrix)
+    nrow, ncol = size(A)
+    sines, cosines = P.snm, P.cnm
+    @stepthreads for m = ncol÷2:-1:2
+        lo, hi = nrow*(2*m-1), nrow*(2*m)   # linear offsets of the two columns for this m
+        @inbounds for j = m:-2:2
+            shift = (j-2)*(2*nrow+3-j)÷2    # offset into snm/cnm for this j
+            for l = nrow-j:-1:1
+                s, c = sines[l+shift], cosines[l+shift]
+                p, q = l+lo, l+hi
+                x1, x2 = A[p], A[p+2]       # read all four entries before writing any
+                y1, y2 = A[q], A[q+2]
+                A[p] = c*x1 + s*x2
+                A[p+2] = c*x2 - s*x1
+                A[q] = c*y1 + s*y2
+                A[q+2] = c*y2 - s*y1
+            end
+        end
+    end
     A
 end
 
-function Base.At_mul_B!(P::RotationPlan{Float64}, A::AbstractMatrix{Float64})
-    M, N = size(A)
-    ccall((:julia_apply_givens_t, rotpath), Void, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Int64, Int64), A, P.snm, P.cnm, M, N)
+# Apply the transposed rotations to A in place; j and l both run upward here.
+function Base.At_mul_B!(P::RotationPlan, A::AbstractMatrix)
+    nrow, ncol = size(A)
+    sines, cosines = P.snm, P.cnm
+    @stepthreads for m = ncol÷2:-1:2
+        lo, hi = nrow*(2*m-1), nrow*(2*m)   # linear offsets of the two columns for this m
+        @inbounds for j = reverse(m:-2:2)
+            shift = (j-2)*(2*nrow+3-j)÷2    # offset into snm/cnm for this j
+            for l = 1:nrow-j
+                s, c = sines[l+shift], cosines[l+shift]
+                p, q = l+lo, l+hi
+                x1, x2 = A[p], A[p+2]       # read all four entries before writing any
+                y1, y2 = A[q], A[q+2]
+                A[p] = c*x1 - s*x2
+                A[p+2] = c*x2 + s*x1
+                A[q] = c*y1 - s*y2
+                A[q+2] = c*y2 + s*y1
+            end
+        end
+    end
     A
 end
 
+#=
 function Base.A_mul_B!(P::RotationPlan, A::AbstractMatrix)
     M, N = size(A)
     @inbounds for m = N÷2-2:-1:0
@@ -150,6 +183,7 @@ function Base.At_mul_B!(P::RotationPlan, A::AbstractMatrix)
     end
     A
 end
+=#
 
 Base.Ac_mul_B!(P::RotationPlan, A::AbstractMatrix) = At_mul_B!(P, A)
 
 
@@ -0,0 +1,70 @@
+# Build the expression that `@stepthreads` expands to.
+#
+# Arguments:
+#   iter  - loop header expression; `iter.args[1]` is the loop variable and
+#           `iter.args[2]` the range expression.
+#   lbody - loop body expression.
+#
+# Returns a quoted block that defines a closure over the range and runs it either
+# through `jl_threading_run` or, when already inside a threaded loop, directly.
+function _stepthreadsfor(iter,lbody)
+    lidx = iter.args[1]         # index
+    range = iter.args[2]
+    quote
+        local stepthreadsfor_fun
+        let range = $(esc(range))
+        function stepthreadsfor_fun(onethread=false)
+            r = range # Load into local variable
+            lenr = length(r)
+            # Interleave iterations among threads. In the single-thread fallback
+            # (nested loop) the caller must visit every position itself, so the
+            # stride is 1 rather than nthreads(); striding by nthreads() there
+            # would silently skip iterations.
+            if onethread
+                tid, nt = 1, 1
+            else
+                tid, nt = Threads.threadid(), Threads.nthreads()
+            end
+            # When tid > lenr the range below is empty, so surplus threads do nothing.
+            for i = tid:nt:lenr
+                # `i` is a 1-based position in `r`, not the loop value itself
+                local $(esc(lidx)) = Base.unsafe_getindex(r,i)
+                $(esc(lbody))
+            end
+        end
+        end
+        # Hack to make nested threaded loops kinda work
+        if Threads.threadid() != 1 || Threads.in_threaded_loop[]
+            # We are in a nested threaded loop
+            stepthreadsfor_fun(true)
+        else
+            Threads.in_threaded_loop[] = true
+            # the ccall is not expected to throw
+            ccall(:jl_threading_run, Ref{Void}, (Any,), stepthreadsfor_fun)
+            Threads.in_threaded_loop[] = false
+        end
+        nothing
+    end
+end
+"""
+    @stepthreads for i = range ... end
+
+Run a `for` loop on multiple threads with interleaved iterations: thread `t` of
+`nthreads()` executes positions `t, t+nthreads(), t+2nthreads(), ...` of the
+range. The loop evaluates to `nothing` and returns once all threads have
+finished. When used inside another threaded loop, the body is run on the
+calling thread only.
+
+Throws `ArgumentError` unless given exactly one `for`-loop expression.
+"""
+macro stepthreads(args...)
+    length(args) == 1 ||
+        throw(ArgumentError("wrong number of arguments in @stepthreads"))
+    ex = args[1]
+    isa(ex, Expr) ||
+        throw(ArgumentError("need an expression argument to @stepthreads"))
+    # only `for` loops are supported; header and body are handed to the builder
+    ex.head === :for ||
+        throw(ArgumentError("unrecognized argument to @stepthreads"))
+    return _stepthreadsfor(ex.args[1], ex.args[2])
+end
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,2 @@`
`1`	`1`	`docs/build/`
`2`	`2`	`docs/site/`
`3`		`-*/*.dylib`