Skip to content

Commit f2e55be

Browse files
committed
StaticArrays required when loading GPUArrays
1 parent 30e8c8f commit f2e55be

File tree

4 files changed

+180
-45
lines changed

4 files changed

+180
-45
lines changed

REQUIRE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
julia 0.6
2+
StaticArrays

test/REQUIRE

Lines changed: 0 additions & 1 deletion
This file was deleted.

test/pde.jl

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
using CLArrays
2+
TY = Float32
3+
N = 2^9
4+
const h = TY(2*π/N)
5+
const epsn = TY(h * .5)
6+
const C = TY(2/epsn)
7+
const tau = TY(epsn * h)
8+
Tfinal = 50.
9+
10+
S(x,y) = exp(-x^2/0.1f0)*exp(-y^2/0.1f0)
11+
12+
ArrayType = CLArray
13+
# real-space and reciprocal-space grids
14+
# the real-space grid is just used for plotting!
15+
X_cpu = convert.(TY, collect(linspace(-pi+h, pi, N)) .* ones(1,N))
16+
X = ArrayType(X_cpu);
17+
k = collect([0:N/2; -N/2+1:-1]);
18+
 = ArrayType(convert.(TY,kron(k.^2, ones(1,N)) + kron(ones(N), k'.^2)));
19+
20+
# initial condition
21+
uc = ArrayType(TY(2.0)*(rand(TY, N, N)-TY(0.5)))
22+
23+
#################################################################
24+
#################################################################
25+
26+
pow3(u) = complex((u * u * u) - u)
27+
function take_step!(u, Â, t_plot, fftplan!, ifftplan!, u3fft, uc, tmp)
28+
u3fft .= pow3.(u)
29+
fftplan! * u3fft
30+
uc .= complex.(u)
31+
fftplan! * uc
32+
@. tmp .= ((1f0+C*tau*Â) .* uc .- tau/epsn *.* u3fft)) ./ (1f0+(epsn*tau)*Â.^2f0+C*tau*Â)
33+
ifftplan! * tmp
34+
u .= real.(tmp)
35+
nothing
36+
end
37+
function normalise_af!(u, out)
38+
out .= u .- minimum(u)
39+
out .= out ./ maximum(out)
40+
nothing
41+
end
42+
#################################################################
43+
#################################################################
44+
45+
n = 1
46+
T_plot = 0.01; t_plot = 0.0
47+
ceil(Tfinal / tau)
48+
println(typeof(uc))
49+
up = copy(uc)
50+
ucc = complex.(uc)
51+
fftplan! = plan_fft!(ucc)
52+
ifftplan! = plan_ifft!(ucc)
53+
u3fft = similar(ucc)
54+
tmp = similar(ucc)
55+
56+
using GLVisualize
57+
w = glscreen(); @async renderloop(w)
58+
normalise_af!(uc,up)
59+
up .= up .* 0.1f0
60+
robj = visualize(Array(up), :surface).children[]
61+
_view(robj)
62+
GLAbstraction.update!(robj[:position_z], Array(up))
63+
64+
@time for n = 1:10000
65+
isopen(w) || break
66+
# @show n
67+
take_step!(uc, Â, t_plot, fftplan!, ifftplan!, u3fft, ucc, tmp)
68+
t_plot += tau
69+
if t_plot >= T_plot
70+
normalise_af!(uc, up)
71+
up .= up .* 0.1f0
72+
GLAbstraction.update!(robj[:position_z], Array(up))
73+
end
74+
yield()
75+
end
76+
77+
78+
79+
80+
using CLArrays, GLVisualize, GPUArrays, GLAbstraction, GeometryTypes
81+
82+
# source: https://github.com/johnfgibson/julia-pde-benchmark/blob/master/1-Kuramoto-Sivashinksy-benchmark.ipynb
83+
function inner_ks(IFFT!, FFT!, Nt, Nn, Nn1, u, G, A_inv, B, dt2, dt32, uslice, U)
84+
for n = 1:Nt
85+
Nn1 .= Nn # shift nonlinear term in time
86+
Nn .= u # put u into Nn in prep for comp of nonlinear term
87+
88+
IFFT! * Nn
89+
90+
# plotting
91+
uslice .= real.(Nn) ./ 10f0
92+
U[1:Nt, n] = reshape(Array(uslice), (Nt, 1)) # copy from gpu to opengl gpu not implemented for now
93+
94+
# transform Nn to gridpt values, in place
95+
Nn .= Nn .* Nn # collocation calculation of u^2
96+
FFT!*Nn # transform Nn back to spectral coeffs, in place
97+
98+
Nn .= G .* Nn # compute Nn == -1/2 d/dx (u^2) = -u u_x
99+
100+
# loop fusion! Julia translates the folling line of code to a single for loop.
101+
u .= A_inv .* (B .* u .+ dt32 .* Nn .- dt2 .* Nn1)
102+
yield()
103+
end
104+
GPUArrays.synchronize(u)
105+
end
106+
107+
function execute(window)
108+
109+
T = Float32; AT = CLArray
110+
N = 1512
111+
Lx = T(64*pi)
112+
Nx = T(N)
113+
dt = T(1/16)
114+
115+
x = Lx*(0:Nx-1)/Nx
116+
u = T.(cos.(x) + 0.1*sin.(x/8) + 0.01*cos.((2*pi/Lx)*x))
117+
118+
u = AT((T(1)+T(0)im)*u) # force u to be complex
119+
Nx = length(u) # number of gridpoints
120+
kx = T.(vcat(0:Nx/2-1, 0:0, -Nx/2+1:-1))# integer wavenumbers: exp(2*pi*kx*x/L)
121+
alpha = T(2)*pi*kx/Lx # real wavenumbers: exp(alpha*x)
122+
123+
D = T(1)im*alpha # spectral D = d/dx operator
124+
125+
L = alpha.^2 .- alpha.^4 # spectral L = -D^2 - D^4 operator
126+
127+
G = AT(T(-0.5) .* D) # spectral -1/2 D operator, to eval -u u_x = 1/2 d/dx u^2
128+
129+
# convenience variables
130+
dt2 = T(dt/2)
131+
dt32 = T(3*dt/2)
132+
A_inv = AT((ones(T, Nx) - dt2*L).^(-1))
133+
B = AT(ones(T, Nx) + dt2*L)
134+
135+
# compute in-place FFTW plans
136+
FFT! = plan_fft!(u)
137+
IFFT! = plan_ifft!(u)
138+
139+
# compute nonlinear term Nn == -u u_x
140+
powed = u .* u
141+
Nn = G .* fft(powed); # Nn == -1/2 d/dx (u^2) = -u u_x
142+
Nn1 = copy(Nn); # Nn1 = Nn at first time step
143+
FFT! * u;
144+
145+
uslice = real(u)
146+
U = zeros(Float32, N, N)
147+
robj = visualize(
148+
U, :surface, color_norm = Vec2f0(-0.5, 0.5),
149+
ranges = ((-3f0, 3f0), (-3f0, 3f0))
150+
).children[]
151+
Ugpu = robj[:position_z]
152+
_view(robj)
153+
154+
bench = inner_ks((IFFT!), (FFT!), N, Nn, Nn1, u, G, A_inv, B, dt2, dt32, uslice, Ugpu)
155+
end
156+
157+
w = glscreen(); @async renderloop(w)
158+
159+
execute(w)

test/shared.jl

Lines changed: 20 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import SpecialFunctions: erfc
2-
using GPUArrays: gpu_sub2ind
2+
using CLArrays
33

44
function blackscholes(sptprice, strike, rate, volatility, time)
55
logterm = log( sptprice / strike)
@@ -20,49 +20,25 @@ function cndf2(x)
2020
0.5f0 + 0.5f0 * erfc(0.707106781f0 * x)
2121
end
2222

23-
# jeeez -.- So CUDAnative still doesn't recognize e.g. sqrt in the LLVM-IR,
24-
#since it's implemented within a C-library.... Should be fixed soon!
25-
function cu_blackscholes(sptprice, strike, rate, volatility, time)
26-
logterm = cu.log(sptprice / strike)
27-
powterm = .5f0 * volatility * volatility
28-
den = volatility * cu.sqrt(time)
29-
d1 = (((rate + powterm) * time) + logterm) / den
30-
d2 = d1 - den
31-
NofXd1 = cu_cndf2(d1)
32-
NofXd2 = cu_cndf2(d2)
33-
futureValue = strike * cu.exp(- rate * time)
34-
c1 = futureValue * NofXd2
35-
call_ = sptprice * NofXd1 - c1
36-
put = call_ - futureValue + sptprice
37-
return put
38-
end
23+
using BenchmarkTools
3924

40-
function cu_cndf2(x)
41-
0.5f0 + 0.5f0 * cu.erfc(0.707106781f0 * x)
42-
end
25+
T = Float32
26+
N = 10^7
27+
sptprice = T[42.0 for i = 1:N]
28+
initStrike = T[40.0 + (i / N) for i = 1:N]
29+
rate = T[0.5 for i = 1:N]
30+
volatility = T[0.2 for i = 1:N]
31+
time = T[0.5 for i = 1:N]
32+
result = similar(time)
33+
comparison = blackscholes.(sptprice, initStrike, rate, volatility, time)
34+
_sptprice = CLArray(sptprice)
35+
_initStrike = CLArray(initStrike)
36+
_rate = CLArray(rate)
37+
_volatility = CLArray(volatility)
38+
_time = CLArray(time)
39+
_result = CLArray(result)
4340

44-
for T in (Float32,)
45-
N = 60
46-
sptprice = T[42.0 for i = 1:N]
47-
initStrike = T[40.0 + (i / N) for i = 1:N]
48-
rate = T[0.5 for i = 1:N]
49-
volatility = T[0.2 for i = 1:N]
50-
time = T[0.5 for i = 1:N]
51-
result = similar(time)
52-
comparison = blackscholes.(sptprice, initStrike, rate, volatility, time)
53-
@allbackends "Blackscholes with $T" backend begin
54-
_sptprice = GPUArray(sptprice)
55-
_initStrike = GPUArray(initStrike)
56-
_rate = GPUArray(rate)
57-
_volatility = GPUArray(volatility)
58-
_time = GPUArray(time)
59-
_result = GPUArray(result)
60-
blackschole_f = if is_cudanative(backend)
61-
cu_blackscholes
62-
else
63-
blackscholes
64-
end
65-
_result .= blackschole_f.(_sptprice, _initStrike, _rate, _volatility, _time)
66-
@test Array(_result) comparison
67-
end
41+
@time begin
42+
_result .= blackscholes.(_sptprice, _initStrike, _rate, _volatility, _time)
43+
(GPUArrays.synchronize)(_result)
6844
end

0 commit comments

Comments
 (0)