Skip to content

Commit 7ff9fe3

Browse files
committed
fix more problematic gpu code
1 parent 4c5d2a8 commit 7ff9fe3

File tree

6 files changed

+32
-14
lines changed

6 files changed

+32
-14
lines changed

docs/code/TraceMakie.jl

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -192,6 +192,7 @@ begin
192192
mesh!(scene, catmesh, color=load(Makie.assetpath("diffusemap.png")))
193193
center!(scene)
194194
# 1.024328 seconds (16.94 M allocations: 5.108 GiB, 46.19% gc time, 81 lock conflicts)
195+
# 0.913530 seconds (16.93 M allocations: 5.108 GiB, 42.52% gc time, 57 lock conflicts)
195196
@time render_scene(scene)
196197
end
197198

@@ -206,5 +207,6 @@ begin
206207
surface!(scene, xs, ys, zs)
207208
center!(scene)
208209
# 1.598740s
210+
# 1.179450 seconds (17.30 M allocations: 5.126 GiB, 36.48% gc time, 94 lock conflicts)
209211
@time render_scene(scene)
210212
end

docs/code/basic-scene.jl

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,8 @@ begin
7575
img = reverse(film.framebuffer, dims=1)
7676
end
7777
# 6.296157 seconds (17.64 k allocations: 19.796 MiB, 0.13% gc time, 45 lock conflicts)
78-
78+
# After more GPU optimizations
79+
# 4.169616 seconds (17.37 k allocations: 19.777 MiB, 0.14% gc time, 20 lock conflicts)
7980

8081
camera_sample = Trace.get_camera_sample(integrator.sampler, Point2f(512))
8182
ray, ω = Trace.generate_ray_differential(integrator.camera, camera_sample)

src/accel/bvh.jl

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -267,7 +267,7 @@ end
267267
else
268268
if dir_is_neg[ln.split_axis] == Int32(2)
269269
nodes_to_visit[to_visit_offset] = current_node_i + Int32(1)
270-
current_node_i = Int32(ln.offset)
270+
current_node_i = ln.offset % Int32
271271
else
272272
nodes_to_visit[to_visit_offset] = ln.offset % Int32
273273
current_node_i += Int32(1)
@@ -293,7 +293,7 @@ end
293293

294294
to_visit_offset, current_node_i = Int32(1), Int32(1)
295295
nodes_to_visit = zeros(MVector{64,Int32})
296-
while true
296+
@inbounds while true
297297
ln = bvh.nodes[current_node_i]
298298
if intersect_p(ln.bounds, ray, inv_dir, dir_is_neg)
299299
if !ln.is_interior && ln.n_primitives > Int32(0)

src/integrators/sampler.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,7 @@ function (i::SamplerIntegrator)(scene::Scene, film)
7373
end
7474

7575
function get_material(bvh::BVHAccel, shape::Triangle)
76-
if shape.material_idx == 0
76+
@inbounds if shape.material_idx == 0
7777
return bvh.materials[1]
7878
else
7979
return bvh.materials[shape.material_idx]

src/textures/basic.jl

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -26,12 +26,13 @@ function Base.convert(::Type{Texture{ElType,N,T}}, ::NoTexture) where {ElType,N,
2626
end
2727

2828
function (c::Texture{T})(si::SurfaceInteraction)::T where {T<:TextureType}
29-
if c.isconst
29+
@inbounds if c.isconst
3030
return c.const_value
3131
else
32-
uv = Vec2f(1.0-si.uv[2], si.uv[1])
33-
idx = round.(Int, 1 .+ ((size(c.data) .- 1) .* uv))
34-
idx = clamp.(idx, 1, size(c.data))
32+
uv = Vec2f(1f0 - si.uv[2], si.uv[1])
33+
s = unsafe_trunc.(Int32, size(c.data))
34+
idx = map(x -> unsafe_trunc(Int32, x), Int32(1) .+ ((s .- Int32(1)) .* uv))
35+
idx = clamp.(idx, Int32(1), s)
3536
return c.data[idx...]
3637
end
3738
end

test/gpu-threading-benchmarks.jl

Lines changed: 20 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,7 @@ end
100100
if ω > 0.0f0
101101
hit, shape, si = Trace.intersect!(bvh, ray)
102102
if hit
103-
l = Trace.RGBSpectrum(si.core.n...)#simple_shading(bvh, shape, ray, si, l, 1, 8, lights)
103+
l = simple_shading(bvh, shape, ray, si, l, 1, 8, lights)
104104
end
105105
end
106106
return RGBf(l.c...)
@@ -118,23 +118,37 @@ import KernelAbstractions as KA
118118
end
119119
end
120120

121+
function launch_trace_image_ir!(img, camera, bvh, lights)
122+
backend = KA.get_backend(img)
123+
kernel! = ka_trace_image!(backend)
124+
open("test2.ir", "w") do io
125+
@device_code_llvm io begin
126+
kernel!(img, camera, bvh, lights, ndrange = size(img), workgroupsize = (16, 16))
127+
end
128+
end
129+
AMDGPU.synchronize(; stop_hostcalls=false)
130+
return img
131+
end
121132
function launch_trace_image!(img, camera, bvh, lights)
122133
backend = KA.get_backend(img)
123134
kernel! = ka_trace_image!(backend)
124-
kernel!(img, camera, bvh, lights, ndrange = size(img), workgroupsize = (16, 16))
135+
kernel!(img, camera, bvh, lights, ndrange=size(img), workgroupsize=(16, 16))
125136
KA.synchronize(backend)
126137
return img
127138
end
128-
129-
using CUDA
130-
ArrayType = CuArray
139+
using AMDGPU
140+
ArrayType = ROCArray
141+
# using CUDA
142+
# ArrayType = CuArray
131143
preserve = []
132144
gpu_bvh = to_gpu(ArrayType, bvh; preserve=preserve);
133145
gpu_img = ArrayType(zeros(RGBf, res, res));
134146
# launch_trace_image!(img, cam, bvh, lights);
135147
# @btime launch_trace_image!(img, cam, bvh, lights);
136148
# @btime launch_trace_image!(gpu_img, cam, gpu_bvh, lights);
137-
@btime launch_trace_image!(gpu_img, cam, gpu_bvh, lights);
149+
launch_trace_image!(gpu_img, cam, gpu_bvh, lights);
150+
@btime launch_trace_image!(img, cam, bvh, lights)
151+
# 76.420 ms (234 allocations: 86.05 KiB)
138152
Array(gpu_img)
139153

140154
function cu_trace_image!(img, camera, bvh, lights)

0 commit comments

Comments
 (0)