Skip to content

Commit 1cbd6c9

Browse files
authored
Merge pull request #186 from omlins/warp1
Add support for architecture-agnostic warp-level functions
2 parents d3b6567 + 1d571ce commit 1cbd6c9

14 files changed

+1785
-1105
lines changed

examples/diffusion2D_shmem_novis.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ end
1313
ty = @threadIdx().y + 1
1414
T_l = @sharedMem(eltype(T), (@blockDim().x+2, @blockDim().y+2))
1515
T_l[tx,ty] = T[ix,iy]
16-
if (ix>1 && ix<size(T2,1) && iy>1 && iy<size(T2,2))
16+
if (1<ix<size(T2,1) && 1<iy<size(T2,2))
1717
if (@threadIdx().x == 1) T_l[tx-1,ty] = T[ix-1,iy] end
1818
if (@threadIdx().x == @blockDim().x) T_l[tx+1,ty] = T[ix+1,iy] end
1919
if (@threadIdx().y == 1) T_l[tx,ty-1] = T[ix,iy-1] end

examples/diffusion3D_multigpucpu_hidecomm_parindices_novis.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ else
88
end
99

1010
@parallel_indices (ix,iy,iz) function diffusion3D_step!(T2, T, Ci, lam, dt, _dx, _dy, _dz)
11-
if (ix>1 && ix<size(T2,1) && iy>1 && iy<size(T2,2) && iz>1 && iz<size(T2,3))
11+
if (1<ix<size(T2,1) && 1<iy<size(T2,2) && 1<iz<size(T2,3))
1212
T2[ix,iy,iz] = T[ix,iy,iz] + dt*(Ci[ix,iy,iz]*(
1313
- ((-lam*(T[ix+1,iy,iz] - T[ix,iy,iz])*_dx) - (-lam*(T[ix,iy,iz] - T[ix-1,iy,iz])*_dx))*_dx
1414
- ((-lam*(T[ix,iy+1,iz] - T[ix,iy,iz])*_dy) - (-lam*(T[ix,iy,iz] - T[ix,iy-1,iz])*_dy))*_dy

src/ParallelKernel/ParallelKernel.jl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,17 @@ Enables writing parallel high-performance kernels and whole applications that ca
3232
- [`@threadIdx`](@ref)
3333
- [`@sync_threads`](@ref)
3434
- [`@sharedMem`](@ref)
35+
!!! note "Warp-level primitives"
36+
- [`@warpsize`](@ref)
37+
- [`@laneid`](@ref)
38+
- [`@active_mask`](@ref)
39+
- [`@shfl_sync`](@ref)
40+
- [`@shfl_up_sync`](@ref)
41+
- [`@shfl_down_sync`](@ref)
42+
- [`@shfl_xor_sync`](@ref)
43+
- [`@vote_any_sync`](@ref)
44+
- [`@vote_all_sync`](@ref)
45+
- [`@vote_ballot_sync`](@ref)
3546
3647
# Submodules
3748
- [`ParallelKernel.AD`](@ref)
@@ -74,6 +85,7 @@ include("FieldAllocators.jl")
7485
## Exports
7586
export @init_parallel_kernel, @parallel, @hide_communication, @parallel_indices, @parallel_async, @synchronize, @zeros, @ones, @rand, @falses, @trues, @fill, @fill!, @CellType
7687
export @gridDim, @blockIdx, @blockDim, @threadIdx, @sync_threads, @sharedMem, @pk_show, @pk_println, @∀
88+
export @warpsize, @laneid, @active_mask, @shfl_sync, @shfl_up_sync, @shfl_down_sync, @shfl_xor_sync, @vote_any_sync, @vote_all_sync, @vote_ballot_sync
7789
export PKNumber
7890

7991
end # Module ParallelKernel

src/ParallelKernel/kernel_language.jl

Lines changed: 304 additions & 0 deletions
Large diffs are not rendered by default.

src/ParallelStencil.jl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,17 @@ https://github.com/omlins/ParallelStencil.jl
3232
- [`@threadIdx`](@ref)
3333
- [`@sync_threads`](@ref)
3434
- [`@sharedMem`](@ref)
35+
!!! note "Warp-level primitives"
36+
- [`@warpsize`](@ref)
37+
- [`@laneid`](@ref)
38+
- [`@active_mask`](@ref)
39+
- [`@shfl_sync`](@ref)
40+
- [`@shfl_up_sync`](@ref)
41+
- [`@shfl_down_sync`](@ref)
42+
- [`@shfl_xor_sync`](@ref)
43+
- [`@vote_any_sync`](@ref)
44+
- [`@vote_all_sync`](@ref)
45+
- [`@vote_ballot_sync`](@ref)
3546
3647
# Submodules
3748
- [`ParallelStencil.AD`](@ref)
@@ -60,8 +71,11 @@ using .ParallelKernel.Exceptions
6071
include("shared.jl")
6172

6273
## Alphabetical include of function files
74+
include("allocators.jl")
75+
include("hide_communication.jl")
6376
include("init_parallel_stencil.jl")
6477
include("kernel_language.jl")
78+
include("memopt.jl")
6579
include("parallel.jl")
6680
include("reset_parallel_stencil.jl")
6781

@@ -74,6 +88,7 @@ include("FiniteDifferences.jl")
7488
export @init_parallel_stencil, FiniteDifferences1D, FiniteDifferences2D, FiniteDifferences3D, AD
7589
export @parallel, @hide_communication, @parallel_indices, @parallel_async, @synchronize, @zeros, @ones, @rand, @falses, @trues, @fill, @fill!, @CellType
7690
export @gridDim, @blockIdx, @blockDim, @threadIdx, @sync_threads, @sharedMem, @ps_show, @ps_println, @∀
91+
export @warpsize, @laneid, @active_mask, @shfl_sync, @shfl_up_sync, @shfl_down_sync, @shfl_xor_sync, @vote_any_sync, @vote_all_sync, @vote_ballot_sync
7792
export PSNumber
7893

7994
end # Module ParallelStencil

src/allocators.jl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# ParallelStencil front end for `@zeros`: after `check_initialized(__module__)`, all arguments are
# forwarded unchanged to `ParallelStencil.ParallelKernel.@zeros`. The docstring is
# `ParallelKernel.ZEROS_DOC` with "@init_parallel_kernel" replaced by "@init_parallel_stencil".
@doc replace(ParallelKernel.ZEROS_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro zeros(args...)
    check_initialized(__module__)  # presumably rejects callers that have not initialized ParallelStencil -- defined elsewhere
    forwarded_call = :(ParallelStencil.ParallelKernel.@zeros($(args...)))
    return esc(forwarded_call)     # escaped so the forwarded call resolves in the caller's scope
end
2+
# ParallelStencil front end for `@ones`: after `check_initialized(__module__)`, all arguments are
# forwarded unchanged to `ParallelStencil.ParallelKernel.@ones`. The docstring is
# `ParallelKernel.ONES_DOC` with "@init_parallel_kernel" replaced by "@init_parallel_stencil".
@doc replace(ParallelKernel.ONES_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro ones(args...)
    check_initialized(__module__)  # presumably rejects callers that have not initialized ParallelStencil -- defined elsewhere
    forwarded_call = :(ParallelStencil.ParallelKernel.@ones($(args...)))
    return esc(forwarded_call)     # escaped so the forwarded call resolves in the caller's scope
end
3+
# ParallelStencil front end for `@rand`: after `check_initialized(__module__)`, all arguments are
# forwarded unchanged to `ParallelStencil.ParallelKernel.@rand`. The docstring is
# `ParallelKernel.RAND_DOC` with "@init_parallel_kernel" replaced by "@init_parallel_stencil".
@doc replace(ParallelKernel.RAND_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro rand(args...)
    check_initialized(__module__)  # presumably rejects callers that have not initialized ParallelStencil -- defined elsewhere
    forwarded_call = :(ParallelStencil.ParallelKernel.@rand($(args...)))
    return esc(forwarded_call)     # escaped so the forwarded call resolves in the caller's scope
end
4+
# ParallelStencil front end for `@falses`: after `check_initialized(__module__)`, all arguments are
# forwarded unchanged to `ParallelStencil.ParallelKernel.@falses`. The docstring is
# `ParallelKernel.FALSES_DOC` with "@init_parallel_kernel" replaced by "@init_parallel_stencil".
@doc replace(ParallelKernel.FALSES_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro falses(args...)
    check_initialized(__module__)  # presumably rejects callers that have not initialized ParallelStencil -- defined elsewhere
    forwarded_call = :(ParallelStencil.ParallelKernel.@falses($(args...)))
    return esc(forwarded_call)     # escaped so the forwarded call resolves in the caller's scope
end
5+
# ParallelStencil front end for `@trues`: after `check_initialized(__module__)`, all arguments are
# forwarded unchanged to `ParallelStencil.ParallelKernel.@trues`. The docstring is
# `ParallelKernel.TRUES_DOC` with "@init_parallel_kernel" replaced by "@init_parallel_stencil".
@doc replace(ParallelKernel.TRUES_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro trues(args...)
    check_initialized(__module__)  # presumably rejects callers that have not initialized ParallelStencil -- defined elsewhere
    forwarded_call = :(ParallelStencil.ParallelKernel.@trues($(args...)))
    return esc(forwarded_call)     # escaped so the forwarded call resolves in the caller's scope
end
6+
# ParallelStencil front end for `@fill`: after `check_initialized(__module__)`, all arguments are
# forwarded unchanged to `ParallelStencil.ParallelKernel.@fill`. The docstring is
# `ParallelKernel.FILL_DOC` with "@init_parallel_kernel" replaced by "@init_parallel_stencil".
@doc replace(ParallelKernel.FILL_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro fill(args...)
    check_initialized(__module__)  # presumably rejects callers that have not initialized ParallelStencil -- defined elsewhere
    forwarded_call = :(ParallelStencil.ParallelKernel.@fill($(args...)))
    return esc(forwarded_call)     # escaped so the forwarded call resolves in the caller's scope
end
7+
# ParallelStencil front end for `@fill!`: after `check_initialized(__module__)`, all arguments are
# forwarded unchanged to `ParallelStencil.ParallelKernel.@fill!`. The docstring is
# `ParallelKernel.FILL!_DOC` with "@init_parallel_kernel" replaced by "@init_parallel_stencil".
@doc replace(ParallelKernel.FILL!_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro fill!(args...)
    check_initialized(__module__)  # presumably rejects callers that have not initialized ParallelStencil -- defined elsewhere
    forwarded_call = :(ParallelStencil.ParallelKernel.@fill!($(args...)))
    return esc(forwarded_call)     # escaped so the forwarded call resolves in the caller's scope
end
8+
# ParallelStencil front end for `@CellType`: after `check_initialized(__module__)`, all arguments are
# forwarded unchanged to `ParallelStencil.ParallelKernel.@CellType`. The docstring is
# `ParallelKernel.CELLTYPE_DOC` with "@init_parallel_kernel" replaced by "@init_parallel_stencil".
@doc replace(ParallelKernel.CELLTYPE_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro CellType(args...)
    check_initialized(__module__)  # presumably rejects callers that have not initialized ParallelStencil -- defined elsewhere
    forwarded_call = :(ParallelStencil.ParallelKernel.@CellType($(args...)))
    return esc(forwarded_call)     # escaped so the forwarded call resolves in the caller's scope
end

src/hide_communication.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# ParallelStencil front end for `@hide_communication`: after `check_initialized(__module__)`, all
# arguments are forwarded unchanged to `ParallelStencil.ParallelKernel.@hide_communication`. The
# docstring is `ParallelKernel.HIDE_COMMUNICATION_DOC` with "@init_parallel_kernel" replaced by
# "@init_parallel_stencil".
@doc replace(ParallelKernel.HIDE_COMMUNICATION_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro hide_communication(args...)
    check_initialized(__module__)  # presumably rejects callers that have not initialized ParallelStencil -- defined elsewhere
    forwarded_call = :(ParallelStencil.ParallelKernel.@hide_communication($(args...)))
    return esc(forwarded_call)     # escaped so the forwarded call resolves in the caller's scope
end

src/init_parallel_stencil.jl

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,3 @@
1-
# NOTE: @parallel and @parallel_indices and @parallel_async do not appear in the following as they are extended and therefore defined in parallel.jl
2-
@doc replace(ParallelKernel.HIDE_COMMUNICATION_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro hide_communication(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@hide_communication($(args...)))); end
3-
@doc replace(ParallelKernel.ZEROS_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro zeros(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@zeros($(args...)))); end
4-
@doc replace(ParallelKernel.ONES_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro ones(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@ones($(args...)))); end
5-
@doc replace(ParallelKernel.RAND_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro rand(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@rand($(args...)))); end
6-
@doc replace(ParallelKernel.FALSES_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro falses(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@falses($(args...)))); end
7-
@doc replace(ParallelKernel.TRUES_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro trues(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@trues($(args...)))); end
8-
@doc replace(ParallelKernel.FILL_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro fill(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@fill($(args...)))); end
9-
@doc replace(ParallelKernel.FILL!_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro fill!(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@fill!($(args...)))); end
10-
@doc replace(ParallelKernel.CELLTYPE_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro CellType(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@CellType($(args...)))); end
11-
@doc replace(ParallelKernel.SYNCHRONIZE_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro synchronize(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@synchronize($(args...)))); end
12-
@doc replace(ParallelKernel.GRIDDIM_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro gridDim(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@gridDim($(args...)))); end
13-
@doc replace(ParallelKernel.BLOCKIDX_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro blockIdx(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@blockIdx($(args...)))); end
14-
@doc replace(ParallelKernel.BLOCKDIM_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro blockDim(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@blockDim($(args...)))); end
15-
@doc replace(ParallelKernel.THREADIDX_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro threadIdx(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@threadIdx($(args...)))); end
16-
@doc replace(ParallelKernel.SYNCTHREADS_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro sync_threads(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@sync_threads($(args...)))); end
17-
@doc replace(ParallelKernel.SHAREDMEM_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro sharedMem(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@sharedMem($(args...)))); end
18-
@doc replace(ParallelKernel.FORALL_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro ∀(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@∀($(args...)))); end
19-
@doc replace(replace(ParallelKernel.PKSHOW_DOC, "@init_parallel_kernel" => "@init_parallel_stencil"), "pk_show" => "ps_show") macro ps_show(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@pk_show($(args...)))); end
20-
@doc replace(replace(ParallelKernel.PKPRINTLN_DOC, "@init_parallel_kernel" => "@init_parallel_stencil"), "pk_println" => "ps_println") macro ps_println(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@pk_println($(args...)))); end
21-
22-
231
"""
242
@init_parallel_stencil(package, numbertype, ndims)
253
@init_parallel_stencil(package, numbertype, ndims, inbounds=...)

0 commit comments

Comments
 (0)