
Commit fcd801a

implement GPU codegen helpers
1 parent e1f1e22 commit fcd801a

File tree

6 files changed: +257 −15 lines changed


pyop2/compilation.py

Lines changed: 103 additions & 13 deletions
@@ -154,6 +154,29 @@ def sniff_compiler(exe, comm=mpi.COMM_WORLD):
     return comm.bcast(compiler, 0)
 
 
+def _check_src_hashes(comm, global_kernel):
+    hsh = md5(str(global_kernel.cache_key[1:]).encode())
+    basename = hsh.hexdigest()
+    dirpart, basename = basename[:2], basename[2:]
+    cachedir = configuration["cache_dir"]
+    cachedir = os.path.join(cachedir, dirpart)
+
+    if configuration["check_src_hashes"] or configuration["debug"]:
+        matching = comm.allreduce(basename, op=_check_op)
+        if matching != basename:
+            # Dump all src code to disk for debugging
+            output = os.path.join(cachedir, "mismatching-kernels")
+            srcfile = os.path.join(output, "src-rank%d.c" % comm.rank)
+            if comm.rank == 0:
+                os.makedirs(output, exist_ok=True)
+            comm.barrier()
+            with open(srcfile, "w") as f:
+                f.write(global_kernel.code_to_compile)
+            comm.barrier()
+            raise CompilationError("Generated code differs across ranks"
+                                   f" (see output in {output})")
+
+
 class Compiler(ABC):
     """A compiler for shared libraries.
@@ -324,19 +347,8 @@ def get_so(self, jitmodule, extension):
         # atomically (avoiding races).
         tmpname = os.path.join(cachedir, "%s_p%d.so.tmp" % (basename, pid))
 
-        if configuration['check_src_hashes'] or configuration['debug']:
-            matching = self.comm.allreduce(basename, op=_check_op)
-            if matching != basename:
-                # Dump all src code to disk for debugging
-                output = os.path.join(configuration["cache_dir"], "mismatching-kernels")
-                srcfile = os.path.join(output, "src-rank%d.c" % self.comm.rank)
-                if self.comm.rank == 0:
-                    os.makedirs(output, exist_ok=True)
-                self.comm.barrier()
-                with open(srcfile, "w") as f:
-                    f.write(jitmodule.code_to_compile)
-                self.comm.barrier()
-                raise CompilationError("Generated code differs across ranks (see output in %s)" % output)
+        _check_src_hashes(self.comm, jitmodule)
+
         try:
             # Are we in the cache?
             return ctypes.CDLL(soname)
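
The refactored helper keeps the collective pattern `get_so` used before: hash the generated source on every rank, reduce, and abort if any rank disagrees. A minimal standalone sketch of that pattern using plain mpi4py (`MPI.MAX` stands in for PyOP2's `_check_op` reduction; the code string is a placeholder):

```python
# Rough sketch of the cross-rank consistency check, outside PyOP2.
# Assumes mpi4py; MPI.MAX is a stand-in for PyOP2's custom _check_op.
from hashlib import md5
from mpi4py import MPI

comm = MPI.COMM_WORLD
generated_code = "..."   # placeholder for this rank's codegen output
digest = md5(generated_code.encode()).hexdigest()

# Every rank contributes its digest; with MAX all ranks receive the same
# value, so a rank whose own digest differs knows codegen was inconsistent.
matching = comm.allreduce(digest, op=MPI.MAX)
if matching != digest:
    raise RuntimeError("Generated code differs across ranks")
```
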
@@ -652,3 +664,81 @@ def clear_cache(prompt=False):
         shutil.rmtree(cachedir, ignore_errors=True)
     else:
         print("Not removing cached libraries")
+
+
+def _get_code_to_compile(comm, global_kernel):
+    # Determine cache key
+    hsh = md5(str(global_kernel.cache_key[1:]).encode())
+    basename = hsh.hexdigest()
+    cachedir = configuration["cache_dir"]
+    dirpart, basename = basename[:2], basename[2:]
+    cachedir = os.path.join(cachedir, dirpart)
+    cname = os.path.join(cachedir, f"{basename}_code.cu")
+
+    _check_src_hashes(comm, global_kernel)
+
+    if os.path.isfile(cname):
+        # Are we in the cache?
+        with open(cname, "r") as f:
+            code_to_compile = f.read()
+    else:
+        # No, let's go ahead and build
+        if comm.rank == 0:
+            # No need to do this on all ranks
+            os.makedirs(cachedir, exist_ok=True)
+        with progress(INFO, "Compiling wrapper"):
+            # make sure that it compiles successfully before writing to file
+            code_to_compile = global_kernel.code_to_compile
+            with open(cname, "w") as f:
+                f.write(code_to_compile)
+        comm.barrier()
+
+    return code_to_compile
+
+
+@mpi.collective
+def get_prepared_cuda_function(comm, global_kernel):
+    from pycuda.compiler import SourceModule
+
+    # Determine cache key
+    hsh = md5(str(global_kernel.cache_key[1:]).encode())
+    basename = hsh.hexdigest()
+    cachedir = configuration["cache_dir"]
+    dirpart, basename = basename[:2], basename[2:]
+    cachedir = os.path.join(cachedir, dirpart)
+
+    nvcc_opts = ["-use_fast_math", "-w"]
+
+    code_to_compile = _get_code_to_compile(comm, global_kernel)
+    source_module = SourceModule(code_to_compile, options=nvcc_opts,
+                                 cache_dir=cachedir)
+
+    cu_func = source_module.get_function(global_kernel.name)
+
+    type_map = {ctypes.c_void_p: "P", ctypes.c_int: "i"}
+    argtypes = "".join(type_map[t] for t in global_kernel.argtypes)
+    cu_func.prepare(argtypes)
+
+    return cu_func
+
+
+@mpi.collective
+def get_opencl_kernel(comm, global_kernel):
+    import pyopencl as cl
+    from pyop2.backends.opencl import opencl_backend
+    cl_ctx = opencl_backend.context
+
+    # Determine cache key
+    hsh = md5(str(global_kernel.cache_key[1:]).encode())
+    basename = hsh.hexdigest()
+    cachedir = configuration["cache_dir"]
+    dirpart, basename = basename[:2], basename[2:]
+    cachedir = os.path.join(cachedir, dirpart)
+
+    code_to_compile = _get_code_to_compile(comm, global_kernel)
+
+    prg = cl.Program(cl_ctx, code_to_compile).build(options=[],
+                                                    cache_dir=cachedir)
+
+    cl_knl = cl.Kernel(prg, global_kernel.name)
+    return cl_knl

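A hedged sketch of how the new CUDA helper might be driven by a GPU backend; the launch configuration, the argument order and the presence of an active CUDA context are assumptions for illustration, not part of this commit:

```python
# Hypothetical caller -- assumes an active CUDA context (e.g. pycuda.autoinit
# or the pyop2 CUDA backend) and a `comm`/`global_kernel` pair from a parloop.
from pyop2.compilation import get_prepared_cuda_function

cu_func = get_prepared_cuda_function(comm, global_kernel)

# get_prepared_cuda_function() calls cu_func.prepare() with a typestring
# built from global_kernel.argtypes ("i" for ints, "P" for pointers), so the
# launch below must pass arguments in exactly that order.
grid = ((end - start + 31) // 32, 1)   # assumed: one thread per set element
block = (32, 1, 1)
cu_func.prepared_call(grid, block, start, end, *device_arg_pointers)
```
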
pyop2/configuration.py

Lines changed: 9 additions & 1 deletion
@@ -74,6 +74,12 @@ class Configuration(dict):
         cdim > 1 be built as block sparsities, or dof sparsities. The
         former saves memory but changes which preconditioners are
         available for the resulting matrices. (Default yes)
+    :param gpu_strategy: A :class:`str` indicating the transformation strategy
+        to apply to a :class:`pyop2.global_kernel.GlobalKernel` when
+        offloading to a GPGPU. Can be one of:
+        - ``"snpt"``: Single "N" Per Thread. In this strategy, the work for
+          each element of the iteration set over which a global kernel
+          operates is assigned to one work-item (i.e. one CUDA thread).
     """
     # name, env variable, type, default, write once
     cache_dir = os.path.join(gettempdir(), "pyop2-cache-uid%s" % os.getuid())
@@ -113,7 +119,9 @@ class Configuration(dict):
         "matnest":
             ("PYOP2_MATNEST", bool, True),
         "block_sparsity":
-            ("PYOP2_BLOCK_SPARSITY", bool, True)
+            ("PYOP2_BLOCK_SPARSITY", bool, True),
+        "gpu_strategy":
+            ("PYOP2_GPU_STRATEGY", str, "snpt"),
     }
     """Default values for PyOP2 configuration parameters"""

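For illustration, the new knob can be read or overridden like any other PyOP2 configuration parameter; a small sketch, assuming the usual behaviour that environment variables are resolved when `pyop2.configuration` is first imported:

```python
import os

# Must be set before pyop2.configuration is imported, since defaults are
# resolved from the environment at import time.
os.environ["PYOP2_GPU_STRATEGY"] = "snpt"

from pyop2.configuration import configuration

print(configuration["gpu_strategy"])   # -> "snpt" (the only strategy in this commit)
```
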
pyop2/transforms/__init__.py

Whitespace-only changes.

pyop2/transforms/gpu_utils.py

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
+import loopy as lp
+from pyop2.configuration import configuration
+
+
+def get_loopy_target(target):
+    if target == "opencl":
+        return lp.PyOpenCLTarget()
+    elif target == "cuda":
+        return lp.CudaTarget()
+    else:
+        raise NotImplementedError()
+
+
+def preprocess_t_unit_for_gpu(t_unit):
+
+    # {{{ inline all kernels in t_unit
+
+    kernels_to_inline = {
+        name for name, clbl in t_unit.callables_table.items()
+        if isinstance(clbl, lp.CallableKernel)}
+
+    for knl_name in kernels_to_inline:
+        t_unit = lp.inline_callable_kernel(t_unit, knl_name)
+
+    # }}}
+
+    kernel = t_unit.default_entrypoint
+
+    # changing the address space of temps
+    def _change_aspace_tvs(tv):
+        if tv.read_only:
+            assert tv.initializer is not None
+            return tv.copy(address_space=lp.AddressSpace.GLOBAL)
+        else:
+            return tv.copy(address_space=lp.AddressSpace.PRIVATE)
+
+    new_tvs = {tv_name: _change_aspace_tvs(tv) for tv_name, tv in
+               kernel.temporary_variables.items()}
+    kernel = kernel.copy(temporary_variables=new_tvs)
+
+    def insn_needs_atomic(insn):
+        # updates to global variables are atomic
+        import pymbolic
+        if isinstance(insn, lp.Assignment):
+            if isinstance(insn.assignee, pymbolic.primitives.Subscript):
+                assignee_name = insn.assignee.aggregate.name
+            else:
+                assert isinstance(insn.assignee, pymbolic.primitives.Variable)
+                assignee_name = insn.assignee.name
+
+            if assignee_name in kernel.arg_dict:
+                return assignee_name in insn.read_dependency_names()
+        return False
+
+    new_insns = []
+    args_marked_for_atomic = set()
+    for insn in kernel.instructions:
+        if insn_needs_atomic(insn):
+            atomicity = (lp.AtomicUpdate(insn.assignee.aggregate.name), )
+            insn = insn.copy(atomicity=atomicity)
+            args_marked_for_atomic |= set([insn.assignee.aggregate.name])
+
+        new_insns.append(insn)
+
+    # label args as atomic
+    new_args = []
+    for arg in kernel.args:
+        if arg.name in args_marked_for_atomic:
+            new_args.append(arg.copy(for_atomic=True))
+        else:
+            new_args.append(arg)
+
+    kernel = kernel.copy(instructions=new_insns, args=new_args)
+
+    return t_unit.with_kernel(kernel)
+
+
+def apply_gpu_transforms(t_unit, target):
+    t_unit = t_unit.copy(target=get_loopy_target(target))
+    t_unit = preprocess_t_unit_for_gpu(t_unit)
+    kernel = t_unit.default_entrypoint
+    transform_strategy = configuration["gpu_strategy"]
+
+    kernel = lp.assume(kernel, "end > start")
+
+    if transform_strategy == "snpt":
+        from pyop2.transforms.snpt import split_n_across_workgroups
+        kernel, args_to_make_global = split_n_across_workgroups(kernel, 32)
+    else:
+        raise NotImplementedError(f"'{transform_strategy}' transform strategy.")
+
+    t_unit = t_unit.with_kernel(kernel)
+
+    return t_unit, args_to_make_global

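An untested sketch of what `apply_gpu_transforms` does to a loopy translation unit, using a toy wrapper-like kernel in place of real PyOP2 codegen output; the kernel, its dtypes and its argument list below are invented for illustration:

```python
import numpy as np
import loopy as lp
from pyop2.transforms.gpu_utils import apply_gpu_transforms

# Toy stand-in for a PyOP2 wrapper kernel: one loop over the iteration set.
t_unit = lp.make_kernel(
    "{[n]: start <= n < end}",
    "out[n] = 2.0 * a[n]",
    [lp.GlobalArg("out", np.float64, shape=("end",)),
     lp.GlobalArg("a", np.float64, shape=("end",)),
     lp.ValueArg("start", np.int32),
     lp.ValueArg("end", np.int32)],
    name="wrap_toy")

# Retarget to CUDA, inline callees, fix temporary address spaces, mark
# atomic updates, then apply the configured ("snpt") split of the 'n' loop.
t_unit, extra_global_args = apply_gpu_transforms(t_unit, "cuda")

print(lp.generate_code_v2(t_unit).device_code())
```
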
pyop2/transforms/snpt.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+import loopy as lp
+
+
+def _make_tv_array_arg(tv):
+    assert tv.address_space != lp.AddressSpace.PRIVATE
+    arg = lp.ArrayArg(name=tv.name,
+                      dtype=tv.dtype,
+                      shape=tv.shape,
+                      dim_tags=tv.dim_tags,
+                      offset=tv.offset,
+                      dim_names=tv.dim_names,
+                      order=tv.order,
+                      alignment=tv.alignment,
+                      address_space=tv.address_space,
+                      is_output=not tv.read_only,
+                      is_input=tv.read_only)
+    return arg
+
+
+def split_n_across_workgroups(kernel, workgroup_size):
+    """
+    Returns a transformed version of *kernel* with the workload of the loop
+    with induction variable 'n' distributed across work-groups of size
+    *workgroup_size*, each work-item in a work-group performing the work
+    of a single iteration of 'n'.
+    """
+
+    kernel = lp.assume(kernel, "start < end")
+    kernel = lp.split_iname(kernel, "n", workgroup_size,
+                            outer_tag="g.0", inner_tag="l.0")
+
+    # {{{ make constants global: necessary for the strategy to emit valid
+    # kernels for all forms
+
+    old_temps = kernel.temporary_variables.copy()
+    args_to_make_global = [tv.initializer.flatten()
+                           for tv in old_temps.values()
+                           if tv.initializer is not None]
+
+    new_temps = {tv.name: tv
+                 for tv in old_temps.values()
+                 if tv.initializer is None}
+    kernel = kernel.copy(args=kernel.args+[_make_tv_array_arg(tv)
+                                           for tv in old_temps.values()
+                                           if tv.initializer is not None],
+                         temporary_variables=new_temps)
+
+    # }}}
+
+    return kernel, args_to_make_global

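The heart of the strategy is the single `split_iname` call; below is a self-contained illustration of what that transformation does on a toy kernel (the kernel, names and sizes are invented, and only code generation is exercised, so no GPU is needed):

```python
import numpy as np
import loopy as lp

# Toy kernel with the same flat 'n' loop a PyOP2 wrapper has.
knl = lp.make_kernel(
    "{[n]: 0 <= n < N}",
    "y[n] = 2.0 * x[n]",
    [lp.GlobalArg("y", np.float64, shape=("N",)),
     lp.GlobalArg("x", np.float64, shape=("N",)),
     lp.ValueArg("N", np.int32)],
    name="toy_wrap",
    target=lp.CudaTarget())

# "Single N per thread": n_outer becomes the CUDA block index (g.0),
# n_inner the thread index within a 32-wide block (l.0).
knl = lp.split_iname(knl, "n", 32, outer_tag="g.0", inner_tag="l.0")

print(lp.generate_code_v2(knl).device_code())
```
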
setup.py

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ def run(self):
           'Programming Language :: Python :: 3.6',
       ],
       install_requires=install_requires + test_requires,
-      packages=['pyop2', 'pyop2.backends', 'pyop2.codegen', 'pyop2.types'],
+      packages=['pyop2', 'pyop2.backends', 'pyop2.codegen', 'pyop2.types', 'pyop2.transforms'],
       package_data={
           'pyop2': ['assets/*', '*.h', '*.pxd', '*.pyx', 'codegen/c/*.c']},
       scripts=glob('scripts/*'),
