# Source: commit 303ef3a ("Add pnpoly cupy example", parent 4719acd)
# File: examples/cuda/pnpoly_cupy.py (+90 lines, new file)
#!/usr/bin/env python
""" Point-in-Polygon host/device code tuner

This program is used for auto-tuning the host and device code of a CUDA program
for computing the point-in-polygon problem for very large datasets and large
polygons.

The time measurements used as a basis for tuning include the time spent on
data transfers between host and device memory. The host code uses device mapped
host memory to overlap communication between host and device with kernel
execution on the GPU. Because each input is read only once and each output
is written only once, this implementation almost fully overlaps all
communication and the kernel execution time dominates the total execution time.

The code has the option to precompute all polygon line slopes on the CPU and
reuse those results on the GPU, instead of recomputing them on the GPU all
the time. The time spent on precomputing these values on the CPU is also
taken into account by the time measurement in the code.

This code was written for use with the Kernel Tuner. See:
https://github.com/benvanwerkhoven/kernel_tuner

Author: Ben van Werkhoven <[email protected]>
"""
from collections import OrderedDict
import json
import logging

import cupy as cp
import cupyx as cpx
import kernel_tuner
import numpy
def allocator(size: int) -> cp.cuda.PinnedMemoryPointer:
    """Return a pointer to freshly allocated pinned host memory.

    The allocation is made context-portable (usable from any CUDA
    context) and device-mapped, so the GPU can address it directly.

    :param size: number of bytes to allocate.
    :returns: pointer to the start of the pinned allocation.
    """
    alloc_flags = (cp.cuda.runtime.hostAllocPortable
                   | cp.cuda.runtime.hostAllocMapped)
    pinned = cp.cuda.PinnedMemory(size, flags=alloc_flags)
    return cp.cuda.PinnedMemoryPointer(pinned, offset=0)
def tune():
    """Auto-tune the point-in-polygon host/device code with the Kernel Tuner.

    Generates a large random point set and a circular test polygon in
    device-mapped pinned host memory, then sweeps the tunable parameters
    of the CUDA host code.

    :returns: whatever ``kernel_tuner.tune_kernel`` returns (the
        benchmark results, dumped to JSON by the caller).
    """
    # Set the number of points and the number of polygon vertices.
    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    # Allocate context-portable device mapped host memory from now on.
    cp.cuda.set_pinned_memory_allocator(allocator)

    # As test input we use a circle with radius 1 as polygon and
    # a large set of normally distributed points around (0, 0).
    points = cpx.empty_pinned(shape=(2*size,), dtype=numpy.float32)
    points[:] = numpy.random.randn(2*size).astype(numpy.float32)

    bitmap = cpx.zeros_pinned(shape=(size,), dtype=numpy.int32)

    # Polygon vertices on the unit circle; angles sorted descending so the
    # vertices are ordered around the circle.
    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]
    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    vertex_xy = cpx.empty_pinned(shape=(2*vertices,), dtype=numpy.float32)
    # Interleave coordinates as [x0, y0, x1, y1, ...] with NumPy directly,
    # instead of building a Python-level list of (x, y) tuples.
    vertex_xy[:] = numpy.column_stack(
        (vertex_x, vertex_y)).astype(numpy.float32).ravel()

    # Kernel arguments.
    args = [bitmap, points, vertex_xy, size]

    # Setup tunable parameters.
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1, 32)]  # multiple of 32
    tune_params["tile_size"] = [1] + [2*i for i in range(1, 11)]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    # Tell the Kernel Tuner how to compute the grid dimensions from the
    # problem_size.
    grid_div_x = ["block_size_x", "tile_size"]

    # Start tuning.
    results = kernel_tuner.tune_kernel(
        "cn_pnpoly_host", ['pnpoly_host.cu', 'pnpoly.cu'],
        problem_size, args, tune_params,
        grid_div_x=grid_div_x, lang="C",
        compiler_options=["-arch=sm_52"],
        verbose=True, log=logging.DEBUG)

    return results
if __name__ == "__main__":
    # Run the tuner and persist the results for later analysis.
    tuning_results = tune()
    with open("pnpoly.json", 'w') as output_file:
        json.dump(tuning_results, output_file)

# (commit page showed 0 comments)