|
| 1 | +#!/usr/bin/env python |
| 2 | +""" Point-in-Polygon host/device code tuner |
| 3 | +
|
| 4 | +This program is used for auto-tuning the host and device code of a CUDA program |
| 5 | +for computing the point-in-polygon problem for very large datasets and large |
| 6 | +polygons. |
| 7 | +
|
| 8 | +The time measurements used as a basis for tuning include the time spent on |
| 9 | +data transfers between host and device memory. The host code uses device mapped |
| 10 | +host memory to overlap communication between host and device with kernel |
| 11 | +execution on the GPU. Because each input is read only once and each output |
| 12 | +is written only once, this implementation almost fully overlaps all |
| 13 | +communication and the kernel execution time dominates the total execution time. |
| 14 | +
|
| 15 | +The code has the option to precompute all polygon line slopes on the CPU and |
| 16 | +reuse those results on the GPU, instead of recomputing them on the GPU all |
| 17 | +the time. The time spent on precomputing these values on the CPU is also |
| 18 | +taken into account by the time measurement in the code. |
| 19 | +
|
| 20 | +This code was written for use with the Kernel Tuner. See: |
| 21 | + https://github.com/benvanwerkhoven/kernel_tuner |
| 22 | +
|
| 23 | +Author: Ben van Werkhoven <[email protected]> |
| 24 | +""" |
| 25 | +from collections import OrderedDict |
| 26 | +import json |
| 27 | +import logging |
| 28 | + |
| 29 | +import cupy as cp |
| 30 | +import cupyx as cpx |
| 31 | +import kernel_tuner |
| 32 | +import numpy |
| 33 | + |
| 34 | + |
def allocator(size: int) -> cp.cuda.PinnedMemoryPointer:
    """Return a pointer to freshly allocated pinned host memory.

    The memory is requested with both the ``hostAllocPortable`` and the
    ``hostAllocMapped`` flags, i.e. it is context-portable and mapped into
    the device address space.

    Args:
        size: Requested allocation size, forwarded to ``cp.cuda.PinnedMemory``.

    Returns:
        A ``PinnedMemoryPointer`` at offset 0 of the new allocation.
    """
    alloc_flags = cp.cuda.runtime.hostAllocMapped | cp.cuda.runtime.hostAllocPortable
    pinned_block = cp.cuda.PinnedMemory(size, flags=alloc_flags)
    return cp.cuda.PinnedMemoryPointer(pinned_block, offset=0)
| 40 | + |
| 41 | + |
def tune():
    """Tune the point-in-polygon host and device code with the Kernel Tuner.

    Generates a random test problem (normally distributed points and a
    polygon whose vertices lie on the unit circle), stores all buffers in
    pinned host memory obtained through ``allocator``, and hands everything
    to ``kernel_tuner.tune_kernel``.

    Returns:
        Whatever ``kernel_tuner.tune_kernel`` returns for this search.
    """
    # problem dimensions: number of points and number of polygon vertices
    size = numpy.int32(2e7)
    vertices = 600
    problem_size = (size, 1)

    # make CuPy hand out context-portable device mapped host memory
    cp.cuda.set_pinned_memory_allocator(allocator)

    # input points: normally distributed around 0,0, two floats per point
    n_coords = 2*size
    points = cpx.empty_pinned(shape=(n_coords,), dtype=numpy.float32)
    points[:] = numpy.random.randn(n_coords).astype(numpy.float32)

    # output buffer, one int32 per point, zero-initialised
    bitmap = cpx.zeros_pinned(shape=(size,), dtype=numpy.int32)

    # the test polygon is a circle with radius 1: draw random angles,
    # sort them and reverse so the vertices are in descending angle order
    angles = numpy.random.rand(vertices)*2.0*numpy.pi
    vertex_seeds = numpy.sort(angles)[::-1]

    # interleave the coordinates as x0, y0, x1, y1, ... in single precision
    xy_pairs = numpy.column_stack((numpy.cos(vertex_seeds), numpy.sin(vertex_seeds)))
    vertex_xy = cpx.empty_pinned(shape=(2*vertices,), dtype=numpy.float32)
    vertex_xy[:] = xy_pairs.astype(numpy.float32).ravel()

    # kernel arguments
    args = [bitmap, points, vertex_xy, size]

    # tunable parameters; insertion order is kept by the OrderedDict
    tune_params = OrderedDict([
        ("block_size_x", list(range(32, 1024, 32))),  # multiples of 32: 32..992
        ("tile_size", [1, *range(2, 21, 2)]),         # 1, 2, 4, ..., 20
        ("between_method", [0, 1, 2, 3]),
        ("use_precomputed_slopes", [0, 1]),
        ("use_method", [0, 1]),
    ])

    # the grid x-dimension is the problem size divided by these parameters
    grid_div_x = ["block_size_x", "tile_size"]

    # run the tuner and pass its result straight back to the caller
    return kernel_tuner.tune_kernel(
        "cn_pnpoly_host",
        ['pnpoly_host.cu', 'pnpoly.cu'],
        problem_size,
        args,
        tune_params,
        grid_div_x=grid_div_x,
        lang="C",
        compiler_options=["-arch=sm_52"],
        verbose=True,
        log=logging.DEBUG,
    )
| 85 | + |
| 86 | + |
if __name__ == "__main__":
    # script entry point: run the tuner once ...
    results = tune()

    # ... and store what it returned as JSON in the current working directory
    with open("pnpoly.json", "w") as json_file:
        json.dump(results, json_file)
0 commit comments