|
| 1 | +#!/usr/bin/env python |
| 2 | +"""This is a simple example for tuning C++ OpenMP code with the kernel tuner""" |
| 3 | +import numpy as np |
| 4 | + |
| 5 | +from kernel_tuner import tune_kernel |
| 6 | +from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives |
| 7 | + |
| 8 | + |
# Naive Python histogram implementation
def histogram(vector, hist):
    """Count occurrences of each value in ``vector`` into ``hist``.

    Every element of ``vector`` is used as an index into ``hist`` and the
    bin at that index is incremented by one. ``hist`` is modified in place
    and the same object is returned.
    """
    for bin_index in vector:
        hist[bin_index] += 1
    return hist
| 14 | + |
| 15 | + |
# C++ source with a single tunable region named "histogram", delimited by the
# "#pragma tuner start" / "#pragma tuner stop" markers. The preprocessor
# symbols nteams, nthreads, enable_reduction and enable_atomic are not defined
# in the source itself; they share their names with the tunable parameters
# declared further down.
code = """
#include <stdlib.h>

#define HIST_SIZE 256
#define VECTOR_SIZE 1000000

#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE)
#if enable_reduction == 1
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) reduction(+:hist[:HIST_SIZE])
#else
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads)
#endif
for ( int i = 0; i < VECTOR_SIZE; i++ ) {
#if enable_atomic == 1
    #pragma omp atomic update
#endif
    hist[vector[i]] += 1;
}
#pragma tuner stop
"""

# Extract tunable directive
app = Code(OpenMP(), Cxx())
kernel_string, kernel_args = process_directives(app, code)

# Search space: team and thread counts, plus two mutually exclusive switches.
tune_params = {
    "nteams": [2**exponent for exponent in range(1, 11)],
    "nthreads": [32 * multiple for multiple in range(1, 33)],
    "enable_reduction": [0, 1],
    "enable_atomic": [0, 1],
}
# Only configurations where exactly one of the two switches is on are allowed.
constraints = ["enable_reduction != enable_atomic"]


def _gigabytes_per_second(result):
    """Return the GB/s metric for one benchmarked configuration.

    The byte count is 12 bytes per element of the first kernel argument
    (``2 * 4`` plus ``4``). The argument list is looked up at call time, so
    the length of the array installed below is the one that is used.
    """
    n_items = len(kernel_args["histogram"][0])
    byte_count = (2 * 4 * n_items) + (4 * n_items)
    # NOTE(review): the division by 10**3 presumably converts a time reported
    # in milliseconds to seconds - confirm against the tuner's result format.
    elapsed = result["time"] / 10**3
    return byte_count / elapsed / 10**9


metrics = {"GB/s": _gigabytes_per_second}

# Replace the extracted arguments with concrete data: random bin indices in
# [0, 256) for the input vector and an all-zero histogram, both int32 and
# both keeping the lengths of the arguments they replace.
histogram_args = kernel_args["histogram"]
histogram_args[0] = np.random.randint(0, 256, len(histogram_args[0]), dtype=np.int32)
histogram_args[1] = np.zeros(len(histogram_args[1]), dtype=np.int32)

# Reference result computed in Python; the None entry leaves the input vector
# without an expected value.
reference_hist = histogram(histogram_args[0], np.zeros_like(histogram_args[1], dtype=np.int32))
answer = [None, reference_hist]

tune_kernel(
    "histogram",
    kernel_string["histogram"],
    0,
    histogram_args,
    tune_params,
    restrictions=constraints,
    metrics=metrics,
    answer=answer,
    compiler="nvc++",
    compiler_options=["-fast", "-mp=gpu"],
)
0 commit comments