|
26 | 26 | import kernel_tuner |
27 | 27 | from collections import OrderedDict |
28 | 28 |
|
| 29 | + |
29 | 30 | def tune(): |
30 | | - with open('convolution.cu', 'r') as f: |
| 31 | + with open("convolution.cu", "r") as f: |
31 | 32 | kernel_string = f.read() |
32 | 33 |
|
33 | 34 | filter_size = (17, 17) |
34 | 35 | problem_size = (4096, 4096) |
35 | 36 | size = numpy.prod(problem_size) |
36 | | - border_size = (filter_size[0]//2*2, filter_size[1]//2*2) |
37 | | - input_size = ((problem_size[0]+border_size[0]) * (problem_size[1]+border_size[1])) |
| 37 | + border_size = (filter_size[0] // 2 * 2, filter_size[1] // 2 * 2) |
| 38 | + input_size = (problem_size[0] + border_size[0]) * (problem_size[1] + border_size[1]) |
38 | 39 |
|
39 | 40 | output = numpy.zeros(size).astype(numpy.float32) |
40 | 41 | input = numpy.random.randn(input_size).astype(numpy.float32) |
41 | 42 |
|
42 | | - filter = numpy.random.randn(filter_size[0]*filter_size[1]).astype(numpy.float32) |
43 | | - cmem_args= {'d_filter': filter } |
| 43 | + filter = numpy.random.randn(filter_size[0] * filter_size[1]).astype(numpy.float32) |
| 44 | + cmem_args = {"d_filter": filter} |
44 | 45 |
|
45 | 46 | args = [output, input, filter] |
46 | 47 | tune_params = OrderedDict() |
47 | 48 | tune_params["filter_width"] = [filter_size[0]] |
48 | 49 | tune_params["filter_height"] = [filter_size[1]] |
49 | 50 |
|
50 | | - #tune_params["block_size_x"] = [16*i for i in range(1,3)] |
51 | | - tune_params["block_size_x"] = [16*i for i in range(1,9)] |
52 | | - #tune_params["block_size_y"] = [2**i for i in range(1,5)] |
53 | | - tune_params["block_size_y"] = [2**i for i in range(1,6)] |
| 51 | + # tune_params["block_size_x"] = [16*i for i in range(1,3)] |
| 52 | + tune_params["block_size_x"] = [16 * i for i in range(1, 9)] |
| 53 | + # tune_params["block_size_y"] = [2**i for i in range(1,5)] |
| 54 | + tune_params["block_size_y"] = [2**i for i in range(1, 6)] |
54 | 55 |
|
55 | 56 | tune_params["tile_size_x"] = [2**i for i in range(3)] |
56 | 57 | tune_params["tile_size_y"] = [2**i for i in range(3)] |
57 | 58 |
|
58 | | - tune_params["use_padding"] = [0,1] #toggle the insertion of padding in shared memory |
59 | | - tune_params["read_only"] = [0,1] #toggle using the read-only cache |
| 59 | + tune_params["use_padding"] = [ |
| 60 | + 0, |
| 61 | + 1, |
| 62 | + ] # toggle the insertion of padding in shared memory |
| 63 | + tune_params["read_only"] = [0, 1] # toggle using the read-only cache |
60 | 64 |
|
61 | 65 | grid_div_x = ["block_size_x", "tile_size_x"] |
62 | 66 | grid_div_y = ["block_size_y", "tile_size_y"] |
63 | 67 |
|
64 | | - #compute the answer using a naive kernel |
65 | | - params = { "block_size_x": 16, "block_size_y": 16} |
| 68 | + # compute the answer using a naive kernel |
| 69 | + params = {"block_size_x": 16, "block_size_y": 16} |
66 | 70 | tune_params["filter_width"] = [filter_size[0]] |
67 | 71 | tune_params["filter_height"] = [filter_size[1]] |
68 | | - results = kernel_tuner.run_kernel("convolution_naive", kernel_string, |
69 | | - problem_size, args, params, |
70 | | - grid_div_y=["block_size_y"], grid_div_x=["block_size_x"], lang='cupy') |
71 | | - |
72 | | - #set non-output fields to None |
| 72 | + results = kernel_tuner.run_kernel( |
| 73 | + "convolution_naive", |
| 74 | + kernel_string, |
| 75 | + problem_size, |
| 76 | + args, |
| 77 | + params, |
| 78 | + grid_div_y=["block_size_y"], |
| 79 | + grid_div_x=["block_size_x"], |
| 80 | + lang="cupy", |
| 81 | + ) |
| 82 | + |
| 83 | + # set non-output fields to None |
73 | 84 | answer = [results[0], None, None] |
74 | 85 |
|
75 | | - #start kernel tuning with correctness verification |
76 | | - return kernel_tuner.tune_kernel("convolution_kernel", kernel_string, |
77 | | - problem_size, args, tune_params, |
78 | | - grid_div_y=grid_div_y, grid_div_x=grid_div_x, verbose=True, cmem_args=cmem_args, answer=answer, lang='cupy') |
| 86 | + # start kernel tuning with correctness verification |
| 87 | + return kernel_tuner.tune_kernel( |
| 88 | + "convolution_kernel", |
| 89 | + kernel_string, |
| 90 | + problem_size, |
| 91 | + args, |
| 92 | + tune_params, |
| 93 | + grid_div_y=grid_div_y, |
| 94 | + grid_div_x=grid_div_x, |
| 95 | + verbose=True, |
| 96 | + cmem_args=cmem_args, |
| 97 | + answer=answer, |
| 98 | + lang="cupy", |
| 99 | + ) |
79 | 100 |
|
80 | 101 |
|
81 | 102 | if __name__ == "__main__": |
82 | 103 | import time |
83 | | - s1 = time.time()*1000 |
| 104 | + |
| 105 | + s1 = time.time() * 1000 |
84 | 106 | results = tune() |
85 | 107 |
|
86 | | - e1 = time.time()*1000 |
87 | | - print("\n Actual time used:", e1-s1) |
| 108 | + e1 = time.time() * 1000 |
| 109 | + print("\n Actual time used:", e1 - s1) |
88 | 110 | import json |
89 | | - with open("convolution_RTX_2070.json", 'w') as fp: |
90 | | - json.dump(results, fp) |
91 | 111 |
|
| 112 | + with open("convolution_RTX_2070.json", "w") as fp: |
| 113 | + json.dump(results, fp) |
0 commit comments