Skip to content

Commit 9739495

Browse files
committed
Reformat with black.
1 parent ceb0996 commit 9739495

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+826
-728
lines changed

kernel_tuner/accuracy.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,7 @@ def select_for_configuration(self, params):
4646

4747
if option not in self.data:
4848
list = ", ".join(map(str, self.data.keys()))
49-
raise KeyError(
50-
f"'{option}' is not a valid parameter value, should be one of: {list}"
51-
)
49+
raise KeyError(f"'{option}' is not a valid parameter value, should be one of: {list}")
5250

5351
return self.data[option]
5452

@@ -60,12 +58,14 @@ def _find_bfloat16_if_available():
6058
# Try to get bfloat16 if available.
6159
try:
6260
from bfloat16 import bfloat16
61+
6362
return bfloat16
6463
except ImportError:
6564
pass
6665

6766
try:
6867
from tensorflow import bfloat16
68+
6969
return bfloat16.as_numpy_dtype
7070
except ImportError:
7171
pass
@@ -102,9 +102,7 @@ def _to_float_dtype(x: str) -> np.dtype:
102102

103103

104104
class TunablePrecision(Tunable):
105-
def __init__(
106-
self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None
107-
):
105+
def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None):
108106
"""The ``Tunable`` object can be used as an input argument when tuning
109107
kernels. It is a container that internally holds several arrays
110108
containing the same data, but stored in using different levels of
@@ -135,7 +133,6 @@ def __init__(
135133
if bfloat16 is not None:
136134
dtypes["bfloat16"] = bfloat16
137135

138-
139136
# If dtype is a list, convert it to a dictionary
140137
if isinstance(dtypes, (list, tuple)):
141138
dtypes = dict((name, _to_float_dtype(name)) for name in dtypes)
@@ -257,9 +254,7 @@ def metric(a, b):
257254
raise ValueError(f"invalid error metric provided: {user_key}")
258255

259256
# cast both arguments to f64 before passing them to the metric
260-
return lambda a, b: metric(
261-
a.astype(np.float64, copy=False), b.astype(np.float64, copy=False)
262-
)
257+
return lambda a, b: metric(a.astype(np.float64, copy=False), b.astype(np.float64, copy=False))
263258

264259

265260
class AccuracyObserver(OutputObserver):

kernel_tuner/backends/compiler.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
try:
3535
from hip._util.types import DeviceArray
3636
except ImportError:
37-
Pointer = Exception # using Exception here as a type that will never be among kernel arguments
37+
Pointer = Exception # using Exception here as a type that will never be among kernel arguments
3838
DeviceArray = Exception
3939

4040

@@ -157,7 +157,9 @@ def ready_argument_list(self, arguments):
157157

158158
for i, arg in enumerate(arguments):
159159
if not (isinstance(arg, (np.ndarray, np.number, DeviceArray)) or is_cupy_array(arg)):
160-
raise TypeError(f"Argument is not numpy or cupy ndarray or numpy scalar or HIP Python DeviceArray but a {type(arg)}")
160+
raise TypeError(
161+
f"Argument is not numpy or cupy ndarray or numpy scalar or HIP Python DeviceArray but a {type(arg)}"
162+
)
161163
dtype_str = arg.typestr if isinstance(arg, DeviceArray) else str(arg.dtype)
162164
if isinstance(arg, np.ndarray):
163165
if dtype_str in dtype_map.keys():
@@ -288,7 +290,7 @@ def compile(self, kernel_instance):
288290
stdout=subprocess.PIPE,
289291
stderr=subprocess.PIPE,
290292
text=True,
291-
check=True
293+
check=True,
292294
)
293295

294296
subprocess.run(
@@ -299,7 +301,7 @@ def compile(self, kernel_instance):
299301
stdout=subprocess.PIPE,
300302
stderr=subprocess.PIPE,
301303
text=True,
302-
check=True
304+
check=True,
303305
)
304306

305307
self.lib = np.ctypeslib.load_library(filename, ".")
@@ -439,7 +441,7 @@ def cleanup_lib(self):
439441
"""unload the previously loaded shared library"""
440442
if self.lib is None:
441443
return
442-
444+
443445
if not self.using_openmp and not self.using_openacc:
444446
# this if statement is necessary because shared libraries that use
445447
# OpenMP will core dump when unloaded, this is a well-known issue with OpenMP

kernel_tuner/backends/cupy.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
7070
# collect environment information
7171
env = dict()
7272
cupy_info = str(cp._cupyx.get_runtime_info()).split("\n")[:-1]
73-
info_dict = {
74-
s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info
75-
}
73+
info_dict = {s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info}
7674
env["device_name"] = info_dict[f"Device {device} Name"]
7775

7876
env["cuda_version"] = cp.cuda.runtime.driverGetVersion()
@@ -129,9 +127,7 @@ def compile(self, kernel_instance):
129127

130128
options = tuple(compiler_options)
131129

132-
self.current_module = cp.RawModule(
133-
code=kernel_string, options=options, name_expressions=[kernel_name]
134-
)
130+
self.current_module = cp.RawModule(code=kernel_string, options=options, name_expressions=[kernel_name])
135131

136132
self.func = self.current_module.get_function(kernel_name)
137133
self.num_regs = self.func.num_regs

kernel_tuner/backends/nvcuda.py

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,9 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
5656
CudaFunctions.last_selected_device = device
5757

5858
# compute capabilities and device properties
59-
err, major = cudart.cudaDeviceGetAttribute(
60-
cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device
61-
)
59+
err, major = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device)
6260
cuda_error_check(err)
63-
err, minor = cudart.cudaDeviceGetAttribute(
64-
cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device
65-
)
61+
err, minor = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device)
6662
cuda_error_check(err)
6763
err, self.max_threads = cudart.cudaDeviceGetAttribute(
6864
cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
@@ -164,20 +160,14 @@ def compile(self, kernel_instance):
164160
if not any(["--std=" in opt for opt in self.compiler_options]):
165161
self.compiler_options.append("--std=c++11")
166162
if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
167-
compiler_options.append(
168-
f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
169-
)
163+
compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8"))
170164
if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
171165
self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
172166

173-
err, program = nvrtc.nvrtcCreateProgram(
174-
str.encode(kernel_string), b"CUDAProgram", 0, [], []
175-
)
167+
err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], [])
176168
try:
177169
cuda_error_check(err)
178-
err = nvrtc.nvrtcCompileProgram(
179-
program, len(compiler_options), compiler_options
180-
)
170+
err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options)
181171
cuda_error_check(err)
182172
err, size = nvrtc.nvrtcGetPTXSize(program)
183173
cuda_error_check(err)
@@ -189,9 +179,7 @@ def compile(self, kernel_instance):
189179
raise SkippableFailure("uses too much shared data")
190180
else:
191181
cuda_error_check(err)
192-
err, self.func = cuda.cuModuleGetFunction(
193-
self.current_module, str.encode(kernel_name)
194-
)
182+
err, self.func = cuda.cuModuleGetFunction(self.current_module, str.encode(kernel_name))
195183
cuda_error_check(err)
196184

197185
# get the number of registers per thread used in this kernel

kernel_tuner/backends/opencl.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@
1616
class OpenCLFunctions(GPUBackend):
1717
"""Class that groups the OpenCL functions and maintains some state about the device."""
1818

19-
def __init__(
20-
self, device=0, platform=0, iterations=7, compiler_options=None, observers=None
21-
):
19+
def __init__(self, device=0, platform=0, iterations=7, compiler_options=None, observers=None):
2220
"""Creates OpenCL device context and reads device properties.
2321
2422
:param device: The ID of the OpenCL device to use for benchmarking
@@ -37,14 +35,10 @@ def __init__(
3735
platforms = cl.get_platforms()
3836
self.ctx = cl.Context(devices=[platforms[platform].get_devices()[device]])
3937

40-
self.queue = cl.CommandQueue(
41-
self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE
42-
)
38+
self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
4339
self.mf = cl.mem_flags
4440
# inspect device properties
45-
self.max_threads = self.ctx.devices[0].get_info(
46-
cl.device_info.MAX_WORK_GROUP_SIZE
47-
)
41+
self.max_threads = self.ctx.devices[0].get_info(cl.device_info.MAX_WORK_GROUP_SIZE)
4842
self.compiler_options = compiler_options or []
4943

5044
# observer stuff
@@ -108,9 +102,7 @@ def compile(self, kernel_instance):
108102
:returns: An OpenCL kernel that can be called directly.
109103
:rtype: pyopencl.Kernel
110104
"""
111-
prg = cl.Program(self.ctx, kernel_instance.kernel_string).build(
112-
options=self.compiler_options
113-
)
105+
prg = cl.Program(self.ctx, kernel_instance.kernel_string).build(options=self.compiler_options)
114106
func = getattr(prg, kernel_instance.name)
115107
return func
116108

kernel_tuner/backends/pycuda.py

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -97,13 +97,9 @@ def _finish_up():
9797
PyCudaFunctions.last_selected_context = self.context
9898

9999
# inspect device properties
100-
devprops = {
101-
str(k): v for (k, v) in self.context.get_device().get_attributes().items()
102-
}
100+
devprops = {str(k): v for (k, v) in self.context.get_device().get_attributes().items()}
103101
self.max_threads = devprops["MAX_THREADS_PER_BLOCK"]
104-
cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(
105-
devprops.get("COMPUTE_CAPABILITY_MINOR", "0")
106-
)
102+
cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(devprops.get("COMPUTE_CAPABILITY_MINOR", "0"))
107103
if cc == "00":
108104
cc = self.context.get_device().compute_capability()
109105
self.cc = str(cc[0]) + str(cc[1])
@@ -347,14 +343,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
347343
"""
348344
if stream is None:
349345
stream = self.stream
350-
func(
351-
*gpu_args,
352-
block=threads,
353-
grid=grid,
354-
stream=stream,
355-
shared=self.smem_size,
356-
texrefs=self.texrefs
357-
)
346+
func(*gpu_args, block=threads, grid=grid, stream=stream, shared=self.smem_size, texrefs=self.texrefs)
358347

359348
def memset(self, allocation, value, size):
360349
"""Set the memory in allocation to the value in value.

0 commit comments

Comments
 (0)