Skip to content

Commit 43152ae

Browse files
committed
Merge branch 'master' into directives
2 parents 16443b4 + 2451e2d commit 43152ae

File tree

2 files changed: 26 additions and 12 deletions.

kernel_tuner/backends/pycuda.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -106,7 +106,7 @@ def _finish_up():
106106
)
107107
if cc == "00":
108108
cc = self.context.get_device().compute_capability()
109-
self.cc = str(cc[0]) + str(cc[1])
109+
self.cc = str(cc)
110110
self.iterations = iterations
111111
self.current_module = None
112112
self.func = None
@@ -180,6 +180,9 @@ def ready_argument_list(self, arguments):
180180
# pycuda does not support bool, convert to uint8 instead
181181
elif isinstance(arg, np.bool_):
182182
gpu_args.append(arg.astype(np.uint8))
183+
# pycuda does not support 16-bit formats, view them as uint16
184+
elif isinstance(arg, np.generic) and str(arg.dtype) in ("float16", "bfloat16"):
185+
gpu_args.append(arg.view(np.uint16))
183186
# if not an array, just pass argument along
184187
else:
185188
gpu_args.append(arg)

kernel_tuner/core.py

Lines changed: 22 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,7 @@ def prepare_list_of_files(self, kernel_name, params, grid, threads, block_size_n
118118
The files beyond the first are considered additional files that may also contain tunable parameters
119119
120120
For each file beyond the first this function creates a temporary file with
121-
preprocessors statements inserted. Occurences of the original filenames in the
121+
preprocessors statements inserted. Occurrences of the original filenames in the
122122
first file are replaced with their temporary counterparts.
123123
124124
:param kernel_name: A string specifying the kernel name.
@@ -174,7 +174,7 @@ def prepare_list_of_files(self, kernel_name, params, grid, threads, block_size_n
174174
temp_file = util.get_temp_filename(suffix="." + f.split(".")[-1])
175175
temp_files[f] = temp_file
176176
util.write_file(temp_file, ks)
177-
# replace occurences of the additional file's name in the first kernel_string with the name of the temp file
177+
# replace occurrences of the additional file's name in the first kernel_string with the name of the temp file
178178
kernel_string = kernel_string.replace(f, temp_file)
179179

180180
return name, kernel_string, temp_files
@@ -574,7 +574,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
574574
if kernel_options.texmem_args is not None:
575575
self.dev.copy_texture_memory_args(kernel_options.texmem_args)
576576

577-
# stop compilation stopwatch and convert to miliseconds
577+
# stop compilation stopwatch and convert to milliseconds
578578
last_compilation_time = 1000 * (time.perf_counter() - start_compilation)
579579

580580
# test kernel for correctness
@@ -600,7 +600,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
600600
print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames))
601601
raise e
602602

603-
# clean up any temporary files, if no error occured
603+
# clean up any temporary files, if no error occurred
604604
instance.delete_temp_files()
605605

606606
result["compile_time"] = last_compilation_time or 0
@@ -776,10 +776,12 @@ def _default_verify_function(instance, answer, result_host, atol, verbose):
776776
# for each element in the argument list, check if the types match
777777
for i, arg in enumerate(instance.arguments):
778778
if answer[i] is not None: # skip None elements in the answer list
779-
if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance(arg, (np.ndarray, cp.ndarray)):
780-
if answer[i].dtype != arg.dtype:
779+
if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance(
780+
arg, (np.ndarray, cp.ndarray)
781+
):
782+
if not np.can_cast(arg.dtype, answer[i].dtype):
781783
raise TypeError(
782-
f"Element {i} of the expected results list is not of the same dtype as the kernel output: "
784+
f"Element {i} of the expected results list has a dtype that is not compatible with the dtype of the kernel output: "
783785
+ str(answer[i].dtype)
784786
+ " != "
785787
+ str(arg.dtype)
@@ -857,9 +859,18 @@ def _flatten(a):
857859
output_test = np.allclose(expected, result, atol=atol)
858860

859861
if not output_test and verbose:
860-
print("Error: " + util.get_config_string(instance.params) + " detected during correctness check")
861-
print("this error occured when checking value of the %oth kernel argument" % (i,))
862-
print("Printing kernel output and expected result, set verbose=False to suppress this debug print")
862+
print(
863+
"Error: "
864+
+ util.get_config_string(instance.params)
865+
+ " detected during correctness check"
866+
)
867+
print(
868+
"this error occurred when checking value of the %oth kernel argument"
869+
% (i,)
870+
)
871+
print(
872+
"Printing kernel output and expected result, set verbose=False to suppress this debug print"
873+
)
863874
np.set_printoptions(edgeitems=50)
864875
print("Kernel output:")
865876
print(result)
@@ -901,7 +912,7 @@ def replace_typename_token(matchobj):
901912
# if the templated typename occurs as a token in the string, meaning that it is enclosed in
902913
# beginning of string or whitespace, and end of string, whitespace or star
903914
regex = r"(^|\s+)(" + k + r")($|\s+|\*)"
904-
sub = re.sub(regex, replace_typename_token, arg_type, re.S)
915+
sub = re.sub(regex, replace_typename_token, arg_type, flags=re.S)
905916
type_list[i] = sub
906917

907918

0 commit comments

Comments
 (0)