
Commit d034aff
Merge branch 'master' into refactor_interface
2 parents: 0c7ec04 + a60e060

File tree

17 files changed: +671 −290 lines


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+## [0.4.4] - 2023-03-09
 ### Added
 - Support for using time_limit in simulation mode
 - Helper functions for energy tuning
@@ -12,6 +13,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 
 ### Changed
 - Changed what timings are stored in cache files
+- No longer inserting partial loop unrolling factor of 0 in CUDA
 
 ## [0.4.3] - 2022-10-19
 ### Added
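The "time_limit in simulation mode" entry refers to replaying results from a cache file instead of compiling and benchmarking on hardware. As a minimal sketch of how the two might be combined (the cache file name and the vector_add setup are illustrative assumptions, not part of this commit):

    # hedged sketch: replay a previous run from cache, stopping after 60 seconds
    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params,
                               cache="vector_add_cache.json",  # hypothetical cache file
                               simulation_mode=True,
                               strategy_options=dict(time_limit=60))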

doc/source/conf.py

Lines changed: 2 additions & 2 deletions
@@ -59,9 +59,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = u'0.4.3'
+version = u'0.4.4'
 # The full version, including alpha/beta/rc tags.
-release = u'0.4.3'
+release = u'0.4.4'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

doc/source/optimization.rst

Lines changed: 15 additions & 5 deletions
@@ -28,16 +28,26 @@ the ``strategy=`` optional argument of ``tune_kernel()``. Kernel Tuner currently
 
 Most strategies have some mechanism built in to detect when to stop tuning, which may be controlled through specific
 parameters that can be passed to the strategies using the ``strategy_options=`` optional argument of ``tune_kernel()``. You
-can also override whatever internal stop criterion the strategy uses, and set either a time limit in seconds or a maximum
-number of unique function evaluations.
+can also override whatever internal stop criterion the strategy uses, and set either a time limit in seconds (using ``time_limit=``) or a maximum
+number of unique function evaluations (using ``max_fevals=``).
+
+To give an example, one could simply add these two arguments to any code calling ``tune_kernel()``:
+
+.. code-block:: python
+
+    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params,
+                               strategy="random_sample",
+                               strategy_options=dict(max_fevals=5))
+
 
 A 'unique function evaluation' corresponds to the first time that Kernel Tuner tries to compile and benchmark a parameter
 configuration that has been selected by the optimization strategy. If you are continuing from a previous tuning session using
-cache files, serving a value from the cache also counts as a function evaluation for the strategy. Only unique function
-evaluations are counted, so the second time a parameter configuration is selected by the strategy it is served from the
+cache files, serving a value from the cache for the first time in the run also counts as a function evaluation for the strategy.
+Only unique function evaluations are counted, so the second time a parameter configuration is selected by the strategy it is served from the
 cache, but not counted as a unique function evaluation.
 
-The ``strategy_options=`` argument of ``tune_kernel()`` should be used as follows:
+Below all the strategies are listed with their strategy-specific options that can be passed in a dictionary to the ``strategy_options=`` argument
+of ``tune_kernel()``.
 
 
 kernel_tuner.strategies.basinhopping
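The new hunk above demonstrates max_fevals; for comparison, a minimal sketch of the time_limit variant, assuming the same vector_add setup as in the documentation example (the choice of the basinhopping strategy here is an arbitrary illustration, not from this commit):

    # hedged sketch: stop on wall-clock time instead of evaluation count
    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params,
                               strategy="basinhopping",
                               strategy_options=dict(time_limit=30.0))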

examples/c/matrix_multiply.cpp

Lines changed: 0 additions & 17 deletions
This file was deleted.

examples/c/matrix_multiply.py

Lines changed: 0 additions & 38 deletions
This file was deleted.
New file

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+"""
+This is the vector_add example modified to show
+how to use PythonKernel with the CuPy backend
+"""
+
+import cupy as cp
+import numpy as np
+from kernel_tuner.kernelbuilder import PythonKernel
+
+def kernelbuilder_example():
+
+    # To make this example self-contained we include the kernel as a string
+    # here, but you can also just point to a file with the kernel code
+    kernel_string = """
+    __global__ void vector_add(float *c, float *a, float *b, int n) {
+        int i = blockIdx.x * block_size_x + threadIdx.x;
+        if (i<n) {
+            c[i] = a[i] + b[i];
+        }
+    }
+    """
+
+    # Setup the arguments for our vector add kernel
+    size = 100000
+    a = cp.random.randn(size).astype(np.float32)
+    b = cp.random.randn(size).astype(np.float32)
+    c = cp.zeros_like(b)
+    n = np.int32(size)
+
+    # Note that the type and order should match our GPU code
+    # Because the arguments are all CuPy arrays, our PythonKernel does not need to
+    # worry about moving data between host and device
+    args = [c, a, b, n]
+
+    # We can instantiate a specific kernel configuration
+    params = {"block_size_x": 128}
+
+    # Here we construct a Python object that represents the kernel
+    # we can use it to conveniently use the GPU kernel in Python
+    # applications that want to frequently call the GPU kernel
+    vector_add = PythonKernel("vector_add", kernel_string, size, args, params, lang="cupy")
+
+    # We can use the PythonKernel instance as a regular Python function
+    vector_add(c, a, b, n)
+
+    # Compare the result in c with a+b computed in Python
+    assert np.allclose(c, a+b)
+
+
+if __name__ == "__main__":
+    kernelbuilder_example()
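A PythonKernel instance appears to be bound to one specific configuration (the params dictionary above), so switching configurations means constructing another instance. A minimal sketch reusing only the signature shown in this file; block_size_x=256 is an arbitrary illustration, not part of the commit:

    # hypothetical: build and call the same kernel with a different block size
    params = {"block_size_x": 256}
    vector_add_256 = PythonKernel("vector_add", kernel_string, size, args, params, lang="cupy")
    vector_add_256(c, a, b, n)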

examples/cuda/reduction.cu

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ __global__ void sum_floats(float *sum_global, floatvector *array, int n) {
         sum = sh_mem[ti];
         #pragma unroll
         for (unsigned int s=16; s>0; s>>=1) {
-            sum += __shfl_down_sync(0, sum, s);
+            sum += __shfl_down_sync(0xffffffff, sum, s);
         }
     }
 #else
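Note on the fix: the first argument of __shfl_down_sync is a member mask naming the lanes that participate in the shuffle. A mask of 0 names no lanes, which is invalid usage; 0xffffffff includes all 32 lanes of the warp, matching the full-warp reduction this loop performs.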

examples/cuda/reduction.py

Lines changed: 1 addition & 1 deletion
@@ -11,10 +11,10 @@ def tune():
 
     tune_params = OrderedDict()
     tune_params["block_size_x"] = [2**i for i in range(5,11)]
-    tune_params["use_shuffle"] = [0, 1]
     tune_params["vector"] = [2**i for i in range(3)]
    tune_params["num_blocks"] = [2**i for i in range(5,16)]
     tune_params["loop_unroll_factor_0"] = [0, 1, 8, 16, 32, 64]
+    tune_params["use_shuffle"] = [0, 1]
 
     problem_size = "num_blocks"
     size = 800000000

kernel_tuner/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from kernel_tuner.integration import store_results, create_device_targets
 from kernel_tuner.interface import tune_kernel, run_kernel
 
-__version__ = "0.4.3"
+__version__ = "0.4.4"
