I like to compare CUDA and CPU results. NVIDIA's nvvm defaults to fma=1 behind the scene. This I can manage using PyCUDA but not NUMBA cuda.jit. The following exposes the fma option. Note: there is a blank line at the end of the diff. Oh how happy seeing the cuda.jit results matching the CPU.
Disabling fma allows me to witness cuda.jit match the CPU results. E.g. Launch mandel_kernel.py and press the letter x. That will auto zoom to location 2. The RGB values total matches the CPU.
diff --git a/app/mandel_common.py b/app/mandel_common.py
index e9055f5..2c7fc47 100644
--- a/app/mandel_common.py
+++ b/app/mandel_common.py
@@ -124,7 +124,7 @@ def _mandel1(colors, creal, cimag, max_iters):
return INSIDE_COLOR1
mandel1 = \
- cuda.jit(device=True)(_mandel1) if USE_CUDA else \
+ cuda.jit(device=True, fma=False)(_mandel1) if USE_CUDA else \
njit('UniTuple(u1,3)(i2[:,:], f8, f8, i4)', nogil=True)(_mandel1)
diff --git a/app/mandel_kernel.py b/app/mandel_kernel.py
index 056e57a..7037e6f 100644
--- a/app/mandel_kernel.py
+++ b/app/mandel_kernel.py
@@ -17,7 +17,7 @@ from numba import cuda, float32, uint8, int16, int32
ESCAPE_RADIUS_2 = RADIUS * RADIUS
-@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4)')
+@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4)', fma=False)
def mandelbrot1(temp, colors, width, height, min_x, min_y, step_x, step_y, max_iters):
y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -31,7 +31,7 @@ def mandelbrot1(temp, colors, width, height, min_x, min_y, step_x, step_y, max_i
temp[y,x] = mandel1(colors, creal, cimag, max_iters)
-@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4, i4, f8[:], u1[:,:,:])')
+@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4, i4, f8[:], u1[:,:,:])', fma=False)
def mandelbrot2(temp, colors, width, height, min_x, min_y, step_x, step_y, max_iters, aafactor, offset, output):
y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -131,7 +131,7 @@ def mandelbrot2(temp, colors, width, height, min_x, min_y, step_x, step_y, max_i
output[y,x] = (uint8(r/aaarea), uint8(g/aaarea), uint8(b/aaarea))
-@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)')
+@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)', fma=False)
def horizontal_gaussian_blur(matrix, src, dst, width, height):
y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -169,7 +169,7 @@ def horizontal_gaussian_blur(matrix, src, dst, width, height):
dst[y,x] = (uint8(r), uint8(g), uint8(b))
-@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)')
+@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)', fma=False)
def vertical_gaussian_blur(matrix, src, dst, width, height):
y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -207,7 +207,7 @@ def vertical_gaussian_blur(matrix, src, dst, width, height):
dst[y,x] = (uint8(r), uint8(g), uint8(b))
-@cuda.jit('void(u1[:,:,:], u1[:,:,:], i4, i4)')
+@cuda.jit('void(u1[:,:,:], u1[:,:,:], i4, i4)', fma=False)
def unsharp_mask(src, dst, width, height):
"""
Sharpen the destination image using the Unsharp Mask technique.
I like to compare CUDA and CPU results. NVIDIA's nvvm defaults to fma=1 behind the scene. This I can manage using PyCUDA but not NUMBA cuda.jit. The following exposes the fma option. Note: there is a blank line at the end of the diff. Oh how happy seeing the cuda.jit results matching the CPU.
Exposing the fma option:
Use case: https://github.com/marioroy/mandelbrot-python
Disabling fma allows me to witness cuda.jit match the CPU results. E.g. Launch
mandel_kernel.pyand press the letterx. That will auto zoom to location 2. The RGB values total matches the CPU.