Skip to content

[FEA] NVIDIA nvvm defaults to fma=1 and wish there was a way to tell it fma=0 #118

@marioroy

Description

@marioroy

I like to compare CUDA and CPU results. NVIDIA's nvvm defaults to fma=1 behind the scenes. I can manage this using PyCUDA but not Numba's cuda.jit. The following exposes the fma option. Note: there is a blank line at the end of the diff. It is wonderful to see the cuda.jit results matching the CPU.

Exposing the fma option:

diff -uarp a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -63,7 +63,7 @@ class _Kernel(serialize.ReduceMixin):
     ]

     @global_compiler_lock
-    def __init__(self, py_func, argtypes, link=None, debug=False,
+    def __init__(self, py_func, argtypes, link=None, debug=False, fma=True,
                  lineinfo=False, inline=False, fastmath=False, extensions=None,
                  max_registers=None, lto=False, opt=True, device=False):

@@ -95,6 +95,7 @@ class _Kernel(serialize.ReduceMixin):

         nvvm_options = {
             'fastmath': fastmath,
+            'fma': fma,
             'opt': 3 if opt else 0
         }

diff -uarp a/numba_cuda/numba/cuda/cudadrv/nvvm.py b/numba_cuda/numba/cuda/cudadrv/nvvm.py
--- a/numba_cuda/numba/cuda/cudadrv/nvvm.py
+++ b/numba_cuda/numba/cuda/cudadrv/nvvm.py
@@ -632,6 +632,9 @@ def compile_ir(llvmir, **opts):
             'prec_sqrt': False,
         })
 
+    if not opts.pop('fma', True):
+        opts.update({ 'fma': False })
+
     cu = CompilationUnit()
     libdevice = LibDevice()
 

Use case: https://github.com/marioroy/mandelbrot-python

Disabling fma allows cuda.jit to match the CPU results. For example, launch mandel_kernel.py and press the letter x. That will auto-zoom to location 2. The total of the RGB values matches the CPU's.

diff --git a/app/mandel_common.py b/app/mandel_common.py
index e9055f5..2c7fc47 100644
--- a/app/mandel_common.py
+++ b/app/mandel_common.py
@@ -124,7 +124,7 @@ def _mandel1(colors, creal, cimag, max_iters):
     return INSIDE_COLOR1
 
 mandel1 = \
-    cuda.jit(device=True)(_mandel1) if USE_CUDA else \
+    cuda.jit(device=True, fma=False)(_mandel1) if USE_CUDA else \
     njit('UniTuple(u1,3)(i2[:,:], f8, f8, i4)', nogil=True)(_mandel1)
 
 
diff --git a/app/mandel_kernel.py b/app/mandel_kernel.py
index 056e57a..7037e6f 100644
--- a/app/mandel_kernel.py
+++ b/app/mandel_kernel.py
@@ -17,7 +17,7 @@ from numba import cuda, float32, uint8, int16, int32
 
 ESCAPE_RADIUS_2 = RADIUS * RADIUS
 
-@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4)')
+@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4)', fma=False)
 def mandelbrot1(temp, colors, width, height, min_x, min_y, step_x, step_y, max_iters):
 
     y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -31,7 +31,7 @@ def mandelbrot1(temp, colors, width, height, min_x, min_y, step_x, step_y, max_i
     temp[y,x] = mandel1(colors, creal, cimag, max_iters)
 
 
-@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4, i4, f8[:], u1[:,:,:])')
+@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4, i4, f8[:], u1[:,:,:])', fma=False)
 def mandelbrot2(temp, colors, width, height, min_x, min_y, step_x, step_y, max_iters, aafactor, offset, output):
 
     y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -131,7 +131,7 @@ def mandelbrot2(temp, colors, width, height, min_x, min_y, step_x, step_y, max_i
     output[y,x] = (uint8(r/aaarea), uint8(g/aaarea), uint8(b/aaarea))
 
 
-@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)')
+@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)', fma=False)
 def horizontal_gaussian_blur(matrix, src, dst, width, height):
 
     y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -169,7 +169,7 @@ def horizontal_gaussian_blur(matrix, src, dst, width, height):
     dst[y,x] = (uint8(r), uint8(g), uint8(b))
 
 
-@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)')
+@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)', fma=False)
 def vertical_gaussian_blur(matrix, src, dst, width, height):
 
     y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -207,7 +207,7 @@ def vertical_gaussian_blur(matrix, src, dst, width, height):
     dst[y,x] = (uint8(r), uint8(g), uint8(b))
 
 
-@cuda.jit('void(u1[:,:,:], u1[:,:,:], i4, i4)')
+@cuda.jit('void(u1[:,:,:], u1[:,:,:], i4, i4)', fma=False)
 def unsharp_mask(src, dst, width, height):
     """
     Sharpen the destination image using the Unsharp Mask technique.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions