Merge pull request #263 from Hardcode84/f64-fixes

Diptorup Deb · web-flow · commit 0377c9e886d0 · 2023-05-16T14:26:19.000-05:00
numba-mlir f32 devices support
diff --git a/README.md b/README.md
@@ -98,11 +98,11 @@ SPDX-License-Identifier: Apache-2.0
 
 
         ```bash
-        $  dpbench -b black_scholes -i numba_mlir_k s run
+        $  dpbench -b black_scholes -i numba_mlir_k run
         ```
 
     to run all `numba-mlir` benchmarks:
 
         ```bash
-        $  dpbench -b black_scholes -i numba_mlir_k,numba_mlir_n,numba_mlir_p s run
+        $  dpbench -b black_scholes -i numba_mlir_k,numba_mlir_n,numba_mlir_p run
         ```
diff --git a/dpbench/benchmarks/black_scholes/black_scholes_numba_mlir_k.py b/dpbench/benchmarks/black_scholes/black_scholes_numba_mlir_k.py
@@ -7,7 +7,7 @@
 import numba_mlir.kernel as nb
 
 
-@nb.kernel
+@nb.kernel(gpu_fp64_truncate="auto")
 def _black_scholes_kernel(nopt, price, strike, t, rate, volatility, call, put):
     mr = -rate
     sig_sig_two = volatility * volatility * 2
diff --git a/dpbench/benchmarks/black_scholes/black_scholes_numba_mlir_n.py b/dpbench/benchmarks/black_scholes/black_scholes_numba_mlir_n.py
@@ -13,7 +13,7 @@ def _nberf(x):
     return erf(x)
 
 
-@nb.njit(parallel=True, fastmath=True)
+@nb.njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def _black_scholes(price, strike, t, rate, volatility, call, put):
     mr = -rate
     sig_sig_two = volatility * volatility * 2
diff --git a/dpbench/benchmarks/black_scholes/black_scholes_numba_mlir_p.py b/dpbench/benchmarks/black_scholes/black_scholes_numba_mlir_p.py
@@ -9,7 +9,7 @@
 
 
 # blackscholes implemented as a parallel loop using numba.prange
-@nb.njit(parallel=True, fastmath=True)
+@nb.njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def _black_scholes(nopt, price, strike, t, rate, volatility, call, put):
     mr = -rate
     sig_sig_two = volatility * volatility * 2
diff --git a/dpbench/benchmarks/kmeans/kmeans_numba_mlir_k.py b/dpbench/benchmarks/kmeans/kmeans_numba_mlir_k.py
@@ -9,7 +9,7 @@
 atomic_add = nb.atomic.add
 
 
-@nb.kernel
+@nb.kernel(gpu_fp64_truncate="auto")
 def groupByCluster(arrayP, arrayPcluster, arrayC, num_points, num_centroids):
     idx = nb.get_global_id(0)
     # if idx < num_points: # why it was removed??
@@ -23,15 +23,15 @@ def groupByCluster(arrayP, arrayPcluster, arrayC, num_points, num_centroids):
             arrayPcluster[idx] = i
 
 
-@nb.kernel
+@nb.kernel(gpu_fp64_truncate="auto")
 def calCentroidsSum1(arrayCsum, arrayCnumpoint):
     i = nb.get_global_id(0)
     arrayCsum[i, 0] = 0
     arrayCsum[i, 1] = 0
     arrayCnumpoint[i] = 0
 
 
-@nb.kernel
+@nb.kernel(gpu_fp64_truncate="auto")
 def calCentroidsSum2(arrayP, arrayPcluster, arrayCsum, arrayCnumpoint):
     i = nb.get_global_id(0)
     ci = arrayPcluster[i]
@@ -40,14 +40,14 @@ def calCentroidsSum2(arrayP, arrayPcluster, arrayCsum, arrayCnumpoint):
     atomic_add(arrayCnumpoint, ci, 1)
 
 
-@nb.kernel
+@nb.kernel(gpu_fp64_truncate="auto")
 def updateCentroids(arrayC, arrayCsum, arrayCnumpoint, num_centroids):
     i = nb.get_global_id(0)
     arrayC[i, 0] = arrayCsum[i, 0] / arrayCnumpoint[i]
     arrayC[i, 1] = arrayCsum[i, 1] / arrayCnumpoint[i]
 
 
-@nb.kernel
+@nb.kernel(gpu_fp64_truncate="auto")
 def copy_arrayC(arrayC, arrayP):
     i = nb.get_global_id(0)
     arrayC[i, 0] = arrayP[i, 0]
diff --git a/dpbench/benchmarks/kmeans/kmeans_numba_mlir_p.py b/dpbench/benchmarks/kmeans/kmeans_numba_mlir_p.py
@@ -11,7 +11,7 @@
 
 
 # determine the euclidean distance from the cluster center to each point
-@nb.njit
+@nb.njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def groupByCluster(arrayP, arrayPcluster, arrayC, num_points, num_centroids):
     # parallel for loop
     for i0 in numba.prange(num_points):
@@ -27,7 +27,7 @@ def groupByCluster(arrayP, arrayPcluster, arrayC, num_points, num_centroids):
 
 
 # assign points to cluster
-@nb.njit
+@nb.njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def calCentroidsSum(
     arrayP, arrayPcluster, arrayCsum, arrayCnumpoint, num_points, num_centroids
 ):
@@ -38,7 +38,7 @@ def calCentroidsSum(
         arrayCnumpoint[i] = 0
 
 
-@nbk.kernel
+@nbk.kernel(gpu_fp64_truncate="auto")
 def calCentroidsSum2(arrayP, arrayPcluster, arrayCsum, arrayCnumpoint):
     i = nbk.get_global_id(0)
     ci = arrayPcluster[i]
@@ -48,14 +48,14 @@ def calCentroidsSum2(arrayP, arrayPcluster, arrayCsum, arrayCnumpoint):
 
 
 # update the centriods array after computation
-@nb.njit
+@nb.njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def updateCentroids(arrayC, arrayCsum, arrayCnumpoint, num_centroids):
     for i in numba.prange(num_centroids):
         arrayC[i, 0] = arrayCsum[i, 0] / arrayCnumpoint[i]
         arrayC[i, 1] = arrayCsum[i, 1] / arrayCnumpoint[i]
 
 
-@nb.njit
+@nb.njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def copy_arrayC(arrayC, arrayP, num_centroids):
     for i in numba.prange(num_centroids):
         arrayC[i, 0] = arrayP[i, 0]
diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_k.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_k.py
@@ -6,7 +6,7 @@
 import numpy as np
 
 
-@nb.kernel
+@nb.kernel(gpu_fp64_truncate="auto")
 def _pairwise_distance_kernel(X1, X2, D):
     i = nb.get_global_id(0)
 
diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_n.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_n.py
@@ -6,7 +6,7 @@
 import numpy as np
 
 
-@nb.njit(parallel=True, fastmath=True)
+@nb.njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def _pairwise_distance(X1, X2, D):
     x1 = np.sum(np.square(X1), axis=1)
     x2 = np.sum(np.square(X2), axis=1)
diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_p.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_p.py
@@ -7,7 +7,7 @@
 import numpy as np
 
 
-@nb.njit(parallel=True, fastmath=True)
+@nb.njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def _pairwise_distance(X1, X2, D):
     """Naïve pairwise distance impl - take an array representing M points in N
     dimensions, and return the M x M matrix of Euclidean distances
diff --git a/dpbench/benchmarks/rambo/rambo_numba_mlir_k.py b/dpbench/benchmarks/rambo/rambo_numba_mlir_k.py
@@ -7,7 +7,7 @@
 import numba_mlir.kernel as nb
 
 
-@nb.kernel
+@nb.kernel(gpu_fp64_truncate="auto")
 def _rambo(C1, F1, Q1, nout, output):
     i = nb.get_global_id(0)
     for j in range(nout):
diff --git a/dpbench/benchmarks/rambo/rambo_numba_mlir_n.py b/dpbench/benchmarks/rambo/rambo_numba_mlir_n.py
@@ -6,7 +6,7 @@
 import numpy as np
 
 
-@nb.njit
+@nb.njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def rambo(nevts, nout, C1, F1, Q1, output):
     C = 2.0 * C1 - 1.0
     S = np.sqrt(1 - np.square(C))
diff --git a/dpbench/benchmarks/rambo/rambo_numba_mlir_p.py b/dpbench/benchmarks/rambo/rambo_numba_mlir_p.py
@@ -7,7 +7,7 @@
 import numpy as np
 
 
-@nb.njit
+@nb.njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def rambo(nevts, nout, C1, F1, Q1, output):
     for i in numba.prange(nevts):
         for j in numba.prange(nout):