Add cblas_sdot and cblas_ddot to our CBLAS workaround list

staticfloat · staticfloat · commit b219b961e5a6 · 2022-03-08T15:36:25.000-08:00
Julia recently started using these, so we need to work around them in
LBT as well, for CBLAS-divergent BLAS libraries such as MKL v2022.
diff --git a/README.md b/README.md
@@ -60,6 +60,8 @@ You can always tell if your system is limited in this fashion by calling `lbt_ge
 
 ### Version History
 
+v5.0.2 - Add `cblas_sdot` and `cblas_ddot` to CBLAS divergence workaround wrappers.
+
 v5.0.1 - Fix complex return wrapper infinite loop bug.
 
 v5.0.0 - Add complex return value wrappers and CBLAS workaround.  The complex return value wrapper ensures that all symbols maintain a standard ABI for returning complex numbers, and the CBLAS workaround maps CBLAS symbols to FORTRAN symbols when properly-suffixed CBLAS symbols do not exist, as is the case in MKL `v2022.0`.
diff --git a/ext/gensymbol/generate_func_list.sh b/ext/gensymbol/generate_func_list.sh
@@ -107,20 +107,25 @@ echo "#endif" >> "${OUTPUT_FILE}"
 NUM_COMPLEX128_SYMBOLS="${NUM_SYMBOLS}"
 
 NUM_SYMBOLS=0
-CBLAS_SUB_FUNCS="$(grep -e '^cblas_.*_sub' <<< "${EXPORTED_FUNCS}")"
+# We manually curate a list of cblas functions that we have defined adapters for
+# in `src/cblas_adapters.c`.  This is our compromise between the crushing workload
+# of manually defining every single CBLAS function we need, and the practical need
+# to get Julia to pass its LinearAlgebra tests using MKL v2022.
+CBLAS_WORKAROUND_FUNCS="$(grep -e '^cblas_.*_sub$' <<< "${EXPORTED_FUNCS}")"
+CBLAS_WORKAROUND_FUNCS="${CBLAS_WORKAROUND_FUNCS} $(grep -e '^cblas_.dot$' <<< "${EXPORTED_FUNCS}")"
 echo >> "${OUTPUT_FILE}"
-echo "#ifndef CBLAS_SUB_FUNCS" >> "${OUTPUT_FILE}"
-echo "#define CBLAS_SUB_FUNCS(XX) \\" >> "${OUTPUT_FILE}"
-for func_name in ${CBLAS_SUB_FUNCS}; do
+echo "#ifndef CBLAS_WORKAROUND_FUNCS" >> "${OUTPUT_FILE}"
+echo "#define CBLAS_WORKAROUND_FUNCS(XX) \\" >> "${OUTPUT_FILE}"
+for func_name in ${CBLAS_WORKAROUND_FUNCS}; do
     output_func "${func_name}"
 done
 echo >> "${OUTPUT_FILE}"
 echo "#endif" >> "${OUTPUT_FILE}"
-NUM_CBLAS_SUB_SYMBOLS="${NUM_SYMBOLS}"
+NUM_CBLAS_WORKAROUND_SYMBOLS="${NUM_SYMBOLS}"
 
 # Report to the user and cleanup
 echo
 NUM_F2C_SYMBOLS="$((NUM_FLOAT32_SYMBOLS + NUM_COMPLEX64_SYMBOLS + NUM_COMPLEX128_SYMBOLS))"
 NUM_CMPLX_SYMBOLS="$((NUM_COMPLEX64_SYMBOLS + NUM_COMPLEX128_SYMBOLS))"
-echo "Done, with ${NUM_EXPORTED} symbols generated (${NUM_F2C_SYMBOLS} f2c, ${NUM_CMPLX_SYMBOLS} complex-returning, ${NUM_CBLAS_SUB_SYMBOLS} cblas-sub functions)."
+echo "Done, with ${NUM_EXPORTED} symbols generated (${NUM_F2C_SYMBOLS} f2c, ${NUM_CMPLX_SYMBOLS} complex-returning, ${NUM_CBLAS_WORKAROUND_SYMBOLS} cblas-workaround functions)."
 rm -f tempsymbols.def
diff --git a/src/Make.inc b/src/Make.inc
@@ -24,7 +24,7 @@ endif
 
 LBT_SOVERSION_MAJOR := 5
 LBT_SOVERSION_MINOR := 0
-LBT_SOVERSION_PATCH := 1
+LBT_SOVERSION_PATCH := 2
 
 ifeq ($(OS), WINNT)
   SHLIB_EXT := dll
diff --git a/src/cblas_adapters.c b/src/cblas_adapters.c
@@ -105,3 +105,45 @@ void lbt_cblas_cdotu_sub64_(const int64_t N,
 {
    *z = cdotu_64_(&N, X, &incX, Y, &incY);
 }
+
+// Adapters for the real-valued dot products: each `lbt_cblas_*` function takes N, incX
+// and incY by value and forwards their addresses to the pointer-taking FORTRAN-style symbol.
+extern float sdot_(const int32_t *,
+                   const float *, const int32_t *,
+                   const float *, const int32_t *);
+float lbt_cblas_sdot(const int32_t N,                  // float data, int32_t count/strides
+                     const float *X, const int32_t incX,
+                     const float *Y, const int32_t incY)
+{
+   return sdot_(&N, X, &incX, Y, &incY);               // X and Y are already pointers
+}
+
+extern float sdot_64_(const int64_t *,
+                      const float  *, const int64_t *,
+                      const float  *, const int64_t *);
+float lbt_cblas_sdot64_(const int64_t N,               // float data, int64_t count/strides
+                        const float  *X, const int64_t incX,
+                        const float  *Y, const int64_t incY)
+{
+   return sdot_64_(&N, X, &incX, Y, &incY);
+}
+
+extern double ddot_(const int32_t *,
+                    const double *, const int32_t *,
+                    const double *, const int32_t *);
+double lbt_cblas_ddot(const int32_t N,                 // double data, int32_t count/strides
+                      const double *X, const int32_t incX,
+                      const double *Y, const int32_t incY)
+{
+   return ddot_(&N, X, &incX, Y, &incY);
+}
+
+extern double ddot_64_(const int64_t *,
+                       const double  *, const int64_t *,
+                       const double  *, const int64_t *);
+double lbt_cblas_ddot64_(const int64_t N,              // double data, int64_t count/strides
+                         const double  *X, const int64_t incX,
+                         const double  *Y, const int64_t incY)
+{
+   return ddot_64_(&N, X, &incX, Y, &incY);
+}
diff --git a/src/exported_funcs.inc b/src/exported_funcs.inc
@@ -4986,11 +4986,13 @@
 
 #endif
 
-#ifndef CBLAS_SUB_FUNCS
-#define CBLAS_SUB_FUNCS(XX) \
+#ifndef CBLAS_WORKAROUND_FUNCS
+#define CBLAS_WORKAROUND_FUNCS(XX) \
     XX(cblas_cdotc_sub, 2547) \
     XX(cblas_cdotu_sub, 2549) \
     XX(cblas_zdotc_sub, 2695) \
     XX(cblas_zdotu_sub, 2697) \
+    XX(cblas_ddot, 2588) \
+    XX(cblas_sdot, 2655) \
 
 #endif
diff --git a/src/libblastrampoline_cblasdata.h b/src/libblastrampoline_cblasdata.h
@@ -1,7 +1,7 @@
 #define XX(name, index)     extern const void * lbt_##name ;
 #define XX_64(name, index)  extern const void * lbt_##name##64_ ;
-CBLAS_SUB_FUNCS(XX)
-CBLAS_SUB_FUNCS(XX_64)
+CBLAS_WORKAROUND_FUNCS(XX)
+CBLAS_WORKAROUND_FUNCS(XX_64)
 #undef XX
 #undef XX_64
 
@@ -11,11 +11,11 @@ CBLAS_SUB_FUNCS(XX_64)
 #define XX(name, index)    &lbt_##name,
 #define XX_64(name, index) &lbt_##name##64_,
 const void ** cblas32_func_wrappers[] = {
-    CBLAS_SUB_FUNCS(XX)
+    CBLAS_WORKAROUND_FUNCS(XX)
     NULL
 };
 const void ** cblas64_func_wrappers[] = {
-    CBLAS_SUB_FUNCS(XX_64)
+    CBLAS_WORKAROUND_FUNCS(XX_64)
     NULL
 };
 #undef XX
@@ -24,7 +24,7 @@ const void ** cblas64_func_wrappers[] = {
 // Finally, an array that maps cblas index -> exported symbol index
 #define XX(name, index)    index,
 const int cblas_func_idxs[] = {
-    CBLAS_SUB_FUNCS(XX)
+    CBLAS_WORKAROUND_FUNCS(XX)
     -1
 };
-#undef XX
+#undef XX
diff --git a/test/direct.jl b/test/direct.jl
@@ -259,5 +259,14 @@ if MKL_jll.is_available() && Sys.ARCH == :x86_64
         ccall(zdotu_fptr, Cvoid, (Int64, Ptr{ComplexF64}, Int64, Ptr{ComplexF64}, Int64, Ptr{ComplexF64}), 2, A, 1, B, 1, result)
         @test result[1] ≈ ComplexF64(1.47 + 3.83im)
         @test isempty(stacktraces)
+
+        # Also call `cblas_sdot64_`, asserting a correct result and that `stacktraces` stays empty.
+        empty!(stacktraces)
+        A = Float32[3.1, -1.0]
+        B = Float32[1.3, -1.1]
+        sdot_fptr = dlsym(lbt_handle, :cblas_sdot64_)
+        result = ccall(sdot_fptr, Cfloat, (Int64, Ptr{Float32}, Int64, Ptr{Float32}, Int64), 2, A, 1, B, 1)
+        @test result ≈ Float32(5.13)
+        @test isempty(stacktraces)
     end
 end