OpenMathLib
diff --git a/‎CMakeLists.txt‎
Lines changed: 8 additions & 5 deletions b/‎CMakeLists.txt‎
Lines changed: 8 additions & 5 deletions
diff --git a/‎Changelog.txt‎
Lines changed: 134 additions & 0 deletions b/‎Changelog.txt‎
Lines changed: 134 additions & 0 deletions
diff --git a/‎Makefile.arm64‎
Lines changed: 10 additions & 0 deletions b/‎Makefile.arm64‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎Makefile.power‎
Lines changed: 6 additions & 6 deletions b/‎Makefile.power‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎Makefile.prebuild‎
Lines changed: 2 additions & 2 deletions b/‎Makefile.prebuild‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎Makefile.riscv64‎
Lines changed: 4 additions & 4 deletions b/‎Makefile.riscv64‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎Makefile.rule‎
Lines changed: 3 additions & 1 deletion b/‎Makefile.rule‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎Makefile.system‎
Lines changed: 15 additions & 0 deletions b/‎Makefile.system‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎Makefile.tail‎
Lines changed: 5 additions & 2 deletions b/‎Makefile.tail‎
Lines changed: 5 additions & 2 deletions
@@ -9,7 +9,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 29.dev)
+set(OpenBLAS_PATCH_VERSION 30.dev)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
@@ -152,6 +152,9 @@ endif ()
 if (NOT DEFINED BUILD_BFLOAT16)
  set (BUILD_BFLOAT16 false)
 endif ()
+if (NOT DEFINED BUILD_HFLOAT16)
+ set (BUILD_HFLOAT16 false)
+endif ()
 # set which float types we want to build for
 if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
   # if none are defined, build for all
@@ -305,8 +308,8 @@ if (USE_OPENMP)
   endif()
 endif()
 
-# Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on
-if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
+# Fix "Argument list too long" for macOS with POWERPC or Intel CPUs 
+if(APPLE AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
   # Use response files
   set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
   # Always build static library first
@@ -541,13 +544,13 @@ message(STATUS "adding postbuild instruction to rename syms")
   if (NOT USE_PERL)
   add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
 	  COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
-    COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so
+    COMMAND objcopy --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so
     COMMENT "renaming symbols"
     )
   else()
   add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
     COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
-    COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
+    COMMAND objcopy --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
     COMMENT "renaming symbols"
     )
   endif()
 
@@ -1,4 +1,138 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.30
+19-Jun-2025
+
+general:
+ - fixed an installation problem with the thread safety test in gmake builds
+ - fixed spurious overwriting of an input array in complex GEMMT/GEMMTR
+ - fixed naming of GEMMTR in error messages from XERBLA
+ - fixed compilation of SBGEMMT/SBGEMMTR in CMake builds
+ - fixed the implementation of ?NRM2 to handle INCX=0 correctly
+ - removed tests for CSROT and ZDROT that relied on unspecified behavior
+ - fixed a performance regression in multithreaded GEMM that was particularly
+   serious on POWER targets
+ - fixed linking issues when using LLVM's flang-new with gmake
+ - fixed a potential thread safety problem with C11 atomic operations
+ - further improved the workload partitioning in parallel GEMM
+ - fixed omission of LAPACKE interfaces for CGESVDQ, CTRSYL3 and ?GEQPF in
+   CMake builds
+ - fixed mishandling of setting NO_LAPACK to FALSE, and incorrect dependencies
+   for LAPACK function SPMV in CMake builds
+ - added explicit CMake options for building LAPACKE and shared libraries
+ - simplified and improved handling of OpenMP options in CMake builds
+ - reworked Windows DLL generation in CMake builds to ensure correct symbol
+   renaming (pre/postfixing) and optional generation of PDB files for debugging
+ - updated the Perl script version of the gensymbol utility for use with 
+   Windows-on-Arm
+ - fixed building with (MinGW) gmake on Windows to ensure completeness of the
+   LAPACK included in the static library (potential race condition due to the
+   Windows version of the "ln" utility creating snapshot copies rather than links)
+ - fixed unwanted deletion of the lapacke_mangling.h file by "make clean"
+ - fixed potential duplication of a _64 suffix on library names in CMake builds
+ - fixed compilation of the C fallback copies of the LAPACK code with GCC 15
+ - included fixes from the Reference-LAPACK project:
+   - fixed a truncated error message in the EIG part of the testsuite
+     (Reference-LAPACK PR 1119)
+   - fixed too strict check in LAPACKE_?gesdd_work (PR #1126)
+   - fixed memory corruption when calling ?GEEV with non-finite data (PR #1128)
+   - fixed missing initialization of a variable in C/GEQP3RK (PR #1131)
+   - fixed 2nd dimension chosen in C/ZUNMLQ transposition operation (PR #1135)
+
+x86_64:
+ - fixed an error in the SBGEMV kernel for Cooper Lake/Sapphire Rapids
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - improved the compiler identification code for flang-new
+ - fixed a potential build issue in the ZSUM kernel
+ - fixed "argument list too long" errors when building on MacOS
+ - added cpu autodetection support for several new Arrow Lake models
+ - fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH
+ - fixed compilation with the MinGW build of GCC 15
+
+arm64:
+ - fixed cpu type detection of A64FX and some ThunderX models (broken in 0.3.29)
+ - added support for the AmpereOne/1A cpus in DYNAMIC_ARCH builds
+ - added an optimized SBGEMM kernel for NEOVERSEV1
+ - improved 1xN SBGEMM performance by forwarding to SBGEMV 
+ - introduced a stepwise increase of the thread count used for
+   SGEMM and SGEMV on NEOVERSEV1/V2 in relation to problem size
+ - introduced a stepwise increase of the thread count used for
+   DGEMV on NEOVERSEV1 in relation to problem size
+ - introduced a stepwise increase of the thread count used for
+   SDOT and DDOT on NEOVERSEV1 in relation to problem size
+ - worked around assembler limitations in LLVM for Windows-on-Arm
+ - enabled cpu type autodetection from the registry on Windows-on-Arm
+ - improved multithreading threshold for GEMV and GESV on Windows-on-Arm
+ - fixed overoptimization issues with LLVM's flang in Windows-on-Arm
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - added a fast path SGEMM kernel for small workloads on SME capable targets
+ - improved performance of SGEMM and DGEMM kernels for small workloads
+ - improved performance of SGEMV and DGEMV on SVE-capable targets
+ - improved performance of SGEMV on NEOVERSEN1 and Apple M
+ - added optimized SSYMV and DSYMV kernels for NEOVERSEN1, Apple M and all
+   SVE capable targets
+ - added optimized SBGEMV kernels for NEOVERSEV1/V2/N2
+ - improved performance of SGEMM through faster NCOPY kernels
+ - added compiler options for the NVIDIA HPC Compiler Suite
+ - fixed compilation on OSX with XCode 16.3 and later
+ - fixed cpu core type and cache size detection on Apple M4
+ - updated GEMM parameter settings for Neoverse cpus in cross-builds with CMake
+ - fixed default compiler options for NEOVERSEN1 and CORTEXX2 in CMake builds
+ - fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH
+ - fixed potential miscompilation of the non-SVE SDOT kernel
+
+riscv64:
+ - added optimized SROTM and DROTM kernels for x280
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - improved performance of GEMM_TCOPY on RVV1.0 targets with 
+   VLEN of 128 or 256
+ - improved performance of OMATCOPY on targets with VLEN 256
+ - greatly improved performance of SGEMV/DGEMV
+ - improved performance of CGEMV and ZGEMV on C910V and all RVV targets 
+   with VLEN 256
+ - improved performance of SAXPBY and DAXPBY on C910V and all RVV targets 
+   with VLEN 256
+ - improved performance of AXPY and DOT on C910V and ZVL256B targets by
+   falling back to non-vectorized code for very small N. (Thereby fixing
+   poor performance of CHBMV/ZHBMV for very small K)
+ - fixed CMake build failures of the TRMM kernels 
+
+loongarch64:
+ - improved performance of the LSX versions of SSYMV/DSYMV
+ - made the LASX versions of the DSYMV and SSYMV kernels 
+   compatible with hardware changes in LA664 and future targets
+ - fixed inaccuracies in several LASX kernels
+ - improved compatibility of LSX kernels with LA264 targets
+ - fixed handling of deprecated target names in CMake builds
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+
+power:
+ - fixed building for PPCG4 with CMake
+ - fixed SSCAL/DSCAL on PPC970 running FreeBSD
+ - fixed a potential alignment issue in the POWER8 SGEMV kernel
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+
+zarch:
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - fixed unwanted generation of object files with a writable stack
+
+x86:
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - worked around potential miscompilation of CDOT with very old binutils
+
+arm:
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - fixed unwanted generation of object files with a writable stack
+
+sparc:
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+
+alpha:
+ - fixed build failure caused by spurious Windows-only typecasts
+
+cell:
+ - fixed probable build issue caused by spurious Windows-only typecasts
+ 
 ====================================================================
 Version 0.3.29
 12-Jan-2025
 
@@ -191,6 +191,16 @@ endif
 endif
 endif
 
+# Compiler options for Ampere AmpereOne (ampere1, ampere1a) processors.
+ifeq ($(CORE), AMPERE1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
+CCOMMON_OPT += -march=armv8.6-a+crypto+crc+fp16+sha3+rng
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.6-a+crypto+crc+fp16+sha3+rng
+endif
+endif
+endif
+
 # Use a53 tunings because a55 is only available in GCC>=8.1
 ifeq ($(CORE), CORTEXA55)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
 
@@ -13,16 +13,16 @@ ifeq ($(CORE), POWER10)
 ifneq ($(C_COMPILER), PGI)
 ifeq ($(C_COMPILER), GCC)
 ifeq ($(GCCVERSIONGTEQ10), 1)
-CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+CCOMMON_OPT += -O3 -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
 else ifneq ($(GCCVERSIONGT4), 1)
 $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
-CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
+CCOMMON_OPT += -O3 -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
 else
 $(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
-CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
+CCOMMON_OPT += -O3 -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
 endif
 else
-CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+CCOMMON_OPT += -O3 -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
 endif
 ifeq ($(F_COMPILER), IBM)
 FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
@@ -34,7 +34,7 @@ endif
 
 ifeq ($(CORE), POWER9)
 ifneq ($(C_COMPILER), PGI)
-CCOMMON_OPT += -Ofast -mvsx -fno-fast-math
+CCOMMON_OPT += -O3 -mvsx -fno-fast-math
 ifeq ($(C_COMPILER), GCC)
 ifneq ($(GCCVERSIONGT4), 1)
 $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
@@ -70,7 +70,7 @@ endif
 
 ifeq ($(CORE), POWER8)
 ifneq ($(C_COMPILER), PGI)
-CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx  -fno-fast-math
+CCOMMON_OPT += -O3 -mcpu=power8 -mtune=power8 -mvsx  -fno-fast-math
 else
 CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
 endif
 
@@ -64,11 +64,11 @@ TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
 endif
 
 ifeq ($(TARGET), RISCV64_ZVL256B)
-TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 
 ifeq ($(TARGET), RISCV64_ZVL128B)
-TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 
 ifeq ($(TARGET), RISCV64_GENERIC)
 
@@ -7,12 +7,12 @@ CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d
 FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
 ifeq ($(CORE), RISCV64_ZVL256B)
-CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
-FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
+CCOMMON_OPT += -march=rv64imafdcv_zvl256b_zvfh_zfh -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 ifeq ($(CORE), RISCV64_ZVL128B)
-CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d 
-FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
+CCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d 
+FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 ifeq ($(CORE), RISCV64_GENERIC)
 CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
 
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.29.dev
+VERSION = 0.3.30.dev
 
 # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
 # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
@@ -308,6 +308,8 @@ COMMON_PROF = -pg
 # If you want to enable the experimental BFLOAT16 support
 # BUILD_BFLOAT16 = 1
 
+# If you want to enable the experimental HFLOAT16 support
+# BUILD_HFLOAT16 = 1
 
 # Set the thread number threshold beyond which the job array for the threaded level3 BLAS
 # will be allocated on the heap rather than the stack. (This array alone requires 
 
@@ -393,6 +393,8 @@ GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
 GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
 GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
 GCCVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
+GCCVERSIONGTEQ13 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 13)
+GCCVERSIONGTEQ14 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 14)
 # Note that the behavior of -dumpversion is compile-time-configurable for
 # gcc-7.x and newer. Use -dumpfullversion there
 ifeq ($(GCCVERSIONGTEQ7),1)
@@ -1191,6 +1193,13 @@ endif
 else ifeq ($(ARCH), $(filter $(ARCH),mips))
 FCOMMON_OPT += -mabi=32
 endif
+ifeq ($(ARCH), $(filter $(ARCH),loongarch64))
+ifdef INTERFACE64
+ifneq ($(INTERFACE64), 0)
+FCOMMON_OPT +=  -fdefault-integer-8
+endif
+endif
+endif
 else
 ifdef BINARY64
 ifneq ($(OSNAME), AIX)
@@ -1547,6 +1556,9 @@ endif
 ifeq ($(BUILD_BFLOAT16), 1)
 CCOMMON_OPT += -DBUILD_BFLOAT16
 endif
+ifeq ($(BUILD_HFLOAT16), 1)
+CCOMMON_OPT += -DBUILD_HFLOAT16
+endif
 ifeq ($(BUILD_SINGLE), 1)
 CCOMMON_OPT += -DBUILD_SINGLE=1
 endif
@@ -1889,11 +1901,14 @@ export TARGET_CORE
 export NO_AVX512
 export NO_AVX2
 export BUILD_BFLOAT16
+export BUILD_HFLOAT16
 export NO_LSX
 export NO_LASX
 
 export SBGEMM_UNROLL_M
 export SBGEMM_UNROLL_N
+export SHGEMM_UNROLL_M
+export SHGEMM_UNROLL_N
 export SGEMM_UNROLL_M
 export SGEMM_UNROLL_N
 export DGEMM_UNROLL_M
 
@@ -1,4 +1,5 @@
 SBBLASOBJS_P = $(SBBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
+SHBLASOBJS_P = $(SHBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
 SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
 DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
 QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
@@ -11,8 +12,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX))
 
 HPLOBJS_P   = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX))
 
-BLASOBJS    = $(SBEXTOBJS) $(SBBLASOBJS)  $(SBLASOBJS)   $(DBLASOBJS)   $(CBLASOBJS)   $(ZBLASOBJS) $(CBAUXOBJS)
-BLASOBJS_P  = $(SBEXTOBJS_P) $(SBBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P)
+BLASOBJS    = $(SHBLASOBJS) $(SBEXTOBJS) $(SBBLASOBJS)  $(SBLASOBJS)   $(DBLASOBJS)   $(CBLASOBJS)   $(ZBLASOBJS) $(CBAUXOBJS)
+BLASOBJS_P  = $(SHBLASOBJS_P) $(SBEXTOBJS_P) $(SBBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P)
 
 ifdef EXPRECISION
 BLASOBJS   += $(QBLASOBJS)   $(XBLASOBJS)
@@ -24,6 +25,7 @@ BLASOBJS   += $(QBLASOBJS)   $(XBLASOBJS)
 BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
 endif
 
+$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHFLOAT16 -UDOUBLE  -UCOMPLEX
 $(SBBLASOBJS) $(SBBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE  -UCOMPLEX
 $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE  -UCOMPLEX
 $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE  -UCOMPLEX
@@ -33,6 +35,7 @@ $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE  -DCOMPLEX
 $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
 $(SBEXTOBJS) $(SBEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE  -UCOMPLEX
 
+$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
 $(SBBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
 $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
 $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)