1. optimize CS_Matrix_Tools::cal_uplimit()

PeizeLin · PeizeLin · commit 02f2e1fe9c98 · 2022-12-10T14:56:37.000+08:00
2. update Blas_Interface::matcopy()
diff --git a/include/RI/global/Blas_Interface-Contiguous.h b/include/RI/global/Blas_Interface-Contiguous.h
@@ -100,4 +100,58 @@ namespace Blas_Interface
 	}
 }
 
+
+
+#ifdef __MKL_RI
+
+namespace Blas_Interface
+{
+	inline size_t get_lda_matcopy(const char ordering, size_t rows, size_t cols)	// leading dimension of a contiguous rows x cols source matrix; throws std::invalid_argument for an unknown ordering
+	{
+		switch(std::toupper(static_cast<unsigned char>(ordering)))	// case-insensitive; the cast avoids undefined behaviour of std::toupper on negative char values
+		{
+			case 'R':	return cols;	// 'R': consecutive rows are cols elements apart
+			case 'C':	return rows;	// 'C': consecutive columns are rows elements apart
+			default:	throw std::invalid_argument(std::string("ordering cannot be ")+ordering+". "+std::string(__FILE__)+" line "+std::to_string(__LINE__));	// append the character itself: std::to_string(char) would print its integer code
+		}
+	}
+	inline size_t get_ldb_matcopy(const char ordering, const char trans, size_t rows, size_t cols)	// leading dimension of the contiguous result of op(A), A being rows x cols; throws std::invalid_argument for an unknown ordering or trans
+	{
+		switch(std::toupper(static_cast<unsigned char>(ordering)))	// case-insensitive; the cast avoids undefined behaviour of std::toupper on negative char values
+		{
+			case 'R':	// 'R': the result's leading dimension is its column count
+				switch(std::toupper(static_cast<unsigned char>(trans)))
+				{
+					case 'N':	case 'R':	return cols;	// shape kept: result is rows x cols
+					case 'T':	case 'C':	return rows;	// shape swapped: result is cols x rows
+					default:	throw std::invalid_argument(std::string("trans cannot be ")+trans+". "+std::string(__FILE__)+" line "+std::to_string(__LINE__));	// append the character itself, not its integer code
+				}
+			case 'C':	// 'C': the result's leading dimension is its row count
+				switch(std::toupper(static_cast<unsigned char>(trans)))
+				{
+					case 'N':	case 'R':	return rows;	// shape kept: result is rows x cols
+					case 'T':	case 'C':	return cols;	// shape swapped: result is cols x rows
+					default:	throw std::invalid_argument(std::string("trans cannot be ")+trans+". "+std::string(__FILE__)+" line "+std::to_string(__LINE__));	// append the character itself, not its integer code
+				}
+			default:	throw std::invalid_argument(std::string("ordering cannot be ")+ordering+". "+std::string(__FILE__)+" line "+std::to_string(__LINE__));	// std::to_string(char) would print the integer code
+		}
+	}
+	template<typename T>
+	inline void imatcopy (const char ordering, const char trans, size_t rows, size_t cols, const T alpha, T * AB)	// contiguous-storage convenience overload: both leading dimensions are derived from rows/cols
+	{
+		const size_t ld_src = get_lda_matcopy(ordering, rows, cols);	// leading dimension passed as lda
+		const size_t ld_dst = get_ldb_matcopy(ordering, trans, rows, cols);	// leading dimension passed as ldb
+		imatcopy (ordering, trans, rows, cols, alpha, AB, ld_src, ld_dst);	// forwards to the overload that takes explicit lda/ldb
+	}
+	template<typename T>
+	inline void omatcopy (char ordering, char trans, size_t rows, size_t cols, const T alpha, const T * A, T * B)	// contiguous-storage convenience overload: both leading dimensions are derived from rows/cols
+	{
+		const size_t ld_src = get_lda_matcopy(ordering, rows, cols);	// leading dimension of A
+		const size_t ld_dst = get_ldb_matcopy(ordering, trans, rows, cols);	// leading dimension of B
+		omatcopy (ordering, trans, rows, cols, alpha, A, ld_src, B, ld_dst);	// forwards to the overload that takes explicit lda/ldb
+	}
+}
+
+#endif
+
 }
diff --git a/include/RI/global/Blas_Interface-Tensor.h b/include/RI/global/Blas_Interface-Tensor.h
@@ -16,8 +16,8 @@ namespace RI
 namespace Blas_Interface
 {
 	// nrm2 = ||x||_2
-	template<typename T>
-	inline Global_Func::To_Real_t<T> nrm2(const Tensor<T> &X)
+	template<typename T, template<typename> class Tvec>	// Tvec: any one-parameter class template whose objects provide get_shape_all() and ptr(), the two members used below
+	inline Global_Func::To_Real_t<T> nrm2(const Tvec<T> &X)
 	{
 		return nrm2(X.get_shape_all(), X.ptr());
 	}
@@ -187,6 +187,41 @@ namespace Blas_Interface
 	}
 }
 
+
+#ifdef __MKL_RI
+
+namespace Blas_Interface
+{
+	template<typename T>
+	inline void imatcopy (const char trans, const T alpha, Tensor<T> &AB)	// in-place matcopy of a rank-2 tensor stored with ordering 'R'; AB's shape is swapped for 'T'/'C'; alpha is forwarded unchanged
+	{
+		assert(AB.shape.size()==2);	// only matrices are supported
+		imatcopy ('R', trans, AB.shape[0], AB.shape[1], alpha, AB.ptr());	// an invalid trans already throws in here, before any data is touched
+		switch(std::toupper(static_cast<unsigned char>(trans)))	// the cast avoids undefined behaviour of std::toupper on negative char values
+		{
+			case 'N':	case 'R':	break;	// shape unchanged
+			case 'T':	case 'C':	AB=AB.reshape({AB.shape[1], AB.shape[0]});	break;	// data is now stored transposed: swap the two extents
+			default:	throw std::invalid_argument(std::string("trans cannot be ")+trans+". "+std::string(__FILE__)+" line "+std::to_string(__LINE__));	// append the character itself: std::to_string(char) would print its integer code
+		}
+	}
+	template<typename T>
+	inline Tensor<T> omatcopy (char trans, const T alpha, const Tensor<T> &A)	// out-of-place matcopy of a rank-2 tensor stored with ordering 'R'; returns a new tensor, A is left untouched
+	{
+		assert(A.shape.size()==2);	// only matrices are supported
+		Tensor<T> B;
+		switch(std::toupper(static_cast<unsigned char>(trans)))	// the cast avoids undefined behaviour of std::toupper on negative char values
+		{
+			case 'N':	case 'R':	B = Tensor<T>({A.shape[0], A.shape[1]});	break;	// result keeps A's shape
+			case 'T':	case 'C':	B = Tensor<T>({A.shape[1], A.shape[0]});	break;	// result has A's extents swapped
+			default:	throw std::invalid_argument(std::string("trans cannot be ")+trans+". "+std::string(__FILE__)+" line "+std::to_string(__LINE__));	// append the character itself: std::to_string(char) would print its integer code
+		}
+		omatcopy ('R', trans, A.shape[0], A.shape[1], alpha, A.ptr(), B.ptr());	// rows/cols always describe the source A
+		return B;
+	}
+}
+
+#endif
+
 }
 
 #include "Tensor.hpp"
diff --git a/include/RI/global/Blas_Interface.h b/include/RI/global/Blas_Interface.h
@@ -10,6 +10,11 @@
 #include <string>
 #include <stdexcept>
 
+
+#ifdef __MKL_RI
+#include <mkl_trans.h>
+#endif
+
 namespace RI
 {
 
@@ -211,4 +216,47 @@ namespace Blas_Interface
 	}
 }
 
+
+
+#ifdef __MKL_RI
+
+namespace Blas_Interface
+{
+	inline void imatcopy (const char ordering, const char trans, size_t rows, size_t cols, const float alpha, float * AB, size_t lda, size_t ldb)	// overload set: each imatcopy forwards its arguments unchanged to the MKL mkl_?imatcopy routine matching the element type
+	{
+		mkl_simatcopy (ordering, trans, rows, cols, alpha, AB, lda, ldb);	// float
+	}
+	inline void imatcopy (const char ordering, const char trans, size_t rows, size_t cols, const double alpha, double * AB, size_t lda, size_t ldb)
+	{
+		mkl_dimatcopy (ordering, trans, rows, cols, alpha, AB, lda, ldb);	// double
+	}
+	inline void imatcopy (const char ordering, const char trans, size_t rows, size_t cols, const std::complex<float> alpha, std::complex<float> * AB, size_t lda, size_t ldb)
+	{
+		mkl_cimatcopy (ordering, trans, rows, cols, alpha, AB, lda, ldb);	// std::complex<float>; NOTE(review): presumably requires MKL_Complex8 to be std::complex<float> in this build -- confirm
+	}
+	inline void imatcopy (const char ordering, const char trans, size_t rows, size_t cols, const std::complex<double> alpha, std::complex<double> * AB, size_t lda, size_t ldb)
+	{
+		mkl_zimatcopy (ordering, trans, rows, cols, alpha, AB, lda, ldb);	// std::complex<double>; NOTE(review): presumably requires MKL_Complex16 to be std::complex<double> in this build -- confirm
+	}
+
+	inline void omatcopy (char ordering, char trans, size_t rows, size_t cols, const float alpha, const float * A, size_t lda, float * B, size_t ldb)	// overload set: each omatcopy forwards its arguments unchanged to the MKL mkl_?omatcopy routine matching the element type
+	{
+		mkl_somatcopy (ordering, trans, rows, cols, alpha, A, lda, B, ldb);	// float
+	}
+	inline void omatcopy (char ordering, char trans, size_t rows, size_t cols, const double alpha, const double * A, size_t lda, double * B, size_t ldb)
+	{
+		mkl_domatcopy (ordering, trans, rows, cols, alpha, A, lda, B, ldb);	// double
+	}
+	inline void omatcopy (char ordering, char trans, size_t rows, size_t cols, const std::complex<float> alpha, const std::complex<float> * A, size_t lda, std::complex<float> * B, size_t ldb)
+	{
+		mkl_comatcopy (ordering, trans, rows, cols, alpha, A, lda, B, ldb);	// std::complex<float>; NOTE(review): presumably requires MKL_Complex8 to be std::complex<float> in this build -- confirm
+	}
+	inline void omatcopy (char ordering, char trans, size_t rows, size_t cols, const std::complex<double> alpha, const std::complex<double> * A, size_t lda, std::complex<double> * B, size_t ldb)
+	{
+		mkl_zomatcopy (ordering, trans, rows, cols, alpha, A, lda, B, ldb);	// std::complex<double>; NOTE(review): presumably requires MKL_Complex16 to be std::complex<double> in this build -- confirm
+	}
+}
+
+#endif
+
 }
diff --git a/include/RI/global/Tensor_Wrapper.h b/include/RI/global/Tensor_Wrapper.h
@@ -0,0 +1,40 @@
+// ===================
+//  Author: Peize Lin
+//  date: 2022.12.09
+// ===================
+
+#pragma once
+
+#include "Global_Func-2.h"
+#include <vector>
+
+
+#include <numeric>
+
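+// Attention: very dangerous -- Tensor_Wrapper does not own the memory behind ptr_; the caller must keep that buffer alive and at least as large as the product of shape.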
+
+namespace RI
+{
+
+template<typename T>	// T: element type of the viewed buffer
+class Tensor_Wrapper
+{
+public:
+	// Shape plus raw pointer; the class defines no destructor and never allocates or frees ptr_.
+	std::vector<std::size_t> shape;	// extent of each dimension
+	T *ptr_ = nullptr;	// first viewed element; null for a default-constructed wrapper
+	// Construction only records the shape and the pointer; no element is copied.
+	Tensor_Wrapper() = default;
+	explicit Tensor_Wrapper(const std::vector<std::size_t> &shape_in, T* const ptr_in) : shape(shape_in), ptr_(ptr_in) {}
+	// Accessors with the same names that Blas_Interface::nrm2 calls on its argument.
+	T* ptr() const { return ptr_; }
+	inline std::size_t get_shape_all() const;	// product of all extents
+	// Norm over all viewed elements:
+	// ||d||_p = (|d_1|^p+|d_2|^p+...)^{1/p}
+	// if(p==std::numeric_limits<double>::max())    ||d||_max = max_i |d_i|
+	Global_Func::To_Real_t<T> norm(const double p) const;
+};
+
+}
+
+#include "Tensor_Wrapper.hpp"
diff --git a/include/RI/global/Tensor_Wrapper.hpp b/include/RI/global/Tensor_Wrapper.hpp
@@ -0,0 +1,47 @@
+// ===================
+//  Author: Peize Lin
+//  date: 2022.12.09
+// ===================
+
+#pragma once
+
+#include "Tensor_Wrapper.h"
+
+namespace RI
+{
+	template<typename T>
+	std::size_t Tensor_Wrapper<T>::get_shape_all() const	// number of elements described by shape
+	{
+		return std::accumulate(shape.begin(), shape.end(), std::size_t{1}, [](const std::size_t total, const std::size_t extent){ return total*extent; });	// product of the extents; an empty shape yields 1
+	}
+
+	template<typename T>
+	Global_Func::To_Real_t<T> Tensor_Wrapper<T>::norm(const double p) const	// p-norm over all viewed elements; p==std::numeric_limits<double>::max() selects the max norm
+	{
+		using T_res = Global_Func::To_Real_t<T>;
+		const std::size_t count = get_shape_all();	// number of elements read through ptr_
+		if(p==2)
+			return Blas_Interface::nrm2(*this);	// Euclidean norm is delegated to the BLAS wrapper
+		if(p==1)
+		{
+			T_res total = 0;	// sum of absolute values
+			for(std::size_t k=0; k<count; ++k)
+				total += std::abs(ptr_[k]);
+			return total;
+		}
+		if(p==std::numeric_limits<double>::max())
+		{
+			T_res largest = 0;	// running maximum of absolute values
+			for(std::size_t k=0; k<count; ++k)
+				largest = std::max(std::real(largest), std::abs(ptr_[k]));
+			return largest;
+		}
+		// general p: (sum_k |d_k|^p)^(1/p)
+		{
+			T_res total = 0;
+			for(std::size_t k=0; k<count; ++k)
+				total += std::pow(std::abs(ptr_[k]), p);
+			return std::pow(total,1.0/p);
+		}
+	}
+}
diff --git a/include/RI/ri/CS_Matrix_Tools.hpp b/include/RI/ri/CS_Matrix_Tools.hpp
@@ -7,6 +7,7 @@
 
 #include "CS_Matrix_Tools.h"
 #include "../global/Blas_Interface-Tensor.h"
+#include "../global/Tensor_Wrapper.h"
 #include <stdexcept>
 #include <memory.h>
 
@@ -104,6 +105,36 @@ namespace CS_Matrix_Tools
 			return uplimits.max();
 		};
 
+		auto three_2_norm = [&D]() -> Tlim	// max over i2 of the 2-norm of the slice D(:,:,i2); uses D.shape[0..2], so D must be rank 3
+		{
+#ifdef __MKL_RI
+			const Tensor<Tdata> Ds_sub = Blas_Interface::omatcopy(
+				'T', Tdata{1.0},
+				D.reshape({D.shape[0]*D.shape[1],D.shape[2]}));	// transpose (s0*s1, s2) -> (s2, s0*s1) so each slice becomes one contiguous row
+#else
+			Tensor<Tdata> Ds_sub({D.shape[2], D.shape[0]*D.shape[1]});
+			// One write cursor per row of Ds_sub, starting at the first element of that row.
+			std::vector<Tdata*> Ds_sub_ptr(D.shape[2]);
+			for(std::size_t i2=0; i2<D.shape[2]; ++i2)
+				Ds_sub_ptr[i2] = Ds_sub.ptr()+i2*Ds_sub.shape[1];
+			// Post-increment is used so that no pointer before the start of an array is ever formed.
+			const Tdata* D_ptr = D.ptr();
+			const std::size_t size2 = D.shape[2];
+			for(std::size_t i01=0; i01<Ds_sub.shape[1]; ++i01)
+				for(std::size_t i2=0; i2<size2; ++i2)
+					*(Ds_sub_ptr[i2]++) = *(D_ptr++);	// D is read sequentially; element (i01,i2) goes to row i2 of Ds_sub
+#endif
+			// Re-point a single non-owning wrapper at each row in turn instead of copying the slice.
+			Tensor_Wrapper<Tdata> D_sub({D.shape[0],D.shape[1]}, nullptr);
+			std::valarray<Global_Func::To_Real_t<Tdata>> uplimits(D.shape[2]);
+			for(std::size_t i2=0; i2<D.shape[2]; ++i2)
+			{
+				D_sub.ptr_ = Ds_sub.ptr()+i2*Ds_sub.shape[1];
+				uplimits[i2] = D_sub.norm(2);
+			}
+			return uplimits.max();
+		};
+
 		auto norm = [](const Tensor<Tdata> &D) -> Tlim
 		{
 			return D.norm(2);
@@ -123,7 +154,8 @@ namespace CS_Matrix_Tools
 			case Uplimit_Type::norm_three_1:
 				return three_1(norm);
 			case Uplimit_Type::norm_three_2:
-				return three_2(norm);
+//				return three_2(norm);
+				return three_2_norm();
 			case Uplimit_Type::square_two:
 				return square(D);
 			case Uplimit_Type::square_three_0:
diff --git a/include/RI/ri/LRI-cal.hpp b/include/RI/ri/LRI-cal.hpp
@@ -10,7 +10,7 @@
 #include "../global/Array_Operator.h"
 
 #include <omp.h>
-#ifdef __MKL
+#ifdef __MKL_RI
 #include <mkl_service.h>
 #endif
 
@@ -41,7 +41,7 @@ void LRI<TA,Tcell,Ndim,Tdata>::cal(
 	omp_lock_t lock_Ds_result_add;
 	omp_init_lock(&lock_Ds_result_add);
 
-#ifdef __MKL
+#ifdef __MKL_RI
 	const std::size_t mkl_threads = mkl_get_max_threads();
 //	if(!omp_get_nested())
 //		mkl_set_num_threads(std::max(1UL,mkl_threads/list_Aa01.size()));
@@ -120,7 +120,7 @@ void LRI<TA,Tcell,Ndim,Tdata>::cal(
 	} // end #pragma omp parallel
 
 	omp_destroy_lock(&lock_Ds_result_add);
-#ifdef __MKL
+#ifdef __MKL_RI
 	mkl_set_num_threads(mkl_threads);
 #endif
 }
diff --git a/unittests/global/Blas-test.hpp b/unittests/global/Blas-test.hpp